convert_UTF: rewrite in C++

This allows us to namespace the symbols properly.

Bug: google-breakpad:725
Change-Id: Iea8052547eef6c0acb299c1995781735c6d8994f
Reviewed-on: https://chromium-review.googlesource.com/c/breakpad/breakpad/+/1769236
Reviewed-by: Mark Mentovai <mark@chromium.org>
2026-02-19 00:39:38 +00:00 · 2019-08-03 12:12:40 -04:00
parent abfe08e789
commit db1cda2653
8 changed files with 53 additions and 44 deletions
--- a/src/client/minidump_file_writer_unittest.cc
+++ b/src/client/minidump_file_writer_unittest.cc
@@ -30,7 +30,7 @@
 // Author: waylonis@google.com (Dan Waylonis)

 /*
- g++ -I../ ../common/convert_UTF.c \
+ g++ -I../ ../common/convert_UTF.cc \
 ../common/string_conversion.cc \
 minidump_file_writer.cc \
 minidump_file_writer_unittest.cc \
--- a/src/client/solaris/handler/Makefile
+++ b/src/client/solaris/handler/Makefile
@@ -40,13 +40,13 @@ BIN_DIR=.

 THREAD_SRC=solaris_lwp.cc
 SHARE_SRC=../../minidump_file_writer.cc\
+	  ../../../common/convert_UTF.cc\
 	  ../../../common/md5.cc\
 	  ../../../common/string_conversion.cc\
 	  ../../../common/solaris/file_id.cc\
 	  minidump_generator.cc
 HANDLER_SRC=exception_handler.cc\
 	  ../../../common/solaris/guid_creator.cc
-SHARE_C_SRC=../../../common/convert_UTF.c

 MINIDUMP_TEST_SRC=minidump_test.cc
 EXCEPTION_TEST_SRC=exception_handler_test.cc
@@ -54,11 +54,10 @@ EXCEPTION_TEST_SRC=exception_handler_test.cc
 THREAD_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(THREAD_SRC))
 SHARE_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(SHARE_SRC))
 HANDLER_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(HANDLER_SRC))
-SHARE_C_OBJ=$(patsubst %.c,$(OBJ_DIR)/%.o,$(SHARE_C_SRC))
 MINIDUMP_TEST_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o, $(MINIDUMP_TEST_SRC))\
-		  $(THREAD_OBJ) $(SHARE_OBJ) $(SHARE_C_OBJ) $(HANDLER_OBJ)
+		  $(THREAD_OBJ) $(SHARE_OBJ) $(HANDLER_OBJ)
 EXCEPTION_TEST_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o, $(EXCEPTION_TEST_SRC))\
-          $(THREAD_OBJ) $(SHARE_OBJ) $(SHARE_C_OBJ) $(HANDLER_OBJ)
+          $(THREAD_OBJ) $(SHARE_OBJ) $(HANDLER_OBJ)

 BIN=$(BIN_DIR)/minidump_test\
    $(BIN_DIR)/exception_handler_test
--- a/src/common/common.gyp
+++ b/src/common/common.gyp
@@ -61,7 +61,7 @@
        'android/ucontext_constants.h',
        'basictypes.h',
        'byte_cursor.h',
-        'convert_UTF.c',
+        'convert_UTF.cc',
        'convert_UTF.h',
        'dwarf/bytereader-inl.h',
        'dwarf/bytereader.cc',
--- a/src/common/convert_UTF.cc
+++ b/src/common/convert_UTF.cc
@@ -60,10 +60,16 @@ See the header file "ConvertUTF.h" for complete documentation.
 #include <stdio.h>
 #endif

-static const int halfShift  = 10; /* used for shifting by 10 bits */
+namespace google_breakpad {

-static const UTF32 halfBase = 0x0010000UL;
-static const UTF32 halfMask = 0x3FFUL;
+namespace {
+
+const int halfShift  = 10; /* used for shifting by 10 bits */
+
+const UTF32 halfBase = 0x0010000UL;
+const UTF32 halfMask = 0x3FFUL;
+
+}  // namespace

 #define UNI_SUR_HIGH_START  (UTF32)0xD800
 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
@@ -183,6 +189,8 @@ ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* so

 /* --------------------------------------------------------------------- */

+namespace {
+
 /*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
@@ -190,7 +198,7 @@ ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* so
 * left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
-static const char trailingBytesForUTF8[256] = {
+const char trailingBytesForUTF8[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -206,7 +214,7 @@ static const char trailingBytesForUTF8[256] = {
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 */
-static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
  0x03C82080UL, 0xFA082080UL, 0x82082080UL };

 /*
@@ -216,7 +224,7 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 */
-static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

 /* --------------------------------------------------------------------- */

@@ -228,6 +236,8 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
 * into an inline function.
 */

+}  // namespace
+
 /* --------------------------------------------------------------------- */

 ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
@@ -299,6 +309,8 @@ return result;

 /* --------------------------------------------------------------------- */

+namespace {
+
 /*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
@@ -309,8 +321,7 @@ return result;
 * If presented with a length > 4, this returns false.  The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 */
-
-static Boolean isLegalUTF8(const UTF8 *source, int length) {
+Boolean isLegalUTF8(const UTF8 *source, int length) {
  UTF8 a;
  const UTF8 *srcptr = source+length;
  switch (length) {
@@ -335,6 +346,8 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
  return true;
 }

+}  // namespace
+
 /* --------------------------------------------------------------------- */

 /*
@@ -552,3 +565,5 @@ In UTF-8 writing code, the switches on "bytesToWrite" are
 similarly unrolled loops.

 --------------------------------------------------------------------- */
+
+}  // namespace google_breakpad
--- a/src/common/convert_UTF.h
+++ b/src/common/convert_UTF.h
@@ -106,6 +106,8 @@ All should be unsigned values to avoid sign extension during
 bit mask & shift operations.
 ------------------------------------------------------------------------ */

+namespace google_breakpad {
+
 typedef unsigned long	UTF32;	/* at least 32 bits */
 typedef unsigned short	UTF16;	/* at least 16 bits */
 typedef unsigned char	UTF8;	/* typically 8 bits */
@@ -130,11 +132,6 @@ typedef enum {
 	lenientConversion
 } ConversionFlags;

-/* This is for C++ and does no harm in C */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
                                     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

@@ -155,9 +152,7 @@ ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* so

 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

-#ifdef __cplusplus
-}
-#endif
+}  // namespace google_breakpad

 /* --------------------------------------------------------------------- */