Detect icu4c error codes more precisely

Negative values are warnings, not errors; failing on them breaks encoding when it would otherwise work. Unfortunately, these negative numbers are converted into large positive ones on their way through the cgo bridge, so the check is slightly more complicated than it should need to be.
2017-03-29 17:09:13 +01:00
parent 029aa0206e
commit 03c771153c
3 changed files with 42 additions and 30 deletions
--- a/c_bridge.c
+++ b/c_bridge.c
@@ -6,16 +6,16 @@
 #include <unicode/ucnv.h>

 // See description in c_bridge.h
-const int detectCharset(void        *detector, 
-                        void        *input, 
-                        int         input_len, 
-                        int         *status, 
-                        MatchData   *matchBuffer, 
+const int detectCharset(void        *detector,
+                        void        *input,
+                        int         input_len,
+                        int         *status,
+                        MatchData   *matchBuffer,
                        int         matchBufferSize) {

    // Put input bytes in the detector.
    ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

@@ -25,7 +25,7 @@ const int detectCharset(void        *detector,

    // Perform analysis and return all guesses and their count.
    bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

@@ -42,19 +42,19 @@ const int detectCharset(void        *detector,

        // Fill guessed encoding
        bestGuessedCharset = ucsdet_getName(bestGuess, status);
-        if (*status != U_ZERO_ERROR) {
+        if U_FAILURE(*status) {
            return 0;
        }

        // Fill guessed language
        bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status);
-        if (*status != U_ZERO_ERROR) {
+        if U_FAILURE(*status) {
            return 0;
        }

        // Fill its confidence rating
        int32_t conf = ucsdet_getConfidence(bestGuess, status);
-        if (*status != U_ZERO_ERROR) {
+        if U_FAILURE(*status) {
            return 0;
        }

@@ -69,7 +69,7 @@ const int detectCharset(void        *detector,

 // See description in c_bridge.h
 int convertToUtf16(const char   *srcEncoding,
-                   UChar        *dest, 
+                   UChar        *dest,
                   int32_t      destCapacity,
                   const char   *src,
                   int32_t      srcLength,
@@ -77,13 +77,13 @@ int convertToUtf16(const char   *srcEncoding,
    UConverter *conv;

    conv = ucnv_open(srcEncoding, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

    /* Convert from original encoding to UTF-16 */
    int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

@@ -94,7 +94,7 @@ int convertToUtf16(const char   *srcEncoding,

 // See description in c_bridge.h
 int convertFromUtf16(const char   *destEncoding,
-                     char         *dest, 
+                     char         *dest,
                     int32_t      destCapacity,
                     const UChar  *src,
                     int32_t      srcLength,
@@ -102,17 +102,17 @@ int convertFromUtf16(const char   *destEncoding,
    UConverter *conv;

    conv = ucnv_open(destEncoding, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

    /* Convert from UTF-16 to destination encoding */
    int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);
-    if (*status != U_ZERO_ERROR) {
+    if U_FAILURE(*status) {
        return 0;
    }

    ucnv_close(conv);

    return len;
-}
+}
--- a/convert.go
+++ b/convert.go
@@ -14,7 +14,7 @@ const (
    DefaultMaxTextSize = 1024 * 1024    // Default value for the max text length in conversion operations
    utf8MaxCharSize = 4
    utf16MaxCharSize = 4
-) 
+)

 var (
    Utf8CString = C.CString("UTF-8")
@@ -35,7 +35,7 @@ type CharsetConverter struct {
 // are created in memory once and then used. 'maxTextSize' sets the size of these buffers.
 // ICU library would return error if any processed text is longer than this parameter.
 //
-// NOTE: 
+// NOTE:
 //
 // UTF8 uses 1 to 4 bytes for each symbol.
 // UTF16 uses 2 bytes to 4 bytes for each symbol.
@@ -79,7 +79,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
            C.int32_t(len(input)),
            (*C.int)(unsafe.Pointer(&status)))

-    if status == U_ZERO_ERROR {
+    if isSuccess(status) {
        nConvLen := C.convertFromUtf16(
            Utf8CString,
            (*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
@@ -88,7 +88,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
            C.int32_t(convLen),
            (*C.int)(unsafe.Pointer(&status)))

-        if status == U_ZERO_ERROR {
+        if isSuccess(status) {
            resStr := conv.utf8Buffer[:nConvLen]
            return ([]byte)(resStr), nil
        }
--- a/detect.go
+++ b/detect.go
@@ -11,9 +11,21 @@ import (
 )

 const (
-    U_ZERO_ERROR        = 0     // ICU common constant error code which means that no error occured
-    MatchDataBufferSize = 25    // Size of the buffer for detection results (Max count of returned guesses per detect call)
-) 
+    U_ZERO_ERROR        = 0          // ICU common constant error code which means that no error occurred
+    U_ERROR_LIMIT       = 0x7FFFFFFF // Dirty hack, negative error codes are being turned into large positive ints
+    MatchDataBufferSize = 25         // Size of the buffer for detection results (Max count of returned guesses per detect call)
+)
+
+// Go implementation of the icu U_SUCCESS macro. Negative status codes are
+// warnings, 0 is a success without warnings, > 0 is an error
+func isSuccess(status int) bool {
+    return status <= U_ZERO_ERROR || status >= U_ERROR_LIMIT
+}
+
+// Go implementation of the icu U_FAILURE macro.
+func isFailure(status int) bool {
+    return status > U_ZERO_ERROR && status < U_ERROR_LIMIT
+}

 // CharsetDetector provides ICU charset detection functionality.
 type CharsetDetector struct {
@@ -39,7 +51,7 @@ func NewCharsetDetector() (*CharsetDetector, error) {

    det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))

-    if status != U_ZERO_ERROR {
+    if isFailure(status) {
        return nil, fmt.Errorf("ICU Error code returned: %d", status)
    }

@@ -63,14 +75,14 @@ func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err err
    // Perform detection. Guess count is the number of matches returned.
    // The matches themself are put in the result buffer
    guessCount := C.detectCharset(
-        unsafe.Pointer(det.ptr), 
-        unsafe.Pointer(&input[0]), 
-        C.int(inputLen), 
-        (*C.int)(unsafe.Pointer(&status)), 
+        unsafe.Pointer(det.ptr),
+        unsafe.Pointer(&input[0]),
+        C.int(inputLen),
+        (*C.int)(unsafe.Pointer(&status)),
        (*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
        C.int(MatchDataBufferSize))

-    if status == U_ZERO_ERROR {
+    if isSuccess(status) {
        // Convert the returned number of entries from result buffer to a slice
        // that will be returned
        count := int(guessCount)