Detect icu4c error codes more precisely

Negative values are warnings, not errors; failing on them breaks encoding when
it would otherwise work.

Unfortunately, these negative numbers are being converted into large positive
ones through the cgo bridge, so this becomes a slightly more complicated
check than necessary.
This commit is contained in:
Nick Thomas
2017-03-29 17:09:13 +01:00
parent 029aa0206e
commit 03c771153c
3 changed files with 42 additions and 30 deletions

View File

@@ -15,7 +15,7 @@ const int detectCharset(void *detector,
// Put input bytes in the detector. // Put input bytes in the detector.
ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status); ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
@@ -25,7 +25,7 @@ const int detectCharset(void *detector,
// Perform analysis and return all guesses and their count. // Perform analysis and return all guesses and their count.
bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status); bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
@@ -42,19 +42,19 @@ const int detectCharset(void *detector,
// Fill guessed encoding // Fill guessed encoding
bestGuessedCharset = ucsdet_getName(bestGuess, status); bestGuessedCharset = ucsdet_getName(bestGuess, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
// Fill guessed language // Fill guessed language
bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status); bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
// Fill its confidence rating // Fill its confidence rating
int32_t conf = ucsdet_getConfidence(bestGuess, status); int32_t conf = ucsdet_getConfidence(bestGuess, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
@@ -77,13 +77,13 @@ int convertToUtf16(const char *srcEncoding,
UConverter *conv; UConverter *conv;
conv = ucnv_open(srcEncoding, status); conv = ucnv_open(srcEncoding, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
/* Convert from original encoding to UTF-16 */ /* Convert from original encoding to UTF-16 */
int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status); int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
@@ -102,13 +102,13 @@ int convertFromUtf16(const char *destEncoding,
UConverter *conv; UConverter *conv;
conv = ucnv_open(destEncoding, status); conv = ucnv_open(destEncoding, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }
/* Convert from UTF-16 to destination encoding */ /* Convert from UTF-16 to destination encoding */
int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status); int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);
if (*status != U_ZERO_ERROR) { if U_FAILURE(*status) {
return 0; return 0;
} }

View File

@@ -79,7 +79,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
C.int32_t(len(input)), C.int32_t(len(input)),
(*C.int)(unsafe.Pointer(&status))) (*C.int)(unsafe.Pointer(&status)))
if status == U_ZERO_ERROR { if isSuccess(status) {
nConvLen := C.convertFromUtf16( nConvLen := C.convertFromUtf16(
Utf8CString, Utf8CString,
(*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])), (*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
@@ -88,7 +88,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
C.int32_t(convLen), C.int32_t(convLen),
(*C.int)(unsafe.Pointer(&status))) (*C.int)(unsafe.Pointer(&status)))
if status == U_ZERO_ERROR { if isSuccess(status) {
resStr := conv.utf8Buffer[:nConvLen] resStr := conv.utf8Buffer[:nConvLen]
return ([]byte)(resStr), nil return ([]byte)(resStr), nil
} }

View File

@@ -12,9 +12,21 @@ import (
const ( const (
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occured U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occured
U_ERROR_LIMIT = 0x7FFFFFFF // Dirty hack: negative error codes are being turned into large positive ints
MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call) MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call)
) )
// isSuccess is the Go counterpart of ICU's U_SUCCESS macro. It reports
// whether status is a warning or U_ZERO_ERROR rather than an error: negative
// codes are warnings, zero is success without warnings, and positive codes
// are errors. Values at or above U_ERROR_LIMIT also count as success, since
// negative codes can surface as large positive ints.
func isSuccess(status int) bool {
	// Success is everything outside the open interval
	// (U_ZERO_ERROR, U_ERROR_LIMIT).
	return !(status > U_ZERO_ERROR && status < U_ERROR_LIMIT)
}
// isFailure is the Go counterpart of ICU's U_FAILURE macro. It reports
// whether status is a real error, i.e. strictly greater than U_ZERO_ERROR
// and strictly less than U_ERROR_LIMIT.
func isFailure(status int) bool {
	switch {
	case status <= U_ZERO_ERROR:
		// Zero (success) or a negative warning code.
		return false
	case status >= U_ERROR_LIMIT:
		// Treated as a negative warning code that shows up as a large
		// positive int.
		return false
	default:
		return true
	}
}
// CharsetDetector provides ICU charset detection functionality. // CharsetDetector provides ICU charset detection functionality.
type CharsetDetector struct { type CharsetDetector struct {
ptr *C.UCharsetDetector // ICU struct needed for detection ptr *C.UCharsetDetector // ICU struct needed for detection
@@ -39,7 +51,7 @@ func NewCharsetDetector() (*CharsetDetector, error) {
det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr)) det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))
if status != U_ZERO_ERROR { if isFailure(status) {
return nil, fmt.Errorf("ICU Error code returned: %d", status) return nil, fmt.Errorf("ICU Error code returned: %d", status)
} }
@@ -70,7 +82,7 @@ func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err err
(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])), (*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
C.int(MatchDataBufferSize)) C.int(MatchDataBufferSize))
if status == U_ZERO_ERROR { if isSuccess(status) {
// Convert the returned number of entries from result buffer to a slice // Convert the returned number of entries from result buffer to a slice
// that will be returned // that will be returned
count := int(guessCount) count := int(guessCount)