Detect icu4c error codes more precisely
Negative values are warnings, not errors; failing on them breaks encoding when it would otherwise work. Unfortunately, these negative numbers are being converted into large positive ones through the cgo bridge, so this becomes a slightly more complicated check than necessary.
This commit is contained in:
34
c_bridge.c
34
c_bridge.c
@@ -6,16 +6,16 @@
|
|||||||
#include <unicode/ucnv.h>
|
#include <unicode/ucnv.h>
|
||||||
|
|
||||||
// See description in c_bridge.h
|
// See description in c_bridge.h
|
||||||
const int detectCharset(void *detector,
|
const int detectCharset(void *detector,
|
||||||
void *input,
|
void *input,
|
||||||
int input_len,
|
int input_len,
|
||||||
int *status,
|
int *status,
|
||||||
MatchData *matchBuffer,
|
MatchData *matchBuffer,
|
||||||
int matchBufferSize) {
|
int matchBufferSize) {
|
||||||
|
|
||||||
// Put input bytes in the detector.
|
// Put input bytes in the detector.
|
||||||
ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status);
|
ucsdet_setText((UCharsetDetector*)detector, (char*)input, input_len, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ const int detectCharset(void *detector,
|
|||||||
|
|
||||||
// Perform analysis and return all guesses and their count.
|
// Perform analysis and return all guesses and their count.
|
||||||
bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status);
|
bestGuesses = ucsdet_detectAll((UCharsetDetector*)detector, &matchCount, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,19 +42,19 @@ const int detectCharset(void *detector,
|
|||||||
|
|
||||||
// Fill guessed encoding
|
// Fill guessed encoding
|
||||||
bestGuessedCharset = ucsdet_getName(bestGuess, status);
|
bestGuessedCharset = ucsdet_getName(bestGuess, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill guessed language
|
// Fill guessed language
|
||||||
bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status);
|
bestGuessedLanguage = ucsdet_getLanguage(bestGuess, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill its confidence rating
|
// Fill its confidence rating
|
||||||
int32_t conf = ucsdet_getConfidence(bestGuess, status);
|
int32_t conf = ucsdet_getConfidence(bestGuess, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,7 +69,7 @@ const int detectCharset(void *detector,
|
|||||||
|
|
||||||
// See description in c_bridge.h
|
// See description in c_bridge.h
|
||||||
int convertToUtf16(const char *srcEncoding,
|
int convertToUtf16(const char *srcEncoding,
|
||||||
UChar *dest,
|
UChar *dest,
|
||||||
int32_t destCapacity,
|
int32_t destCapacity,
|
||||||
const char *src,
|
const char *src,
|
||||||
int32_t srcLength,
|
int32_t srcLength,
|
||||||
@@ -77,13 +77,13 @@ int convertToUtf16(const char *srcEncoding,
|
|||||||
UConverter *conv;
|
UConverter *conv;
|
||||||
|
|
||||||
conv = ucnv_open(srcEncoding, status);
|
conv = ucnv_open(srcEncoding, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert from original encoding to UTF-16 */
|
/* Convert from original encoding to UTF-16 */
|
||||||
int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);
|
int len = ucnv_toUChars(conv, dest, destCapacity, src, srcLength, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ int convertToUtf16(const char *srcEncoding,
|
|||||||
|
|
||||||
// See description in c_bridge.h
|
// See description in c_bridge.h
|
||||||
int convertFromUtf16(const char *destEncoding,
|
int convertFromUtf16(const char *destEncoding,
|
||||||
char *dest,
|
char *dest,
|
||||||
int32_t destCapacity,
|
int32_t destCapacity,
|
||||||
const UChar *src,
|
const UChar *src,
|
||||||
int32_t srcLength,
|
int32_t srcLength,
|
||||||
@@ -102,17 +102,17 @@ int convertFromUtf16(const char *destEncoding,
|
|||||||
UConverter *conv;
|
UConverter *conv;
|
||||||
|
|
||||||
conv = ucnv_open(destEncoding, status);
|
conv = ucnv_open(destEncoding, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Convert from UTF-16 to destination encoding */
|
/* Convert from UTF-16 to destination encoding */
|
||||||
int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);
|
int len = ucnv_fromUChars(conv, dest, destCapacity, src, srcLength, status);
|
||||||
if (*status != U_ZERO_ERROR) {
|
if U_FAILURE(*status) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ucnv_close(conv);
|
ucnv_close(conv);
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
@@ -14,7 +14,7 @@ const (
|
|||||||
DefaultMaxTextSize = 1024 * 1024 // Default value for the max text length in conversion operations
|
DefaultMaxTextSize = 1024 * 1024 // Default value for the max text length in conversion operations
|
||||||
utf8MaxCharSize = 4
|
utf8MaxCharSize = 4
|
||||||
utf16MaxCharSize = 4
|
utf16MaxCharSize = 4
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
Utf8CString = C.CString("UTF-8")
|
Utf8CString = C.CString("UTF-8")
|
||||||
@@ -35,7 +35,7 @@ type CharsetConverter struct {
|
|||||||
// are created in memory once and then used. 'maxTextSize' sets the size of these buffers.
|
// are created in memory once and then used. 'maxTextSize' sets the size of these buffers.
|
||||||
// ICU library would return error if any processed text is longer than this parameter.
|
// ICU library would return error if any processed text is longer than this parameter.
|
||||||
//
|
//
|
||||||
// NOTE:
|
// NOTE:
|
||||||
//
|
//
|
||||||
// UTF8 uses 1 to 4 bytes for each symbol.
|
// UTF8 uses 1 to 4 bytes for each symbol.
|
||||||
// UTF16 uses 2 bytes to 4 bytes for each symbol.
|
// UTF16 uses 2 bytes to 4 bytes for each symbol.
|
||||||
@@ -79,7 +79,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
|
|||||||
C.int32_t(len(input)),
|
C.int32_t(len(input)),
|
||||||
(*C.int)(unsafe.Pointer(&status)))
|
(*C.int)(unsafe.Pointer(&status)))
|
||||||
|
|
||||||
if status == U_ZERO_ERROR {
|
if isSuccess(status) {
|
||||||
nConvLen := C.convertFromUtf16(
|
nConvLen := C.convertFromUtf16(
|
||||||
Utf8CString,
|
Utf8CString,
|
||||||
(*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
|
(*C.char)(unsafe.Pointer(&conv.utf8Buffer[0])),
|
||||||
@@ -88,7 +88,7 @@ func (conv *CharsetConverter) ConvertToUtf8(input []byte, srcEncoding string) ([
|
|||||||
C.int32_t(convLen),
|
C.int32_t(convLen),
|
||||||
(*C.int)(unsafe.Pointer(&status)))
|
(*C.int)(unsafe.Pointer(&status)))
|
||||||
|
|
||||||
if status == U_ZERO_ERROR {
|
if isSuccess(status) {
|
||||||
resStr := conv.utf8Buffer[:nConvLen]
|
resStr := conv.utf8Buffer[:nConvLen]
|
||||||
return ([]byte)(resStr), nil
|
return ([]byte)(resStr), nil
|
||||||
}
|
}
|
||||||
|
30
detect.go
30
detect.go
@@ -11,9 +11,21 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occured
|
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occured
|
||||||
MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call)
|
U_ERROR_LIMIT = 0x7FFFFFFF // Dirty hack, negative error codes are being turned into large positive ints
|
||||||
)
|
MatchDataBufferSize = 25 // Size of the buffer for detection results (Max count of returned guesses per detect call)
|
||||||
|
)
|
||||||
|
|
||||||
|
// Go implementation of the icu U_SUCCESS macro. Negative status codes are
|
||||||
|
// warnings, 0 is a success without warnings, > 0 is an error
|
||||||
|
func isSuccess(status int) bool {
|
||||||
|
return status <= U_ZERO_ERROR || status >= U_ERROR_LIMIT
|
||||||
|
}
|
||||||
|
|
||||||
|
// Go implementation of the icu U_FAILURE macro.
|
||||||
|
func isFailure(status int) bool {
|
||||||
|
return status > U_ZERO_ERROR && status < U_ERROR_LIMIT
|
||||||
|
}
|
||||||
|
|
||||||
// CharsetDetector provides ICU charset detection functionality.
|
// CharsetDetector provides ICU charset detection functionality.
|
||||||
type CharsetDetector struct {
|
type CharsetDetector struct {
|
||||||
@@ -39,7 +51,7 @@ func NewCharsetDetector() (*CharsetDetector, error) {
|
|||||||
|
|
||||||
det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))
|
det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))
|
||||||
|
|
||||||
if status != U_ZERO_ERROR {
|
if isFailure(status) {
|
||||||
return nil, fmt.Errorf("ICU Error code returned: %d", status)
|
return nil, fmt.Errorf("ICU Error code returned: %d", status)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,14 +75,14 @@ func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err err
|
|||||||
// Perform detection. Guess count is the number of matches returned.
|
// Perform detection. Guess count is the number of matches returned.
|
||||||
// The matches themself are put in the result buffer
|
// The matches themself are put in the result buffer
|
||||||
guessCount := C.detectCharset(
|
guessCount := C.detectCharset(
|
||||||
unsafe.Pointer(det.ptr),
|
unsafe.Pointer(det.ptr),
|
||||||
unsafe.Pointer(&input[0]),
|
unsafe.Pointer(&input[0]),
|
||||||
C.int(inputLen),
|
C.int(inputLen),
|
||||||
(*C.int)(unsafe.Pointer(&status)),
|
(*C.int)(unsafe.Pointer(&status)),
|
||||||
(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
|
(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
|
||||||
C.int(MatchDataBufferSize))
|
C.int(MatchDataBufferSize))
|
||||||
|
|
||||||
if status == U_ZERO_ERROR {
|
if isSuccess(status) {
|
||||||
// Convert the returned number of entries from result buffer to a slice
|
// Convert the returned number of entries from result buffer to a slice
|
||||||
// that will be returned
|
// that will be returned
|
||||||
count := int(guessCount)
|
count := int(guessCount)
|
||||||
|
Reference in New Issue
Block a user