101 lines
2.8 KiB
Go
101 lines
2.8 KiB
Go
![]() |
package icu
|
||
|
|
||
|
// #cgo pkg-config: icu-i18n
|
||
|
// #include "c_bridge.h"
|
||
|
// #include "stdlib.h"
|
||
|
import "C"
|
||
|
import (
|
||
|
"fmt"
|
||
|
"sync"
|
||
|
"unsafe"
|
||
|
)
|
||
|
|
||
|
const (
	// U_ZERO_ERROR is the ICU common error code which means that no error
	// occurred. Status values reported by the native calls are compared
	// against it.
	U_ZERO_ERROR = 0

	// MatchDataBufferSize is the size of the buffer for detection results,
	// i.e. the maximum count of guesses returned per detect call.
	MatchDataBufferSize = 25
)
|
||
|
|
||
|
// CharsetDetector provides ICU charset detection functionality.
|
||
|
type CharsetDetector struct {
|
||
|
ptr *C.UCharsetDetector // ICU struct needed for detection
|
||
|
resBuffer [MatchDataBufferSize]C.MatchData
|
||
|
gMutex sync.Mutex // Mutex used to guarantee thread safety for ICU calls
|
||
|
}
|
||
|
|
||
|
// Match is a single charset guess. It is the Go equivalent of the MatchData
// C structure (see c_bridge.h).
type Match struct {
	// Charset is the name of the guessed charset.
	Charset string

	// Language is the language reported together with the guess.
	Language string

	// Confidence is the confidence value reported for this guess.
	Confidence int
}
|
||
|
|
||
|
// Creates new charset detector. If it is successfully created, it
|
||
|
// must be closed as it needs to free native ICU resources.
|
||
|
func NewCharsetDetector() (*CharsetDetector, error) {
|
||
|
det := new(CharsetDetector)
|
||
|
|
||
|
var status int
|
||
|
statusPtr := unsafe.Pointer(&status)
|
||
|
|
||
|
det.ptr = C.ucsdet_open((*C.UErrorCode)(statusPtr))
|
||
|
|
||
|
if status != U_ZERO_ERROR {
|
||
|
return nil, fmt.Errorf("ICU Error code returned: %d", status)
|
||
|
}
|
||
|
|
||
|
return det, nil
|
||
|
}
|
||
|
|
||
|
func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err error) {
|
||
|
|
||
|
// As described in c_bridge.h, detection operations are not thread safe and
|
||
|
// should be called consequently. So a mutex is used here.
|
||
|
det.gMutex.Lock()
|
||
|
defer det.gMutex.Unlock()
|
||
|
|
||
|
inputLen := len(input)
|
||
|
if inputLen == 0 {
|
||
|
return nil, fmt.Errorf("Input data len is 0")
|
||
|
}
|
||
|
|
||
|
var status int
|
||
|
|
||
|
// Perform detection. Guess count is the number of matches returned.
|
||
|
// The matches themself are put in the result buffer
|
||
|
guessCount := C.detectCharset(
|
||
|
unsafe.Pointer(det.ptr),
|
||
|
unsafe.Pointer(&input[0]),
|
||
|
C.int(inputLen),
|
||
|
(*C.int)(unsafe.Pointer(&status)),
|
||
|
(*C.MatchData)(unsafe.Pointer(&det.resBuffer[0])),
|
||
|
C.int(MatchDataBufferSize))
|
||
|
|
||
|
if status == U_ZERO_ERROR {
|
||
|
// Convert the returned number of entries from result buffer to a slice
|
||
|
// that will be returned
|
||
|
count := int(guessCount)
|
||
|
mt := make([]Match, count, count)
|
||
|
|
||
|
for i := 0; i < count; i++ {
|
||
|
mData := det.resBuffer[i]
|
||
|
charset := C.GoString(mData.charset)
|
||
|
language := C.GoString(mData.language)
|
||
|
mt[i] = Match{charset, language, int(mData.confidence)}
|
||
|
}
|
||
|
|
||
|
return mt, nil
|
||
|
}
|
||
|
|
||
|
return nil, fmt.Errorf("ICU Error code returned: %d", status)
|
||
|
}
|
||
|
|
||
|
// Close frees native C resources
|
||
|
func (det *CharsetDetector) Close() {
|
||
|
det.gMutex.Lock()
|
||
|
defer det.gMutex.Unlock()
|
||
|
|
||
|
if det.ptr != nil {
|
||
|
C.ucsdet_close(det.ptr)
|
||
|
}
|
||
|
}
|