Платформа ЦРНП "Мирокод" для разработки проектов
https://git.mirocod.ru
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
71 lines
1.3 KiB
71 lines
1.3 KiB
package chardet |
|
|
|
import ( |
|
"bytes" |
|
) |
|
|
|
// utf8Bom is the three-byte UTF-8 encoding of the byte order mark (U+FEFF).
// Match checks whether the raw input starts with it.
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
|
|
|
// recognizerUtf8 is the recognizer for UTF-8 encoded input. It has no fields;
// all of its work happens in Match.
type recognizerUtf8 struct{}
|
|
|
func newRecognizer_utf8() *recognizerUtf8 { |
|
return &recognizerUtf8{} |
|
} |
|
|
|
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) { |
|
output = recognizerOutput{ |
|
Charset: "UTF-8", |
|
} |
|
hasBom := bytes.HasPrefix(input.raw, utf8Bom) |
|
inputLen := len(input.raw) |
|
var numValid, numInvalid uint32 |
|
var trailBytes uint8 |
|
for i := 0; i < inputLen; i++ { |
|
c := input.raw[i] |
|
if c&0x80 == 0 { |
|
continue |
|
} |
|
if c&0xE0 == 0xC0 { |
|
trailBytes = 1 |
|
} else if c&0xF0 == 0xE0 { |
|
trailBytes = 2 |
|
} else if c&0xF8 == 0xF0 { |
|
trailBytes = 3 |
|
} else { |
|
numInvalid++ |
|
if numInvalid > 5 { |
|
break |
|
} |
|
trailBytes = 0 |
|
} |
|
|
|
for i++; i < inputLen; i++ { |
|
c = input.raw[i] |
|
if c&0xC0 != 0x80 { |
|
numInvalid++ |
|
break |
|
} |
|
if trailBytes--; trailBytes == 0 { |
|
numValid++ |
|
break |
|
} |
|
} |
|
} |
|
|
|
if hasBom && numInvalid == 0 { |
|
output.Confidence = 100 |
|
} else if hasBom && numValid > numInvalid*10 { |
|
output.Confidence = 80 |
|
} else if numValid > 3 && numInvalid == 0 { |
|
output.Confidence = 100 |
|
} else if numValid > 0 && numInvalid == 0 { |
|
output.Confidence = 80 |
|
} else if numValid == 0 && numInvalid == 0 { |
|
// Plain ASCII |
|
output.Confidence = 10 |
|
} else if numValid > numInvalid*10 { |
|
output.Confidence = 25 |
|
} |
|
return |
|
}
|
|
|