utf8.go 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. package chardet
  2. import (
  3. "bytes"
  4. )
  5. var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
  6. type recognizerUtf8 struct {
  7. }
  8. func newRecognizer_utf8() *recognizerUtf8 {
  9. return &recognizerUtf8{}
  10. }
  11. func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
  12. output = recognizerOutput{
  13. Charset: "UTF-8",
  14. }
  15. hasBom := bytes.HasPrefix(input.raw, utf8Bom)
  16. inputLen := len(input.raw)
  17. var numValid, numInvalid uint32
  18. var trailBytes uint8
  19. for i := 0; i < inputLen; i++ {
  20. c := input.raw[i]
  21. if c&0x80 == 0 {
  22. continue
  23. }
  24. if c&0xE0 == 0xC0 {
  25. trailBytes = 1
  26. } else if c&0xF0 == 0xE0 {
  27. trailBytes = 2
  28. } else if c&0xF8 == 0xF0 {
  29. trailBytes = 3
  30. } else {
  31. numInvalid++
  32. if numInvalid > 5 {
  33. break
  34. }
  35. trailBytes = 0
  36. }
  37. for i++; i < inputLen; i++ {
  38. c = input.raw[i]
  39. if c&0xC0 != 0x80 {
  40. numInvalid++
  41. break
  42. }
  43. if trailBytes--; trailBytes == 0 {
  44. numValid++
  45. break
  46. }
  47. }
  48. }
  49. if hasBom && numInvalid == 0 {
  50. output.Confidence = 100
  51. } else if hasBom && numValid > numInvalid*10 {
  52. output.Confidence = 80
  53. } else if numValid > 3 && numInvalid == 0 {
  54. output.Confidence = 100
  55. } else if numValid > 0 && numInvalid == 0 {
  56. output.Confidence = 80
  57. } else if numValid == 0 && numInvalid == 0 {
  58. // Plain ASCII
  59. output.Confidence = 10
  60. } else if numValid > numInvalid*10 {
  61. output.Confidence = 25
  62. }
  63. return
  64. }