unicode.go 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. package chardet
  2. import (
  3. "bytes"
  4. )
  5. var (
  6. utf16beBom = []byte{0xFE, 0xFF}
  7. utf16leBom = []byte{0xFF, 0xFE}
  8. utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
  9. utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
  10. )
  11. type recognizerUtf16be struct {
  12. }
  13. func newRecognizer_utf16be() *recognizerUtf16be {
  14. return &recognizerUtf16be{}
  15. }
  16. func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
  17. output = recognizerOutput{
  18. Charset: "UTF-16BE",
  19. }
  20. if bytes.HasPrefix(input.raw, utf16beBom) {
  21. output.Confidence = 100
  22. }
  23. return
  24. }
  25. type recognizerUtf16le struct {
  26. }
  27. func newRecognizer_utf16le() *recognizerUtf16le {
  28. return &recognizerUtf16le{}
  29. }
  30. func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
  31. output = recognizerOutput{
  32. Charset: "UTF-16LE",
  33. }
  34. if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
  35. output.Confidence = 100
  36. }
  37. return
  38. }
  39. type recognizerUtf32 struct {
  40. name string
  41. bom []byte
  42. decodeChar func(input []byte) uint32
  43. }
  44. func decodeUtf32be(input []byte) uint32 {
  45. return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
  46. }
  47. func decodeUtf32le(input []byte) uint32 {
  48. return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
  49. }
  50. func newRecognizer_utf32be() *recognizerUtf32 {
  51. return &recognizerUtf32{
  52. "UTF-32BE",
  53. utf32beBom,
  54. decodeUtf32be,
  55. }
  56. }
  57. func newRecognizer_utf32le() *recognizerUtf32 {
  58. return &recognizerUtf32{
  59. "UTF-32LE",
  60. utf32leBom,
  61. decodeUtf32le,
  62. }
  63. }
  64. func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
  65. output = recognizerOutput{
  66. Charset: r.name,
  67. }
  68. hasBom := bytes.HasPrefix(input.raw, r.bom)
  69. var numValid, numInvalid uint32
  70. for b := input.raw; len(b) >= 4; b = b[4:] {
  71. if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
  72. numInvalid++
  73. } else {
  74. numValid++
  75. }
  76. }
  77. if hasBom && numInvalid == 0 {
  78. output.Confidence = 100
  79. } else if hasBom && numValid > numInvalid*10 {
  80. output.Confidence = 80
  81. } else if numValid > 3 && numInvalid == 0 {
  82. output.Confidence = 100
  83. } else if numValid > 0 && numInvalid == 0 {
  84. output.Confidence = 80
  85. } else if numValid > numInvalid*10 {
  86. output.Confidence = 25
  87. }
  88. return
  89. }