gbk.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package simplifiedchinese
  5. import (
  6. "errors"
  7. "unicode/utf8"
  8. "golang.org/x/text/encoding"
  9. "golang.org/x/text/encoding/internal"
  10. "golang.org/x/text/encoding/internal/identifier"
  11. "golang.org/x/text/transform"
  12. )
  13. var (
  14. // GB18030 is the GB18030 encoding.
  15. GB18030 encoding.Encoding = &gbk18030
  16. // GBK is the GBK encoding. It encodes an extension of the GB2312 character set
  17. // and is also known as Code Page 936.
  18. GBK encoding.Encoding = &gbk
  19. )
// gbk is the value behind the exported GBK encoding. It pairs a decoder and
// an encoder that both have their gb18030 flag cleared with the name "GBK"
// and the identifier.GBK identifier.
var gbk = internal.Encoding{
	&internal.SimpleEncoding{
		gbkDecoder{gb18030: false},
		gbkEncoder{gb18030: false},
	},
	"GBK",
	identifier.GBK,
}
// gbk18030 is the value behind the exported GB18030 encoding. It uses the
// same decoder and encoder types as gbk, but with their gb18030 flag set,
// paired with the name "GB18030" and the identifier.GB18030 identifier.
var gbk18030 = internal.Encoding{
	&internal.SimpleEncoding{
		gbkDecoder{gb18030: true},
		gbkEncoder{gb18030: true},
	},
	"GB18030",
	identifier.GB18030,
}
  36. var (
  37. errInvalidGB18030 = errors.New("simplifiedchinese: invalid GB18030 encoding")
  38. errInvalidGBK = errors.New("simplifiedchinese: invalid GBK encoding")
  39. )
// gbkDecoder converts GBK or GB18030 encoded bytes to UTF-8 through its
// Transform method. It holds no state beyond its mode flag.
type gbkDecoder struct {
	// NOTE(review): presumably supplies a no-op Reset method so that the
	// type satisfies transform.Transformer — confirm against the transform
	// package.
	transform.NopResetter
	// gb18030 enables decoding of four-byte GB18030 sequences and selects
	// errInvalidGB18030 rather than errInvalidGBK for malformed input.
	gb18030 bool
}
  44. func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  45. r, size := rune(0), 0
  46. loop:
  47. for ; nSrc < len(src); nSrc += size {
  48. switch c0 := src[nSrc]; {
  49. case c0 < utf8.RuneSelf:
  50. r, size = rune(c0), 1
  51. // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
  52. // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
  53. // says to treat "gbk" as Code Page 936.
  54. case c0 == 0x80:
  55. r, size = '€', 1
  56. case c0 < 0xff:
  57. if nSrc+1 >= len(src) {
  58. err = transform.ErrShortSrc
  59. break loop
  60. }
  61. c1 := src[nSrc+1]
  62. switch {
  63. case 0x40 <= c1 && c1 < 0x7f:
  64. c1 -= 0x40
  65. case 0x80 <= c1 && c1 < 0xff:
  66. c1 -= 0x41
  67. case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
  68. if nSrc+3 >= len(src) {
  69. err = transform.ErrShortSrc
  70. break loop
  71. }
  72. c2 := src[nSrc+2]
  73. if c2 < 0x81 || 0xff <= c2 {
  74. err = errInvalidGB18030
  75. break loop
  76. }
  77. c3 := src[nSrc+3]
  78. if c3 < 0x30 || 0x3a <= c3 {
  79. err = errInvalidGB18030
  80. break loop
  81. }
  82. size = 4
  83. r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
  84. if r < 39420 {
  85. i, j := 0, len(gb18030)
  86. for i < j {
  87. h := i + (j-i)/2
  88. if r >= rune(gb18030[h][0]) {
  89. i = h + 1
  90. } else {
  91. j = h
  92. }
  93. }
  94. dec := &gb18030[i-1]
  95. r += rune(dec[1]) - rune(dec[0])
  96. goto write
  97. }
  98. r -= 189000
  99. if 0 <= r && r < 0x100000 {
  100. r += 0x10000
  101. goto write
  102. }
  103. err = errInvalidGB18030
  104. break loop
  105. default:
  106. if d.gb18030 {
  107. err = errInvalidGB18030
  108. } else {
  109. err = errInvalidGBK
  110. }
  111. break loop
  112. }
  113. r, size = '\ufffd', 2
  114. if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
  115. r = rune(decode[i])
  116. if r == 0 {
  117. r = '\ufffd'
  118. }
  119. }
  120. default:
  121. if d.gb18030 {
  122. err = errInvalidGB18030
  123. } else {
  124. err = errInvalidGBK
  125. }
  126. break loop
  127. }
  128. write:
  129. if nDst+utf8.RuneLen(r) > len(dst) {
  130. err = transform.ErrShortDst
  131. break loop
  132. }
  133. nDst += utf8.EncodeRune(dst[nDst:], r)
  134. }
  135. if atEOF && err == transform.ErrShortSrc {
  136. if d.gb18030 {
  137. err = errInvalidGB18030
  138. } else {
  139. err = errInvalidGBK
  140. }
  141. }
  142. return nDst, nSrc, err
  143. }
// gbkEncoder converts UTF-8 to GBK or GB18030 encoded bytes through its
// Transform method. It holds no state beyond its mode flag.
type gbkEncoder struct {
	// NOTE(review): presumably supplies a no-op Reset method so that the
	// type satisfies transform.Transformer — confirm against the transform
	// package.
	transform.NopResetter
	// gb18030 enables the four-byte GB18030 forms for runes that have no
	// two-byte encoding.
	gb18030 bool
}
  148. func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  149. r, r2, size := rune(0), rune(0), 0
  150. for ; nSrc < len(src); nSrc += size {
  151. r = rune(src[nSrc])
  152. // Decode a 1-byte rune.
  153. if r < utf8.RuneSelf {
  154. size = 1
  155. } else {
  156. // Decode a multi-byte rune.
  157. r, size = utf8.DecodeRune(src[nSrc:])
  158. if size == 1 {
  159. // All valid runes of size 1 (those below utf8.RuneSelf) were
  160. // handled above. We have invalid UTF-8 or we haven't seen the
  161. // full character yet.
  162. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  163. err = transform.ErrShortSrc
  164. break
  165. }
  166. }
  167. // func init checks that the switch covers all tables.
  168. switch {
  169. case encode0Low <= r && r < encode0High:
  170. if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
  171. goto write2
  172. }
  173. case encode1Low <= r && r < encode1High:
  174. // Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
  175. // as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
  176. // says to treat "gbk" as Code Page 936.
  177. if r == '€' {
  178. r = 0x80
  179. goto write1
  180. }
  181. if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
  182. goto write2
  183. }
  184. case encode2Low <= r && r < encode2High:
  185. if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
  186. goto write2
  187. }
  188. case encode3Low <= r && r < encode3High:
  189. if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
  190. goto write2
  191. }
  192. case encode4Low <= r && r < encode4High:
  193. if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
  194. goto write2
  195. }
  196. }
  197. if e.gb18030 {
  198. if r < 0x10000 {
  199. i, j := 0, len(gb18030)
  200. for i < j {
  201. h := i + (j-i)/2
  202. if r >= rune(gb18030[h][1]) {
  203. i = h + 1
  204. } else {
  205. j = h
  206. }
  207. }
  208. dec := &gb18030[i-1]
  209. r += rune(dec[0]) - rune(dec[1])
  210. goto write4
  211. } else if r < 0x110000 {
  212. r += 189000 - 0x10000
  213. goto write4
  214. }
  215. }
  216. err = internal.ErrASCIIReplacement
  217. break
  218. }
  219. write1:
  220. if nDst >= len(dst) {
  221. err = transform.ErrShortDst
  222. break
  223. }
  224. dst[nDst] = uint8(r)
  225. nDst++
  226. continue
  227. write2:
  228. if nDst+2 > len(dst) {
  229. err = transform.ErrShortDst
  230. break
  231. }
  232. dst[nDst+0] = uint8(r2 >> 8)
  233. dst[nDst+1] = uint8(r2)
  234. nDst += 2
  235. continue
  236. write4:
  237. if nDst+4 > len(dst) {
  238. err = transform.ErrShortDst
  239. break
  240. }
  241. dst[nDst+3] = uint8(r%10 + 0x30)
  242. r /= 10
  243. dst[nDst+2] = uint8(r%126 + 0x81)
  244. r /= 126
  245. dst[nDst+1] = uint8(r%10 + 0x30)
  246. r /= 10
  247. dst[nDst+0] = uint8(r + 0x81)
  248. nDst += 4
  249. continue
  250. }
  251. return nDst, nSrc, err
  252. }
  253. func init() {
  254. // Check that the hard-coded encode switch covers all tables.
  255. if numEncodeTables != 5 {
  256. panic("bad numEncodeTables")
  257. }
  258. }