iter_str.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. package jsoniter
  2. import (
  3. "fmt"
  4. "unicode/utf16"
  5. )
  6. // ReadString read string from iterator
  7. func (iter *Iterator) ReadString() (ret string) {
  8. c := iter.nextToken()
  9. if c == '"' {
  10. for i := iter.head; i < iter.tail; i++ {
  11. c := iter.buf[i]
  12. if c == '"' {
  13. ret = string(iter.buf[iter.head:i])
  14. iter.head = i + 1
  15. return ret
  16. } else if c == '\\' {
  17. break
  18. } else if c < ' ' {
  19. iter.ReportError("ReadString",
  20. fmt.Sprintf(`invalid control character found: %d`, c))
  21. return
  22. }
  23. }
  24. return iter.readStringSlowPath()
  25. } else if c == 'n' {
  26. iter.skipThreeBytes('u', 'l', 'l')
  27. return ""
  28. }
  29. iter.ReportError("ReadString", `expects " or n, but found `+string([]byte{c}))
  30. return
  31. }
  32. func (iter *Iterator) readStringSlowPath() (ret string) {
  33. var str []byte
  34. var c byte
  35. for iter.Error == nil {
  36. c = iter.readByte()
  37. if c == '"' {
  38. return string(str)
  39. }
  40. if c == '\\' {
  41. c = iter.readByte()
  42. str = iter.readEscapedChar(c, str)
  43. } else {
  44. str = append(str, c)
  45. }
  46. }
  47. iter.ReportError("readStringSlowPath", "unexpected end of input")
  48. return
  49. }
  50. func (iter *Iterator) readEscapedChar(c byte, str []byte) []byte {
  51. switch c {
  52. case 'u':
  53. r := iter.readU4()
  54. if utf16.IsSurrogate(r) {
  55. c = iter.readByte()
  56. if iter.Error != nil {
  57. return nil
  58. }
  59. if c != '\\' {
  60. iter.unreadByte()
  61. str = appendRune(str, r)
  62. return str
  63. }
  64. c = iter.readByte()
  65. if iter.Error != nil {
  66. return nil
  67. }
  68. if c != 'u' {
  69. str = appendRune(str, r)
  70. return iter.readEscapedChar(c, str)
  71. }
  72. r2 := iter.readU4()
  73. if iter.Error != nil {
  74. return nil
  75. }
  76. combined := utf16.DecodeRune(r, r2)
  77. if combined == '\uFFFD' {
  78. str = appendRune(str, r)
  79. str = appendRune(str, r2)
  80. } else {
  81. str = appendRune(str, combined)
  82. }
  83. } else {
  84. str = appendRune(str, r)
  85. }
  86. case '"':
  87. str = append(str, '"')
  88. case '\\':
  89. str = append(str, '\\')
  90. case '/':
  91. str = append(str, '/')
  92. case 'b':
  93. str = append(str, '\b')
  94. case 'f':
  95. str = append(str, '\f')
  96. case 'n':
  97. str = append(str, '\n')
  98. case 'r':
  99. str = append(str, '\r')
  100. case 't':
  101. str = append(str, '\t')
  102. default:
  103. iter.ReportError("readEscapedChar",
  104. `invalid escape char after \`)
  105. return nil
  106. }
  107. return str
  108. }
  109. // ReadStringAsSlice read string from iterator without copying into string form.
  110. // The []byte can not be kept, as it will change after next iterator call.
  111. func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
  112. c := iter.nextToken()
  113. if c == '"' {
  114. for i := iter.head; i < iter.tail; i++ {
  115. // require ascii string and no escape
  116. // for: field name, base64, number
  117. if iter.buf[i] == '"' {
  118. // fast path: reuse the underlying buffer
  119. ret = iter.buf[iter.head:i]
  120. iter.head = i + 1
  121. return ret
  122. }
  123. }
  124. readLen := iter.tail - iter.head
  125. copied := make([]byte, readLen, readLen*2)
  126. copy(copied, iter.buf[iter.head:iter.tail])
  127. iter.head = iter.tail
  128. for iter.Error == nil {
  129. c := iter.readByte()
  130. if c == '"' {
  131. return copied
  132. }
  133. copied = append(copied, c)
  134. }
  135. return copied
  136. }
  137. iter.ReportError("ReadStringAsSlice", `expects " or n, but found `+string([]byte{c}))
  138. return
  139. }
  140. func (iter *Iterator) readU4() (ret rune) {
  141. for i := 0; i < 4; i++ {
  142. c := iter.readByte()
  143. if iter.Error != nil {
  144. return
  145. }
  146. if c >= '0' && c <= '9' {
  147. ret = ret*16 + rune(c-'0')
  148. } else if c >= 'a' && c <= 'f' {
  149. ret = ret*16 + rune(c-'a'+10)
  150. } else if c >= 'A' && c <= 'F' {
  151. ret = ret*16 + rune(c-'A'+10)
  152. } else {
  153. iter.ReportError("readU4", "expects 0~9 or a~f, but found "+string([]byte{c}))
  154. return
  155. }
  156. }
  157. return ret
  158. }
  159. const (
  160. t1 = 0x00 // 0000 0000
  161. tx = 0x80 // 1000 0000
  162. t2 = 0xC0 // 1100 0000
  163. t3 = 0xE0 // 1110 0000
  164. t4 = 0xF0 // 1111 0000
  165. t5 = 0xF8 // 1111 1000
  166. maskx = 0x3F // 0011 1111
  167. mask2 = 0x1F // 0001 1111
  168. mask3 = 0x0F // 0000 1111
  169. mask4 = 0x07 // 0000 0111
  170. rune1Max = 1<<7 - 1
  171. rune2Max = 1<<11 - 1
  172. rune3Max = 1<<16 - 1
  173. surrogateMin = 0xD800
  174. surrogateMax = 0xDFFF
  175. maxRune = '\U0010FFFF' // Maximum valid Unicode code point.
  176. runeError = '\uFFFD' // the "error" Rune or "Unicode replacement character"
  177. )
  178. func appendRune(p []byte, r rune) []byte {
  179. // Negative values are erroneous. Making it unsigned addresses the problem.
  180. switch i := uint32(r); {
  181. case i <= rune1Max:
  182. p = append(p, byte(r))
  183. return p
  184. case i <= rune2Max:
  185. p = append(p, t2|byte(r>>6))
  186. p = append(p, tx|byte(r)&maskx)
  187. return p
  188. case i > maxRune, surrogateMin <= i && i <= surrogateMax:
  189. r = runeError
  190. fallthrough
  191. case i <= rune3Max:
  192. p = append(p, t3|byte(r>>12))
  193. p = append(p, tx|byte(r>>6)&maskx)
  194. p = append(p, tx|byte(r)&maskx)
  195. return p
  196. default:
  197. p = append(p, t4|byte(r>>18))
  198. p = append(p, tx|byte(r>>12)&maskx)
  199. p = append(p, tx|byte(r>>6)&maskx)
  200. p = append(p, tx|byte(r)&maskx)
  201. return p
  202. }
  203. }