runes.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package runes provides transforms for UTF-8 encoded text.
  5. package runes // import "golang.org/x/text/runes"
  6. import (
  7. "unicode"
  8. "unicode/utf8"
  9. "golang.org/x/text/transform"
  10. )
  // Set is a collection of runes that can be queried for membership.
  type Set interface {
  	// Contains reports whether the rune r is a member of the set.
  	Contains(r rune) bool
  }
  // setFunc adapts a plain predicate function so that it can be used as a Set.
  type setFunc func(rune) bool

  // Contains reports membership of r by invoking the wrapped predicate.
  func (f setFunc) Contains(r rune) bool {
  	member := f(r)
  	return member
  }
  20. // Note: using funcs here instead of wrapping types results in cleaner
  21. // documentation and a smaller API.
  22. // In creates a Set with a Contains method that returns true for all runes in
  23. // the given RangeTable.
  24. func In(rt *unicode.RangeTable) Set {
  25. return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
  26. }
  27. // In creates a Set with a Contains method that returns true for all runes not
  28. // in the given RangeTable.
  29. func NotIn(rt *unicode.RangeTable) Set {
  30. return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
  31. }
  32. // Predicate creates a Set with a Contains method that returns f(r).
  33. func Predicate(f func(rune) bool) Set {
  34. return setFunc(f)
  35. }
  36. // Transformer implements the transform.Transformer interface.
  37. type Transformer struct {
  38. t transform.SpanningTransformer
  39. }
  40. func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  41. return t.t.Transform(dst, src, atEOF)
  42. }
  43. func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
  44. return t.t.Span(b, atEOF)
  45. }
  46. func (t Transformer) Reset() { t.t.Reset() }
  47. // Bytes returns a new byte slice with the result of converting b using t. It
  48. // calls Reset on t. It returns nil if any error was found. This can only happen
  49. // if an error-producing Transformer is passed to If.
  50. func (t Transformer) Bytes(b []byte) []byte {
  51. b, _, err := transform.Bytes(t, b)
  52. if err != nil {
  53. return nil
  54. }
  55. return b
  56. }
  57. // String returns a string with the result of converting s using t. It calls
  58. // Reset on t. It returns the empty string if any error was found. This can only
  59. // happen if an error-producing Transformer is passed to If.
  60. func (t Transformer) String(s string) string {
  61. s, _, err := transform.String(t, s)
  62. if err != nil {
  63. return ""
  64. }
  65. return s
  66. }
  67. // TODO:
  68. // - Copy: copying strings and bytes in whole-rune units.
  69. // - Validation (maybe)
  70. // - Well-formed-ness (maybe)
  // runeErrorString is the UTF-8 encoding of utf8.RuneError. The transformers
  // in this file copy it into the output in place of ill-formed input bytes.
  const (
  	runeErrorString = string(utf8.RuneError)
  )
  72. // Remove returns a Transformer that removes runes r for which s.Contains(r).
  73. // Illegal input bytes are replaced by RuneError before being passed to f.
  74. func Remove(s Set) Transformer {
  75. if f, ok := s.(setFunc); ok {
  76. // This little trick cuts the running time of BenchmarkRemove for sets
  77. // created by Predicate roughly in half.
  78. // TODO: special-case RangeTables as well.
  79. return Transformer{remove(f)}
  80. }
  81. return Transformer{remove(s.Contains)}
  82. }
  83. // TODO: remove transform.RemoveFunc.
  // remove is a transformer backed by a predicate: runes for which the
  // predicate returns true are dropped from the output.
  type remove func(rune) bool

  // Reset does nothing.
  func (remove) Reset() {
  }
  86. // Span implements transform.Spanner.
  87. func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
  88. for r, size := rune(0), 0; n < len(src); {
  89. if r = rune(src[n]); r < utf8.RuneSelf {
  90. size = 1
  91. } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
  92. // Invalid rune.
  93. if !atEOF && !utf8.FullRune(src[n:]) {
  94. err = transform.ErrShortSrc
  95. } else {
  96. err = transform.ErrEndOfSpan
  97. }
  98. break
  99. }
  100. if t(r) {
  101. err = transform.ErrEndOfSpan
  102. break
  103. }
  104. n += size
  105. }
  106. return
  107. }
  108. // Transform implements transform.Transformer.
  109. func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  110. for r, size := rune(0), 0; nSrc < len(src); {
  111. if r = rune(src[nSrc]); r < utf8.RuneSelf {
  112. size = 1
  113. } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
  114. // Invalid rune.
  115. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  116. err = transform.ErrShortSrc
  117. break
  118. }
  119. // We replace illegal bytes with RuneError. Not doing so might
  120. // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
  121. // The resulting byte sequence may subsequently contain runes
  122. // for which t(r) is true that were passed unnoticed.
  123. if !t(utf8.RuneError) {
  124. if nDst+3 > len(dst) {
  125. err = transform.ErrShortDst
  126. break
  127. }
  128. dst[nDst+0] = runeErrorString[0]
  129. dst[nDst+1] = runeErrorString[1]
  130. dst[nDst+2] = runeErrorString[2]
  131. nDst += 3
  132. }
  133. nSrc++
  134. continue
  135. }
  136. if t(r) {
  137. nSrc += size
  138. continue
  139. }
  140. if nDst+size > len(dst) {
  141. err = transform.ErrShortDst
  142. break
  143. }
  144. for i := 0; i < size; i++ {
  145. dst[nDst] = src[nSrc]
  146. nDst++
  147. nSrc++
  148. }
  149. }
  150. return
  151. }
  152. // Map returns a Transformer that maps the runes in the input using the given
  153. // mapping. Illegal bytes in the input are converted to utf8.RuneError before
  154. // being passed to the mapping func.
  155. func Map(mapping func(rune) rune) Transformer {
  156. return Transformer{mapper(mapping)}
  157. }
  // mapper is a transformer backed by a rune-to-rune mapping function.
  type mapper func(rune) rune

  // Reset does nothing.
  func (mapper) Reset() {
  }
  160. // Span implements transform.Spanner.
  161. func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
  162. for r, size := rune(0), 0; n < len(src); n += size {
  163. if r = rune(src[n]); r < utf8.RuneSelf {
  164. size = 1
  165. } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
  166. // Invalid rune.
  167. if !atEOF && !utf8.FullRune(src[n:]) {
  168. err = transform.ErrShortSrc
  169. } else {
  170. err = transform.ErrEndOfSpan
  171. }
  172. break
  173. }
  174. if t(r) != r {
  175. err = transform.ErrEndOfSpan
  176. break
  177. }
  178. }
  179. return n, err
  180. }
  181. // Transform implements transform.Transformer.
  182. func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  183. var replacement rune
  184. var b [utf8.UTFMax]byte
  185. for r, size := rune(0), 0; nSrc < len(src); {
  186. if r = rune(src[nSrc]); r < utf8.RuneSelf {
  187. if replacement = t(r); replacement < utf8.RuneSelf {
  188. if nDst == len(dst) {
  189. err = transform.ErrShortDst
  190. break
  191. }
  192. dst[nDst] = byte(replacement)
  193. nDst++
  194. nSrc++
  195. continue
  196. }
  197. size = 1
  198. } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
  199. // Invalid rune.
  200. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  201. err = transform.ErrShortSrc
  202. break
  203. }
  204. if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
  205. if nDst+3 > len(dst) {
  206. err = transform.ErrShortDst
  207. break
  208. }
  209. dst[nDst+0] = runeErrorString[0]
  210. dst[nDst+1] = runeErrorString[1]
  211. dst[nDst+2] = runeErrorString[2]
  212. nDst += 3
  213. nSrc++
  214. continue
  215. }
  216. } else if replacement = t(r); replacement == r {
  217. if nDst+size > len(dst) {
  218. err = transform.ErrShortDst
  219. break
  220. }
  221. for i := 0; i < size; i++ {
  222. dst[nDst] = src[nSrc]
  223. nDst++
  224. nSrc++
  225. }
  226. continue
  227. }
  228. n := utf8.EncodeRune(b[:], replacement)
  229. if nDst+n > len(dst) {
  230. err = transform.ErrShortDst
  231. break
  232. }
  233. for i := 0; i < n; i++ {
  234. dst[nDst] = b[i]
  235. nDst++
  236. }
  237. nSrc += size
  238. }
  239. return
  240. }
  241. // ReplaceIllFormed returns a transformer that replaces all input bytes that are
  242. // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
  243. func ReplaceIllFormed() Transformer {
  244. return Transformer{&replaceIllFormed{}}
  245. }
  246. type replaceIllFormed struct{ transform.NopResetter }
  247. func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
  248. for n < len(src) {
  249. // ASCII fast path.
  250. if src[n] < utf8.RuneSelf {
  251. n++
  252. continue
  253. }
  254. r, size := utf8.DecodeRune(src[n:])
  255. // Look for a valid non-ASCII rune.
  256. if r != utf8.RuneError || size != 1 {
  257. n += size
  258. continue
  259. }
  260. // Look for short source data.
  261. if !atEOF && !utf8.FullRune(src[n:]) {
  262. err = transform.ErrShortSrc
  263. break
  264. }
  265. // We have an invalid rune.
  266. err = transform.ErrEndOfSpan
  267. break
  268. }
  269. return n, err
  270. }
  271. func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  272. for nSrc < len(src) {
  273. // ASCII fast path.
  274. if r := src[nSrc]; r < utf8.RuneSelf {
  275. if nDst == len(dst) {
  276. err = transform.ErrShortDst
  277. break
  278. }
  279. dst[nDst] = r
  280. nDst++
  281. nSrc++
  282. continue
  283. }
  284. // Look for a valid non-ASCII rune.
  285. if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
  286. if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
  287. err = transform.ErrShortDst
  288. break
  289. }
  290. nDst += size
  291. nSrc += size
  292. continue
  293. }
  294. // Look for short source data.
  295. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  296. err = transform.ErrShortSrc
  297. break
  298. }
  299. // We have an invalid rune.
  300. if nDst+3 > len(dst) {
  301. err = transform.ErrShortDst
  302. break
  303. }
  304. dst[nDst+0] = runeErrorString[0]
  305. dst[nDst+1] = runeErrorString[1]
  306. dst[nDst+2] = runeErrorString[2]
  307. nDst += 3
  308. nSrc++
  309. }
  310. return nDst, nSrc, err
  311. }