123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package runes provides transforms for UTF-8 encoded text.
- package runes // import "golang.org/x/text/runes"
- import (
- "unicode"
- "unicode/utf8"
- "golang.org/x/text/transform"
- )
// Set describes a collection of runes through a single membership test.
type Set interface {
	// Contains reports whether the rune r belongs to the set.
	Contains(r rune) bool
}
// setFunc adapts a plain rune predicate so that it can be used wherever a
// value with a Contains method is required.
type setFunc func(rune) bool

// Contains reports the result of calling the underlying predicate with r.
func (f setFunc) Contains(r rune) bool {
	contained := f(r)
	return contained
}
- // Note: using funcs here instead of wrapping types results in cleaner
- // documentation and a smaller API.
- // In creates a Set with a Contains method that returns true for all runes in
- // the given RangeTable.
- func In(rt *unicode.RangeTable) Set {
- return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
- }
- // In creates a Set with a Contains method that returns true for all runes not
- // in the given RangeTable.
- func NotIn(rt *unicode.RangeTable) Set {
- return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
- }
- // Predicate creates a Set with a Contains method that returns f(r).
- func Predicate(f func(rune) bool) Set {
- return setFunc(f)
- }
- // Transformer implements the transform.Transformer interface.
- type Transformer struct {
- t transform.SpanningTransformer
- }
- func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- return t.t.Transform(dst, src, atEOF)
- }
- func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
- return t.t.Span(b, atEOF)
- }
- func (t Transformer) Reset() { t.t.Reset() }
- // Bytes returns a new byte slice with the result of converting b using t. It
- // calls Reset on t. It returns nil if any error was found. This can only happen
- // if an error-producing Transformer is passed to If.
- func (t Transformer) Bytes(b []byte) []byte {
- b, _, err := transform.Bytes(t, b)
- if err != nil {
- return nil
- }
- return b
- }
- // String returns a string with the result of converting s using t. It calls
- // Reset on t. It returns the empty string if any error was found. This can only
- // happen if an error-producing Transformer is passed to If.
- func (t Transformer) String(s string) string {
- s, _, err := transform.String(t, s)
- if err != nil {
- return ""
- }
- return s
- }
- // TODO:
- // - Copy: copying strings and bytes in whole-rune units.
- // - Validation (maybe)
- // - Well-formed-ness (maybe)
- const runeErrorString = string(utf8.RuneError)
- // Remove returns a Transformer that removes runes r for which s.Contains(r).
- // Illegal input bytes are replaced by RuneError before being passed to f.
- func Remove(s Set) Transformer {
- if f, ok := s.(setFunc); ok {
- // This little trick cuts the running time of BenchmarkRemove for sets
- // created by Predicate roughly in half.
- // TODO: special-case RangeTables as well.
- return Transformer{remove(f)}
- }
- return Transformer{remove(s.Contains)}
- }
- // TODO: remove transform.RemoveFunc.
// remove is a rune predicate used as a transformer: runes for which the
// predicate returns true are the ones to be removed.
type remove func(r rune) bool

// Reset implements transform.Transformer. It does nothing.
func (remove) Reset() {
}
- // Span implements transform.Spanner.
- func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
- for r, size := rune(0), 0; n < len(src); {
- if r = rune(src[n]); r < utf8.RuneSelf {
- size = 1
- } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
- // Invalid rune.
- if !atEOF && !utf8.FullRune(src[n:]) {
- err = transform.ErrShortSrc
- } else {
- err = transform.ErrEndOfSpan
- }
- break
- }
- if t(r) {
- err = transform.ErrEndOfSpan
- break
- }
- n += size
- }
- return
- }
- // Transform implements transform.Transformer.
- func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- for r, size := rune(0), 0; nSrc < len(src); {
- if r = rune(src[nSrc]); r < utf8.RuneSelf {
- size = 1
- } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
- // Invalid rune.
- if !atEOF && !utf8.FullRune(src[nSrc:]) {
- err = transform.ErrShortSrc
- break
- }
- // We replace illegal bytes with RuneError. Not doing so might
- // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
- // The resulting byte sequence may subsequently contain runes
- // for which t(r) is true that were passed unnoticed.
- if !t(utf8.RuneError) {
- if nDst+3 > len(dst) {
- err = transform.ErrShortDst
- break
- }
- dst[nDst+0] = runeErrorString[0]
- dst[nDst+1] = runeErrorString[1]
- dst[nDst+2] = runeErrorString[2]
- nDst += 3
- }
- nSrc++
- continue
- }
- if t(r) {
- nSrc += size
- continue
- }
- if nDst+size > len(dst) {
- err = transform.ErrShortDst
- break
- }
- for i := 0; i < size; i++ {
- dst[nDst] = src[nSrc]
- nDst++
- nSrc++
- }
- }
- return
- }
- // Map returns a Transformer that maps the runes in the input using the given
- // mapping. Illegal bytes in the input are converted to utf8.RuneError before
- // being passed to the mapping func.
- func Map(mapping func(rune) rune) Transformer {
- return Transformer{mapper(mapping)}
- }
// mapper is a rune-to-rune mapping function used as a transformer.
type mapper func(rune) rune

// Reset implements transform.Transformer. It does nothing.
func (mapper) Reset() {
}
- // Span implements transform.Spanner.
- func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
- for r, size := rune(0), 0; n < len(src); n += size {
- if r = rune(src[n]); r < utf8.RuneSelf {
- size = 1
- } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
- // Invalid rune.
- if !atEOF && !utf8.FullRune(src[n:]) {
- err = transform.ErrShortSrc
- } else {
- err = transform.ErrEndOfSpan
- }
- break
- }
- if t(r) != r {
- err = transform.ErrEndOfSpan
- break
- }
- }
- return n, err
- }
- // Transform implements transform.Transformer.
- func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- var replacement rune
- var b [utf8.UTFMax]byte
- for r, size := rune(0), 0; nSrc < len(src); {
- if r = rune(src[nSrc]); r < utf8.RuneSelf {
- if replacement = t(r); replacement < utf8.RuneSelf {
- if nDst == len(dst) {
- err = transform.ErrShortDst
- break
- }
- dst[nDst] = byte(replacement)
- nDst++
- nSrc++
- continue
- }
- size = 1
- } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
- // Invalid rune.
- if !atEOF && !utf8.FullRune(src[nSrc:]) {
- err = transform.ErrShortSrc
- break
- }
- if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
- if nDst+3 > len(dst) {
- err = transform.ErrShortDst
- break
- }
- dst[nDst+0] = runeErrorString[0]
- dst[nDst+1] = runeErrorString[1]
- dst[nDst+2] = runeErrorString[2]
- nDst += 3
- nSrc++
- continue
- }
- } else if replacement = t(r); replacement == r {
- if nDst+size > len(dst) {
- err = transform.ErrShortDst
- break
- }
- for i := 0; i < size; i++ {
- dst[nDst] = src[nSrc]
- nDst++
- nSrc++
- }
- continue
- }
- n := utf8.EncodeRune(b[:], replacement)
- if nDst+n > len(dst) {
- err = transform.ErrShortDst
- break
- }
- for i := 0; i < n; i++ {
- dst[nDst] = b[i]
- nDst++
- }
- nSrc += size
- }
- return
- }
- // ReplaceIllFormed returns a transformer that replaces all input bytes that are
- // not part of a well-formed UTF-8 code sequence with utf8.RuneError.
- func ReplaceIllFormed() Transformer {
- return Transformer{&replaceIllFormed{}}
- }
- type replaceIllFormed struct{ transform.NopResetter }
- func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
- for n < len(src) {
- // ASCII fast path.
- if src[n] < utf8.RuneSelf {
- n++
- continue
- }
- r, size := utf8.DecodeRune(src[n:])
- // Look for a valid non-ASCII rune.
- if r != utf8.RuneError || size != 1 {
- n += size
- continue
- }
- // Look for short source data.
- if !atEOF && !utf8.FullRune(src[n:]) {
- err = transform.ErrShortSrc
- break
- }
- // We have an invalid rune.
- err = transform.ErrEndOfSpan
- break
- }
- return n, err
- }
- func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- for nSrc < len(src) {
- // ASCII fast path.
- if r := src[nSrc]; r < utf8.RuneSelf {
- if nDst == len(dst) {
- err = transform.ErrShortDst
- break
- }
- dst[nDst] = r
- nDst++
- nSrc++
- continue
- }
- // Look for a valid non-ASCII rune.
- if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
- if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
- err = transform.ErrShortDst
- break
- }
- nDst += size
- nSrc += size
- continue
- }
- // Look for short source data.
- if !atEOF && !utf8.FullRune(src[nSrc:]) {
- err = transform.ErrShortSrc
- break
- }
- // We have an invalid rune.
- if nDst+3 > len(dst) {
- err = transform.ErrShortDst
- break
- }
- dst[nDst+0] = runeErrorString[0]
- dst[nDst+1] = runeErrorString[1]
- dst[nDst+2] = runeErrorString[2]
- nDst += 3
- nSrc++
- }
- return nDst, nSrc, err
- }
|