parse.go 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "bytes"
  7. "errors"
  8. "fmt"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. "golang.org/x/text/internal/tag"
  13. )
  // isAlpha reports whether b is a letter rather than a digit.
  // b must be an ASCII letter or digit; under that precondition every letter
  // sorts after '9', so a single comparison is enough.
  func isAlpha(b byte) bool {
  	return '9' < b
  }
  // isAlphaNum reports whether every byte of s is an ASCII letter or digit.
  // An empty or nil s yields true.
  func isAlphaNum(s []byte) bool {
  	for i := 0; i < len(s); i++ {
  		switch c := s[i]; {
  		case 'a' <= c && c <= 'z':
  		case 'A' <= c && c <= 'Z':
  		case '0' <= c && c <= '9':
  		default:
  			return false
  		}
  	}
  	return true
  }
  // errSyntax is the error reported by the parsing functions for input that is
  // not well-formed according to BCP 47.
  // TODO: return the position at which the syntax error occurred?
  var errSyntax = errors.New("language: tag is not well-formed")
  // ValueError is returned by any of the parsing functions when the input is
  // well-formed but the respective subtag is not recognized as a valid value.
  type ValueError struct {
  	v [8]byte // offending subtag, zero-padded when shorter than 8 bytes
  }

  // mkErrInvalid wraps the subtag s in a ValueError. At most the first 8 bytes
  // of s are retained.
  func mkErrInvalid(s []byte) error {
  	e := ValueError{}
  	copy(e.v[:], s)
  	return e
  }

  // tag returns the stored subtag up to, but not including, the first zero
  // byte, or all 8 bytes if there is none.
  func (e ValueError) tag() []byte {
  	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
  		return e.v[:n]
  	}
  	return e.v[:]
  }

  // Error implements the error interface.
  func (e ValueError) Error() string {
  	subtag := e.tag()
  	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
  }

  // Subtag returns the subtag for which the error occurred.
  func (e ValueError) Subtag() string {
  	subtag := e.tag()
  	return string(subtag)
  }
  58. // scanner is used to scan BCP 47 tokens, which are separated by _ or -.
  59. type scanner struct {
  60. b []byte
  61. bytes [max99thPercentileSize]byte
  62. token []byte
  63. start int // start position of the current token
  64. end int // end position of the current token
  65. next int // next point for scan
  66. err error
  67. done bool
  68. }
  69. func makeScannerString(s string) scanner {
  70. scan := scanner{}
  71. if len(s) <= len(scan.bytes) {
  72. scan.b = scan.bytes[:copy(scan.bytes[:], s)]
  73. } else {
  74. scan.b = []byte(s)
  75. }
  76. scan.init()
  77. return scan
  78. }
  79. // makeScanner returns a scanner using b as the input buffer.
  80. // b is not copied and may be modified by the scanner routines.
  81. func makeScanner(b []byte) scanner {
  82. scan := scanner{b: b}
  83. scan.init()
  84. return scan
  85. }
  86. func (s *scanner) init() {
  87. for i, c := range s.b {
  88. if c == '_' {
  89. s.b[i] = '-'
  90. }
  91. }
  92. s.scan()
  93. }
  94. // restToLower converts the string between start and end to lower case.
  95. func (s *scanner) toLower(start, end int) {
  96. for i := start; i < end; i++ {
  97. c := s.b[i]
  98. if 'A' <= c && c <= 'Z' {
  99. s.b[i] += 'a' - 'A'
  100. }
  101. }
  102. }
  103. func (s *scanner) setError(e error) {
  104. if s.err == nil || (e == errSyntax && s.err != errSyntax) {
  105. s.err = e
  106. }
  107. }
  108. // resizeRange shrinks or grows the array at position oldStart such that
  109. // a new string of size newSize can fit between oldStart and oldEnd.
  110. // Sets the scan point to after the resized range.
  111. func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
  112. s.start = oldStart
  113. if end := oldStart + newSize; end != oldEnd {
  114. diff := end - oldEnd
  115. if end < cap(s.b) {
  116. b := make([]byte, len(s.b)+diff)
  117. copy(b, s.b[:oldStart])
  118. copy(b[end:], s.b[oldEnd:])
  119. s.b = b
  120. } else {
  121. s.b = append(s.b[end:], s.b[oldEnd:]...)
  122. }
  123. s.next = end + (s.next - s.end)
  124. s.end = end
  125. }
  126. }
  127. // replace replaces the current token with repl.
  128. func (s *scanner) replace(repl string) {
  129. s.resizeRange(s.start, s.end, len(repl))
  130. copy(s.b[s.start:], repl)
  131. }
  132. // gobble removes the current token from the input.
  133. // Caller must call scan after calling gobble.
  134. func (s *scanner) gobble(e error) {
  135. s.setError(e)
  136. if s.start == 0 {
  137. s.b = s.b[:+copy(s.b, s.b[s.next:])]
  138. s.end = 0
  139. } else {
  140. s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
  141. s.end = s.start - 1
  142. }
  143. s.next = s.start
  144. }
  145. // deleteRange removes the given range from s.b before the current token.
  146. func (s *scanner) deleteRange(start, end int) {
  147. s.setError(errSyntax)
  148. s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
  149. diff := end - start
  150. s.next -= diff
  151. s.start -= diff
  152. s.end -= diff
  153. }
  154. // scan parses the next token of a BCP 47 string. Tokens that are larger
  155. // than 8 characters or include non-alphanumeric characters result in an error
  156. // and are gobbled and removed from the output.
  157. // It returns the end position of the last token consumed.
  158. func (s *scanner) scan() (end int) {
  159. end = s.end
  160. s.token = nil
  161. for s.start = s.next; s.next < len(s.b); {
  162. i := bytes.IndexByte(s.b[s.next:], '-')
  163. if i == -1 {
  164. s.end = len(s.b)
  165. s.next = len(s.b)
  166. i = s.end - s.start
  167. } else {
  168. s.end = s.next + i
  169. s.next = s.end + 1
  170. }
  171. token := s.b[s.start:s.end]
  172. if i < 1 || i > 8 || !isAlphaNum(token) {
  173. s.gobble(errSyntax)
  174. continue
  175. }
  176. s.token = token
  177. return end
  178. }
  179. if n := len(s.b); n > 0 && s.b[n-1] == '-' {
  180. s.setError(errSyntax)
  181. s.b = s.b[:len(s.b)-1]
  182. }
  183. s.done = true
  184. return end
  185. }
  186. // acceptMinSize parses multiple tokens of the given size or greater.
  187. // It returns the end position of the last token consumed.
  188. func (s *scanner) acceptMinSize(min int) (end int) {
  189. end = s.end
  190. s.scan()
  191. for ; len(s.token) >= min; s.scan() {
  192. end = s.end
  193. }
  194. return end
  195. }
  196. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  197. // failed it returns an error and any part of the tag that could be parsed.
  198. // If parsing succeeded but an unknown value was found, it returns
  199. // ValueError. The Tag returned in this case is just stripped of the unknown
  200. // value. All other values are preserved. It accepts tags in the BCP 47 format
  201. // and extensions to this standard defined in
  202. // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  203. // The resulting tag is canonicalized using the default canonicalization type.
  204. func Parse(s string) (t Tag, err error) {
  205. return Default.Parse(s)
  206. }
  207. // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
  208. // failed it returns an error and any part of the tag that could be parsed.
  209. // If parsing succeeded but an unknown value was found, it returns
  210. // ValueError. The Tag returned in this case is just stripped of the unknown
  211. // value. All other values are preserved. It accepts tags in the BCP 47 format
  212. // and extensions to this standard defined in
  213. // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  214. // The resulting tag is canonicalized using the the canonicalization type c.
  215. func (c CanonType) Parse(s string) (t Tag, err error) {
  216. // TODO: consider supporting old-style locale key-value pairs.
  217. if s == "" {
  218. return und, errSyntax
  219. }
  220. if len(s) <= maxAltTaglen {
  221. b := [maxAltTaglen]byte{}
  222. for i, c := range s {
  223. // Generating invalid UTF-8 is okay as it won't match.
  224. if 'A' <= c && c <= 'Z' {
  225. c += 'a' - 'A'
  226. } else if c == '_' {
  227. c = '-'
  228. }
  229. b[i] = byte(c)
  230. }
  231. if t, ok := grandfathered(b); ok {
  232. return t, nil
  233. }
  234. }
  235. scan := makeScannerString(s)
  236. t, err = parse(&scan, s)
  237. t, changed := t.canonicalize(c)
  238. if changed {
  239. t.remakeString()
  240. }
  241. return t, err
  242. }
  243. func parse(scan *scanner, s string) (t Tag, err error) {
  244. t = und
  245. var end int
  246. if n := len(scan.token); n <= 1 {
  247. scan.toLower(0, len(scan.b))
  248. if n == 0 || scan.token[0] != 'x' {
  249. return t, errSyntax
  250. }
  251. end = parseExtensions(scan)
  252. } else if n >= 4 {
  253. return und, errSyntax
  254. } else { // the usual case
  255. t, end = parseTag(scan)
  256. if n := len(scan.token); n == 1 {
  257. t.pExt = uint16(end)
  258. end = parseExtensions(scan)
  259. } else if end < len(scan.b) {
  260. scan.setError(errSyntax)
  261. scan.b = scan.b[:end]
  262. }
  263. }
  264. if int(t.pVariant) < len(scan.b) {
  265. if end < len(s) {
  266. s = s[:end]
  267. }
  268. if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
  269. t.str = s
  270. } else {
  271. t.str = string(scan.b)
  272. }
  273. } else {
  274. t.pVariant, t.pExt = 0, 0
  275. }
  276. return t, scan.err
  277. }
  278. // parseTag parses language, script, region and variants.
  279. // It returns a Tag and the end position in the input that was parsed.
  280. func parseTag(scan *scanner) (t Tag, end int) {
  281. var e error
  282. // TODO: set an error if an unknown lang, script or region is encountered.
  283. t.lang, e = getLangID(scan.token)
  284. scan.setError(e)
  285. scan.replace(t.lang.String())
  286. langStart := scan.start
  287. end = scan.scan()
  288. for len(scan.token) == 3 && isAlpha(scan.token[0]) {
  289. // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
  290. // to a tag of the form <extlang>.
  291. lang, e := getLangID(scan.token)
  292. if lang != 0 {
  293. t.lang = lang
  294. copy(scan.b[langStart:], lang.String())
  295. scan.b[langStart+3] = '-'
  296. scan.start = langStart + 4
  297. }
  298. scan.gobble(e)
  299. end = scan.scan()
  300. }
  301. if len(scan.token) == 4 && isAlpha(scan.token[0]) {
  302. t.script, e = getScriptID(script, scan.token)
  303. if t.script == 0 {
  304. scan.gobble(e)
  305. }
  306. end = scan.scan()
  307. }
  308. if n := len(scan.token); n >= 2 && n <= 3 {
  309. t.region, e = getRegionID(scan.token)
  310. if t.region == 0 {
  311. scan.gobble(e)
  312. } else {
  313. scan.replace(t.region.String())
  314. }
  315. end = scan.scan()
  316. }
  317. scan.toLower(scan.start, len(scan.b))
  318. t.pVariant = byte(end)
  319. end = parseVariants(scan, end, t)
  320. t.pExt = uint16(end)
  321. return t, end
  322. }
  323. var separator = []byte{'-'}
  324. // parseVariants scans tokens as long as each token is a valid variant string.
  325. // Duplicate variants are removed.
  326. func parseVariants(scan *scanner, end int, t Tag) int {
  327. start := scan.start
  328. varIDBuf := [4]uint8{}
  329. variantBuf := [4][]byte{}
  330. varID := varIDBuf[:0]
  331. variant := variantBuf[:0]
  332. last := -1
  333. needSort := false
  334. for ; len(scan.token) >= 4; scan.scan() {
  335. // TODO: measure the impact of needing this conversion and redesign
  336. // the data structure if there is an issue.
  337. v, ok := variantIndex[string(scan.token)]
  338. if !ok {
  339. // unknown variant
  340. // TODO: allow user-defined variants?
  341. scan.gobble(mkErrInvalid(scan.token))
  342. continue
  343. }
  344. varID = append(varID, v)
  345. variant = append(variant, scan.token)
  346. if !needSort {
  347. if last < int(v) {
  348. last = int(v)
  349. } else {
  350. needSort = true
  351. // There is no legal combinations of more than 7 variants
  352. // (and this is by no means a useful sequence).
  353. const maxVariants = 8
  354. if len(varID) > maxVariants {
  355. break
  356. }
  357. }
  358. }
  359. end = scan.end
  360. }
  361. if needSort {
  362. sort.Sort(variantsSort{varID, variant})
  363. k, l := 0, -1
  364. for i, v := range varID {
  365. w := int(v)
  366. if l == w {
  367. // Remove duplicates.
  368. continue
  369. }
  370. varID[k] = varID[i]
  371. variant[k] = variant[i]
  372. k++
  373. l = w
  374. }
  375. if str := bytes.Join(variant[:k], separator); len(str) == 0 {
  376. end = start - 1
  377. } else {
  378. scan.resizeRange(start, end, len(str))
  379. copy(scan.b[scan.start:], str)
  380. end = scan.end
  381. }
  382. }
  383. return end
  384. }
  // variantsSort sorts two parallel slices, the ids in i and the values in v,
  // by ascending id. It implements sort.Interface.
  type variantsSort struct {
  	i []uint8  // sort keys
  	v [][]byte // values, kept aligned with i
  }

  // Len returns the number of ids.
  func (s variantsSort) Len() int {
  	n := len(s.i)
  	return n
  }

  // Swap exchanges entries i and j in both slices.
  func (s variantsSort) Swap(i, j int) {
  	s.v[i], s.v[j] = s.v[j], s.v[i]
  	s.i[i], s.i[j] = s.i[j], s.i[i]
  }

  // Less reports whether the id at i is smaller than the id at j.
  func (s variantsSort) Less(i, j int) bool {
  	return s.i[j] > s.i[i]
  }
  // bytesSort orders byte slices lexicographically. It implements
  // sort.Interface.
  type bytesSort [][]byte

  // Len returns the number of byte slices.
  func (b bytesSort) Len() int {
  	n := len(b)
  	return n
  }

  // Swap exchanges the slices at i and j.
  func (b bytesSort) Swap(i, j int) {
  	tmp := b[i]
  	b[i] = b[j]
  	b[j] = tmp
  }

  // Less reports whether the slice at i sorts before the slice at j.
  func (b bytesSort) Less(i, j int) bool {
  	return bytes.Compare(b[i], b[j]) < 0
  }
  409. // parseExtensions parses and normalizes the extensions in the buffer.
  410. // It returns the last position of scan.b that is part of any extension.
  411. // It also trims scan.b to remove excess parts accordingly.
  412. func parseExtensions(scan *scanner) int {
  413. start := scan.start
  414. exts := [][]byte{}
  415. private := []byte{}
  416. end := scan.end
  417. for len(scan.token) == 1 {
  418. extStart := scan.start
  419. ext := scan.token[0]
  420. end = parseExtension(scan)
  421. extension := scan.b[extStart:end]
  422. if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
  423. scan.setError(errSyntax)
  424. end = extStart
  425. continue
  426. } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
  427. scan.b = scan.b[:end]
  428. return end
  429. } else if ext == 'x' {
  430. private = extension
  431. break
  432. }
  433. exts = append(exts, extension)
  434. }
  435. sort.Sort(bytesSort(exts))
  436. if len(private) > 0 {
  437. exts = append(exts, private)
  438. }
  439. scan.b = scan.b[:start]
  440. if len(exts) > 0 {
  441. scan.b = append(scan.b, bytes.Join(exts, separator)...)
  442. } else if start > 0 {
  443. // Strip trailing '-'.
  444. scan.b = scan.b[:start-1]
  445. }
  446. return end
  447. }
  448. // parseExtension parses a single extension and returns the position of
  449. // the extension end.
  450. func parseExtension(scan *scanner) int {
  451. start, end := scan.start, scan.end
  452. switch scan.token[0] {
  453. case 'u':
  454. attrStart := end
  455. scan.scan()
  456. for last := []byte{}; len(scan.token) > 2; scan.scan() {
  457. if bytes.Compare(scan.token, last) != -1 {
  458. // Attributes are unsorted. Start over from scratch.
  459. p := attrStart + 1
  460. scan.next = p
  461. attrs := [][]byte{}
  462. for scan.scan(); len(scan.token) > 2; scan.scan() {
  463. attrs = append(attrs, scan.token)
  464. end = scan.end
  465. }
  466. sort.Sort(bytesSort(attrs))
  467. copy(scan.b[p:], bytes.Join(attrs, separator))
  468. break
  469. }
  470. last = scan.token
  471. end = scan.end
  472. }
  473. var last, key []byte
  474. for attrEnd := end; len(scan.token) == 2; last = key {
  475. key = scan.token
  476. keyEnd := scan.end
  477. end = scan.acceptMinSize(3)
  478. // TODO: check key value validity
  479. if keyEnd == end || bytes.Compare(key, last) != 1 {
  480. // We have an invalid key or the keys are not sorted.
  481. // Start scanning keys from scratch and reorder.
  482. p := attrEnd + 1
  483. scan.next = p
  484. keys := [][]byte{}
  485. for scan.scan(); len(scan.token) == 2; {
  486. keyStart, keyEnd := scan.start, scan.end
  487. end = scan.acceptMinSize(3)
  488. if keyEnd != end {
  489. keys = append(keys, scan.b[keyStart:end])
  490. } else {
  491. scan.setError(errSyntax)
  492. end = keyStart
  493. }
  494. }
  495. sort.Sort(bytesSort(keys))
  496. reordered := bytes.Join(keys, separator)
  497. if e := p + len(reordered); e < end {
  498. scan.deleteRange(e, end)
  499. end = e
  500. }
  501. copy(scan.b[p:], bytes.Join(keys, separator))
  502. break
  503. }
  504. }
  505. case 't':
  506. scan.scan()
  507. if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
  508. _, end = parseTag(scan)
  509. scan.toLower(start, end)
  510. }
  511. for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
  512. end = scan.acceptMinSize(3)
  513. }
  514. case 'x':
  515. end = scan.acceptMinSize(1)
  516. default:
  517. end = scan.acceptMinSize(2)
  518. }
  519. return end
  520. }
  521. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  522. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  523. // Base, Script or Region or slice of type Variant or Extension is passed more
  524. // than once, the latter will overwrite the former. Variants and Extensions are
  525. // accumulated, but if two extensions of the same type are passed, the latter
  526. // will replace the former. A Tag overwrites all former values and typically
  527. // only makes sense as the first argument. The resulting tag is returned after
  528. // canonicalizing using the Default CanonType. If one or more errors are
  529. // encountered, one of the errors is returned.
  530. func Compose(part ...interface{}) (t Tag, err error) {
  531. return Default.Compose(part...)
  532. }
  533. // Compose creates a Tag from individual parts, which may be of type Tag, Base,
  534. // Script, Region, Variant, []Variant, Extension, []Extension or error. If a
  535. // Base, Script or Region or slice of type Variant or Extension is passed more
  536. // than once, the latter will overwrite the former. Variants and Extensions are
  537. // accumulated, but if two extensions of the same type are passed, the latter
  538. // will replace the former. A Tag overwrites all former values and typically
  539. // only makes sense as the first argument. The resulting tag is returned after
  540. // canonicalizing using CanonType c. If one or more errors are encountered,
  541. // one of the errors is returned.
  542. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
  543. var b builder
  544. if err = b.update(part...); err != nil {
  545. return und, err
  546. }
  547. t, _ = b.tag.canonicalize(c)
  548. if len(b.ext) > 0 || len(b.variant) > 0 {
  549. sort.Sort(sortVariant(b.variant))
  550. sort.Strings(b.ext)
  551. if b.private != "" {
  552. b.ext = append(b.ext, b.private)
  553. }
  554. n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
  555. buf := make([]byte, n)
  556. p := t.genCoreBytes(buf)
  557. t.pVariant = byte(p)
  558. p += appendTokens(buf[p:], b.variant...)
  559. t.pExt = uint16(p)
  560. p += appendTokens(buf[p:], b.ext...)
  561. t.str = string(buf[:p])
  562. } else if b.private != "" {
  563. t.str = b.private
  564. t.remakeString()
  565. }
  566. return
  567. }
  568. type builder struct {
  569. tag Tag
  570. private string // the x extension
  571. ext []string
  572. variant []string
  573. err error
  574. }
  575. func (b *builder) addExt(e string) {
  576. if e == "" {
  577. } else if e[0] == 'x' {
  578. b.private = e
  579. } else {
  580. b.ext = append(b.ext, e)
  581. }
  582. }
  583. var errInvalidArgument = errors.New("invalid Extension or Variant")
  584. func (b *builder) update(part ...interface{}) (err error) {
  585. replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
  586. if s == "" {
  587. b.err = errInvalidArgument
  588. return true
  589. }
  590. for i, v := range *l {
  591. if eq(v, s) {
  592. (*l)[i] = s
  593. return true
  594. }
  595. }
  596. return false
  597. }
  598. for _, x := range part {
  599. switch v := x.(type) {
  600. case Tag:
  601. b.tag.lang = v.lang
  602. b.tag.region = v.region
  603. b.tag.script = v.script
  604. if v.str != "" {
  605. b.variant = nil
  606. for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
  607. x, s = nextToken(s)
  608. b.variant = append(b.variant, x)
  609. }
  610. b.ext, b.private = nil, ""
  611. for i, e := int(v.pExt), ""; i < len(v.str); {
  612. i, e = getExtension(v.str, i)
  613. b.addExt(e)
  614. }
  615. }
  616. case Base:
  617. b.tag.lang = v.langID
  618. case Script:
  619. b.tag.script = v.scriptID
  620. case Region:
  621. b.tag.region = v.regionID
  622. case Variant:
  623. if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
  624. b.variant = append(b.variant, v.variant)
  625. }
  626. case Extension:
  627. if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
  628. b.addExt(v.s)
  629. }
  630. case []Variant:
  631. b.variant = nil
  632. for _, x := range v {
  633. b.update(x)
  634. }
  635. case []Extension:
  636. b.ext, b.private = nil, ""
  637. for _, e := range v {
  638. b.update(e)
  639. }
  640. // TODO: support parsing of raw strings based on morphology or just extensions?
  641. case error:
  642. err = v
  643. }
  644. }
  645. return
  646. }
  // tokenLen returns the number of bytes needed to write all tokens, each
  // preceded by a one-byte separator.
  func tokenLen(token ...string) (n int) {
  	for i := range token {
  		n += 1 + len(token[i])
  	}
  	return n
  }
  // appendTokens writes each token to b, preceded by a '-', and returns the
  // number of bytes written. b must be large enough to hold the result;
  // tokenLen computes the required size.
  func appendTokens(b []byte, token ...string) int {
  	n := 0
  	for i := range token {
  		b[n] = '-'
  		copy(b[n+1:], token[i])
  		n += len(token[i]) + 1
  	}
  	return n
  }
  662. type sortVariant []string
  663. func (s sortVariant) Len() int {
  664. return len(s)
  665. }
  666. func (s sortVariant) Swap(i, j int) {
  667. s[j], s[i] = s[i], s[j]
  668. }
  669. func (s sortVariant) Less(i, j int) bool {
  670. return variantIndex[s[i]] < variantIndex[s[j]]
  671. }
  // findExt returns the index of the first extension in list whose first byte
  // (its singleton) is x, or -1 if there is none. Empty entries never match;
  // they are skipped instead of causing an index-out-of-range panic.
  func findExt(list []string, x byte) int {
  	for i, e := range list {
  		if len(e) > 0 && e[0] == x {
  			return i
  		}
  	}
  	return -1
  }
  680. // getExtension returns the name, body and end position of the extension.
  681. func getExtension(s string, p int) (end int, ext string) {
  682. if s[p] == '-' {
  683. p++
  684. }
  685. if s[p] == 'x' {
  686. return len(s), s[p:]
  687. }
  688. end = nextExtension(s, p)
  689. return end, s[p:end]
  690. }
  // nextExtension finds the next extension within the string, searching
  // for the -<char>- pattern from position p. It returns the position of the
  // leading '-' of that pattern, or len(s) if the pattern does not occur.
  // In the vast majority of cases, language tags will have at most
  // one extension and extensions tend to be small.
  func nextExtension(s string, p int) int {
  	limit := len(s) - 3
  	for p < limit {
  		switch {
  		case s[p] != '-':
  			p++
  		case s[p+2] == '-':
  			return p
  		default:
  			// The subtag after this '-' has at least two characters.
  			p += 3
  		}
  	}
  	return len(s)
  }
  708. var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
  709. // ParseAcceptLanguage parses the contents of a Accept-Language header as
  710. // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
  711. // a list of corresponding quality weights. It is more permissive than RFC 2616
  712. // and may return non-nil slices even if the input is not valid.
  713. // The Tags will be sorted by highest weight first and then by first occurrence.
  714. // Tags with a weight of zero will be dropped. An error will be returned if the
  715. // input could not be parsed.
  716. func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
  717. var entry string
  718. for s != "" {
  719. if entry, s = split(s, ','); entry == "" {
  720. continue
  721. }
  722. entry, weight := split(entry, ';')
  723. // Scan the language.
  724. t, err := Parse(entry)
  725. if err != nil {
  726. id, ok := acceptFallback[entry]
  727. if !ok {
  728. return nil, nil, err
  729. }
  730. t = Tag{lang: id}
  731. }
  732. // Scan the optional weight.
  733. w := 1.0
  734. if weight != "" {
  735. weight = consume(weight, 'q')
  736. weight = consume(weight, '=')
  737. // consume returns the empty string when a token could not be
  738. // consumed, resulting in an error for ParseFloat.
  739. if w, err = strconv.ParseFloat(weight, 32); err != nil {
  740. return nil, nil, errInvalidWeight
  741. }
  742. // Drop tags with a quality weight of 0.
  743. if w <= 0 {
  744. continue
  745. }
  746. }
  747. tag = append(tag, t)
  748. q = append(q, float32(w))
  749. }
  750. sortStable(&tagSort{tag, q})
  751. return tag, q, nil
  752. }
  // consume strips the leading byte c from s and returns the remainder with
  // surrounding white space trimmed. It returns the empty string if s does not
  // start with c.
  func consume(s string, c byte) string {
  	if len(s) > 0 && s[0] == c {
  		return strings.TrimSpace(s[1:])
  	}
  	return ""
  }
  // split cuts s at the first occurrence of c and returns the text before and
  // after it, each with surrounding white space trimmed. If c does not occur in
  // s, head is the trimmed s and tail is empty.
  func split(s string, c byte) (head, tail string) {
  	i := strings.IndexByte(s, c)
  	if i < 0 {
  		return strings.TrimSpace(s), ""
  	}
  	head = strings.TrimSpace(s[:i])
  	tail = strings.TrimSpace(s[i+1:])
  	return head, tail
  }
  767. // Add hack mapping to deal with a small number of cases that that occur
  768. // in Accept-Language (with reasonable frequency).
  769. var acceptFallback = map[string]langID{
  770. "english": _en,
  771. "deutsch": _de,
  772. "italian": _it,
  773. "french": _fr,
  774. "*": _mul, // defined in the spec to match all languages.
  775. }
  776. type tagSort struct {
  777. tag []Tag
  778. q []float32
  779. }
  780. func (s *tagSort) Len() int {
  781. return len(s.q)
  782. }
  783. func (s *tagSort) Less(i, j int) bool {
  784. return s.q[i] > s.q[j]
  785. }
  786. func (s *tagSort) Swap(i, j int) {
  787. s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
  788. s.q[i], s.q[j] = s.q[j], s.q[i]
  789. }