match.go 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import "errors"
  6. // Matcher is the interface that wraps the Match method.
  7. //
  8. // Match returns the best match for any of the given tags, along with
  9. // a unique index associated with the returned tag and a confidence
  10. // score.
  11. type Matcher interface {
  12. Match(t ...Tag) (tag Tag, index int, c Confidence)
  13. }
  14. // Comprehends reports the confidence score for a speaker of a given language
  15. // to being able to comprehend the written form of an alternative language.
  16. func Comprehends(speaker, alternative Tag) Confidence {
  17. _, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
  18. return c
  19. }
  20. // NewMatcher returns a Matcher that matches an ordered list of preferred tags
  21. // against a list of supported tags based on written intelligibility, closeness
  22. // of dialect, equivalence of subtags and various other rules. It is initialized
  23. // with the list of supported tags. The first element is used as the default
  24. // value in case no match is found.
  25. //
  26. // Its Match method matches the first of the given Tags to reach a certain
  27. // confidence threshold. The tags passed to Match should therefore be specified
  28. // in order of preference. Extensions are ignored for matching.
  29. //
  30. // The index returned by the Match method corresponds to the index of the
  31. // matched tag in t, but is augmented with the Unicode extension ('u')of the
  32. // corresponding preferred tag. This allows user locale options to be passed
  33. // transparently.
  34. func NewMatcher(t []Tag) Matcher {
  35. return newMatcher(t)
  36. }
  37. func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
  38. match, w, c := m.getBest(want...)
  39. if match == nil {
  40. t = m.default_.tag
  41. } else {
  42. t, index = match.tag, match.index
  43. }
  44. // Copy options from the user-provided tag into the result tag. This is hard
  45. // to do after the fact, so we do it here.
  46. // TODO: consider also adding in variants that are compatible with the
  47. // matched language.
  48. // TODO: Add back region if it is non-ambiguous? Or create another tag to
  49. // preserve the region?
  50. if u, ok := w.Extension('u'); ok {
  51. t, _ = Raw.Compose(t, u)
  52. }
  53. return t, index, c
  54. }
  55. type scriptRegionFlags uint8
  56. const (
  57. isList = 1 << iota
  58. scriptInFrom
  59. regionInFrom
  60. )
  61. func (t *Tag) setUndefinedLang(id langID) {
  62. if t.lang == 0 {
  63. t.lang = id
  64. }
  65. }
  66. func (t *Tag) setUndefinedScript(id scriptID) {
  67. if t.script == 0 {
  68. t.script = id
  69. }
  70. }
  71. func (t *Tag) setUndefinedRegion(id regionID) {
  72. if t.region == 0 || t.region.contains(id) {
  73. t.region = id
  74. }
  75. }
  76. // ErrMissingLikelyTagsData indicates no information was available
  77. // to compute likely values of missing tags.
  78. var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
  79. // addLikelySubtags sets subtags to their most likely value, given the locale.
  80. // In most cases this means setting fields for unknown values, but in some
  81. // cases it may alter a value. It returns a ErrMissingLikelyTagsData error
  82. // if the given locale cannot be expanded.
  83. func (t Tag) addLikelySubtags() (Tag, error) {
  84. id, err := addTags(t)
  85. if err != nil {
  86. return t, err
  87. } else if id.equalTags(t) {
  88. return t, nil
  89. }
  90. id.remakeString()
  91. return id, nil
  92. }
  93. // specializeRegion attempts to specialize a group region.
  94. func specializeRegion(t *Tag) bool {
  95. if i := regionInclusion[t.region]; i < nRegionGroups {
  96. x := likelyRegionGroup[i]
  97. if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
  98. t.region = regionID(x.region)
  99. }
  100. return true
  101. }
  102. return false
  103. }
  104. func addTags(t Tag) (Tag, error) {
  105. // We leave private use identifiers alone.
  106. if t.private() {
  107. return t, nil
  108. }
  109. if t.script != 0 && t.region != 0 {
  110. if t.lang != 0 {
  111. // already fully specified
  112. specializeRegion(&t)
  113. return t, nil
  114. }
  115. // Search matches for und-script-region. Note that for these cases
  116. // region will never be a group so there is no need to check for this.
  117. list := likelyRegion[t.region : t.region+1]
  118. if x := list[0]; x.flags&isList != 0 {
  119. list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
  120. }
  121. for _, x := range list {
  122. // Deviating from the spec. See match_test.go for details.
  123. if scriptID(x.script) == t.script {
  124. t.setUndefinedLang(langID(x.lang))
  125. return t, nil
  126. }
  127. }
  128. }
  129. if t.lang != 0 {
  130. // Search matches for lang-script and lang-region, where lang != und.
  131. if t.lang < langNoIndexOffset {
  132. x := likelyLang[t.lang]
  133. if x.flags&isList != 0 {
  134. list := likelyLangList[x.region : x.region+uint16(x.script)]
  135. if t.script != 0 {
  136. for _, x := range list {
  137. if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
  138. t.setUndefinedRegion(regionID(x.region))
  139. return t, nil
  140. }
  141. }
  142. } else if t.region != 0 {
  143. count := 0
  144. goodScript := true
  145. tt := t
  146. for _, x := range list {
  147. // We visit all entries for which the script was not
  148. // defined, including the ones where the region was not
  149. // defined. This allows for proper disambiguation within
  150. // regions.
  151. if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
  152. tt.region = regionID(x.region)
  153. tt.setUndefinedScript(scriptID(x.script))
  154. goodScript = goodScript && tt.script == scriptID(x.script)
  155. count++
  156. }
  157. }
  158. if count == 1 {
  159. return tt, nil
  160. }
  161. // Even if we fail to find a unique Region, we might have
  162. // an unambiguous script.
  163. if goodScript {
  164. t.script = tt.script
  165. }
  166. }
  167. }
  168. }
  169. } else {
  170. // Search matches for und-script.
  171. if t.script != 0 {
  172. x := likelyScript[t.script]
  173. if x.region != 0 {
  174. t.setUndefinedRegion(regionID(x.region))
  175. t.setUndefinedLang(langID(x.lang))
  176. return t, nil
  177. }
  178. }
  179. // Search matches for und-region. If und-script-region exists, it would
  180. // have been found earlier.
  181. if t.region != 0 {
  182. if i := regionInclusion[t.region]; i < nRegionGroups {
  183. x := likelyRegionGroup[i]
  184. if x.region != 0 {
  185. t.setUndefinedLang(langID(x.lang))
  186. t.setUndefinedScript(scriptID(x.script))
  187. t.region = regionID(x.region)
  188. }
  189. } else {
  190. x := likelyRegion[t.region]
  191. if x.flags&isList != 0 {
  192. x = likelyRegionList[x.lang]
  193. }
  194. if x.script != 0 && x.flags != scriptInFrom {
  195. t.setUndefinedLang(langID(x.lang))
  196. t.setUndefinedScript(scriptID(x.script))
  197. return t, nil
  198. }
  199. }
  200. }
  201. }
  202. // Search matches for lang.
  203. if t.lang < langNoIndexOffset {
  204. x := likelyLang[t.lang]
  205. if x.flags&isList != 0 {
  206. x = likelyLangList[x.region]
  207. }
  208. if x.region != 0 {
  209. t.setUndefinedScript(scriptID(x.script))
  210. t.setUndefinedRegion(regionID(x.region))
  211. }
  212. specializeRegion(&t)
  213. if t.lang == 0 {
  214. t.lang = _en // default language
  215. }
  216. return t, nil
  217. }
  218. return t, ErrMissingLikelyTagsData
  219. }
  220. func (t *Tag) setTagsFrom(id Tag) {
  221. t.lang = id.lang
  222. t.script = id.script
  223. t.region = id.region
  224. }
  225. // minimize removes the region or script subtags from t such that
  226. // t.addLikelySubtags() == t.minimize().addLikelySubtags().
  227. func (t Tag) minimize() (Tag, error) {
  228. t, err := minimizeTags(t)
  229. if err != nil {
  230. return t, err
  231. }
  232. t.remakeString()
  233. return t, nil
  234. }
  235. // minimizeTags mimics the behavior of the ICU 51 C implementation.
  236. func minimizeTags(t Tag) (Tag, error) {
  237. if t.equalTags(und) {
  238. return t, nil
  239. }
  240. max, err := addTags(t)
  241. if err != nil {
  242. return t, err
  243. }
  244. for _, id := range [...]Tag{
  245. {lang: t.lang},
  246. {lang: t.lang, region: t.region},
  247. {lang: t.lang, script: t.script},
  248. } {
  249. if x, err := addTags(id); err == nil && max.equalTags(x) {
  250. t.setTagsFrom(id)
  251. break
  252. }
  253. }
  254. return t, nil
  255. }
  256. // Tag Matching
  257. // CLDR defines an algorithm for finding the best match between two sets of language
  258. // tags. The basic algorithm defines how to score a possible match and then find
  259. // the match with the best score
  260. // (see http://www.unicode.org/reports/tr35/#LanguageMatching).
  261. // Using scoring has several disadvantages. The scoring obfuscates the importance of
  262. // the various factors considered, making the algorithm harder to understand. Using
  263. // scoring also requires the full score to be computed for each pair of tags.
  264. //
  265. // We will use a different algorithm which aims to have the following properties:
  266. // - clarity on the precedence of the various selection factors, and
  267. // - improved performance by allowing early termination of a comparison.
  268. //
  269. // Matching algorithm (overview)
  270. // Input:
  271. // - supported: a set of supported tags
  272. // - default: the default tag to return in case there is no match
  273. // - desired: list of desired tags, ordered by preference, starting with
  274. // the most-preferred.
  275. //
  276. // Algorithm:
  277. // 1) Set the best match to the lowest confidence level
  278. // 2) For each tag in "desired":
  279. // a) For each tag in "supported":
  280. // 1) compute the match between the two tags.
  281. // 2) if the match is better than the previous best match, replace it
  282. // with the new match. (see next section)
  283. // b) if the current best match is above a certain threshold, return this
  284. // match without proceeding to the next tag in "desired". [See Note 1]
  285. // 3) If the best match so far is below a certain threshold, return "default".
  286. //
  287. // Ranking:
  288. // We use two phases to determine whether one pair of tags are a better match
  289. // than another pair of tags. First, we determine a rough confidence level. If the
  290. // levels are different, the one with the highest confidence wins.
  291. // Second, if the rough confidence levels are identical, we use a set of tie-breaker
  292. // rules.
  293. //
  294. // The confidence level of matching a pair of tags is determined by finding the
  295. // lowest confidence level of any matches of the corresponding subtags (the
  296. // result is deemed as good as its weakest link).
  297. // We define the following levels:
  298. // Exact - An exact match of a subtag, before adding likely subtags.
  299. // MaxExact - An exact match of a subtag, after adding likely subtags.
  300. // [See Note 2].
  301. // High - High level of mutual intelligibility between different subtag
  302. // variants.
  303. // Low - Low level of mutual intelligibility between different subtag
  304. // variants.
  305. // No - No mutual intelligibility.
  306. //
  307. // The following levels can occur for each type of subtag:
  308. // Base: Exact, MaxExact, High, Low, No
  309. // Script: Exact, MaxExact [see Note 3], Low, No
  310. // Region: Exact, MaxExact, High
  311. // Variant: Exact, High
  312. // Private: Exact, No
  313. //
  314. // Any result with a confidence level of Low or higher is deemed a possible match.
  315. // Once a desired tag matches any of the supported tags with a level of MaxExact
  316. // or higher, the next desired tag is not considered (see Step 2.b).
  317. // Note that CLDR provides languageMatching data that defines close equivalence
  318. // classes for base languages, scripts and regions.
  319. //
  320. // Tie-breaking
  321. // If we get the same confidence level for two matches, we apply a sequence of
  322. // tie-breaking rules. The first that succeeds defines the result. The rules are
  323. // applied in the following order.
  324. // 1) Original language was defined and was identical.
  325. // 2) Original region was defined and was identical.
  326. // 3) Distance between two maximized regions was the smallest.
  327. // 4) Original script was defined and was identical.
  328. // 5) Distance from want tag to have tag using the parent relation [see Note 5.]
  329. // If there is still no winner after these rules are applied, the first match
  330. // found wins.
  331. //
  332. // Notes:
  333. // [1] Note that even if we may not have a perfect match, if a match is above a
  334. // certain threshold, it is considered a better match than any other match
  335. // to a tag later in the list of preferred language tags.
  336. // [2] In practice, as matching of Exact is done in a separate phase from
  337. // matching the other levels, we reuse the Exact level to mean MaxExact in
  338. // the second phase. As a consequence, we only need the levels defined by
  339. // the Confidence type. The MaxExact confidence level is mapped to High in
  340. // the public API.
  341. // [3] We do not differentiate between maximized script values that were derived
  342. // from suppressScript versus most likely tag data. We determined that in
  343. // ranking the two, one ranks just after the other. Moreover, the two cannot
  344. // occur concurrently. As a consequence, they are identical for practical
  345. // purposes.
  346. // [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
  347. // the MaxExact level to allow iw vs he to still be a closer match than
  348. // en-AU vs en-US, for example.
  349. // [5] In CLDR a locale inherits fields that are unspecified for this locale
  350. // from its parent. Therefore, if a locale is a parent of another locale,
  351. // it is a strong measure for closeness, especially when no other tie
  352. // breaker rule applies. One could also argue it is inconsistent, for
  353. // example, when pt-AO matches pt (which CLDR equates with pt-BR), even
  354. // though its parent is pt-PT according to the inheritance rules.
  355. //
  356. // Implementation Details:
  357. // There are several performance considerations worth pointing out. Most notably,
  358. // we preprocess as much as possible (within reason) at the time of creation of a
  359. // matcher. This includes:
  360. // - creating a per-language map, which includes data for the raw base language
  361. // and its canonicalized variant (if applicable),
  362. // - expanding entries for the equivalence classes defined in CLDR's
  363. // languageMatch data.
  364. // The per-language map ensures that typically only a very small number of tags
  365. // need to be considered. The pre-expansion of canonicalized subtags and
  366. // equivalence classes reduces the amount of map lookups that need to be done at
  367. // runtime.
  368. // matcher keeps a set of supported language tags, indexed by language.
  369. type matcher struct {
  370. default_ *haveTag
  371. index map[langID]*matchHeader
  372. passSettings bool
  373. }
  374. // matchHeader has the lists of tags for exact matches and matches based on
  375. // maximized and canonicalized tags for a given language.
  376. type matchHeader struct {
  377. exact []*haveTag
  378. max []*haveTag
  379. }
  380. // haveTag holds a supported Tag and its maximized script and region. The maximized
  381. // or canonicalized language is not stored as it is not needed during matching.
  382. type haveTag struct {
  383. tag Tag
  384. // index of this tag in the original list of supported tags.
  385. index int
  386. // conf is the maximum confidence that can result from matching this haveTag.
  387. // When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
  388. conf Confidence
  389. // Maximized region and script.
  390. maxRegion regionID
  391. maxScript scriptID
  392. // altScript may be checked as an alternative match to maxScript. If altScript
  393. // matches, the confidence level for this match is Low. Theoretically there
  394. // could be multiple alternative scripts. This does not occur in practice.
  395. altScript scriptID
  396. // nextMax is the index of the next haveTag with the same maximized tags.
  397. nextMax uint16
  398. }
  399. func makeHaveTag(tag Tag, index int) (haveTag, langID) {
  400. max := tag
  401. if tag.lang != 0 {
  402. max, _ = max.canonicalize(All)
  403. max, _ = addTags(max)
  404. max.remakeString()
  405. }
  406. return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
  407. }
  408. // altScript returns an alternative script that may match the given script with
  409. // a low confidence. At the moment, the langMatch data allows for at most one
  410. // script to map to another and we rely on this to keep the code simple.
  411. func altScript(l langID, s scriptID) scriptID {
  412. for _, alt := range matchScript {
  413. if (alt.lang == 0 || langID(alt.lang) == l) && scriptID(alt.have) == s {
  414. return scriptID(alt.want)
  415. }
  416. }
  417. return 0
  418. }
  419. // addIfNew adds a haveTag to the list of tags only if it is a unique tag.
  420. // Tags that have the same maximized values are linked by index.
  421. func (h *matchHeader) addIfNew(n haveTag, exact bool) {
  422. // Don't add new exact matches.
  423. for _, v := range h.exact {
  424. if v.tag.equalsRest(n.tag) {
  425. return
  426. }
  427. }
  428. if exact {
  429. h.exact = append(h.exact, &n)
  430. }
  431. // Allow duplicate maximized tags, but create a linked list to allow quickly
  432. // comparing the equivalents and bail out.
  433. for i, v := range h.max {
  434. if v.maxScript == n.maxScript &&
  435. v.maxRegion == n.maxRegion &&
  436. v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
  437. for h.max[i].nextMax != 0 {
  438. i = int(h.max[i].nextMax)
  439. }
  440. h.max[i].nextMax = uint16(len(h.max))
  441. break
  442. }
  443. }
  444. h.max = append(h.max, &n)
  445. }
  446. // header returns the matchHeader for the given language. It creates one if
  447. // it doesn't already exist.
  448. func (m *matcher) header(l langID) *matchHeader {
  449. if h := m.index[l]; h != nil {
  450. return h
  451. }
  452. h := &matchHeader{}
  453. m.index[l] = h
  454. return h
  455. }
  456. // newMatcher builds an index for the given supported tags and returns it as
  457. // a matcher. It also expands the index by considering various equivalence classes
  458. // for a given tag.
  459. func newMatcher(supported []Tag) *matcher {
  460. m := &matcher{
  461. index: make(map[langID]*matchHeader),
  462. }
  463. if len(supported) == 0 {
  464. m.default_ = &haveTag{}
  465. return m
  466. }
  467. // Add supported languages to the index. Add exact matches first to give
  468. // them precedence.
  469. for i, tag := range supported {
  470. pair, _ := makeHaveTag(tag, i)
  471. m.header(tag.lang).addIfNew(pair, true)
  472. }
  473. m.default_ = m.header(supported[0].lang).exact[0]
  474. for i, tag := range supported {
  475. pair, max := makeHaveTag(tag, i)
  476. if max != tag.lang {
  477. m.header(max).addIfNew(pair, false)
  478. }
  479. }
  480. // update is used to add indexes in the map for equivalent languages.
  481. // If force is true, the update will also apply to derived entries. To
  482. // avoid applying a "transitive closure", use false.
  483. update := func(want, have uint16, conf Confidence, force bool) {
  484. if hh := m.index[langID(have)]; hh != nil {
  485. if !force && len(hh.exact) == 0 {
  486. return
  487. }
  488. hw := m.header(langID(want))
  489. for _, ht := range hh.max {
  490. v := *ht
  491. if conf < v.conf {
  492. v.conf = conf
  493. }
  494. v.nextMax = 0 // this value needs to be recomputed
  495. if v.altScript != 0 {
  496. v.altScript = altScript(langID(want), v.maxScript)
  497. }
  498. hw.addIfNew(v, conf == Exact && len(hh.exact) > 0)
  499. }
  500. }
  501. }
  502. // Add entries for languages with mutual intelligibility as defined by CLDR's
  503. // languageMatch data.
  504. for _, ml := range matchLang {
  505. update(ml.want, ml.have, Confidence(ml.conf), false)
  506. if !ml.oneway {
  507. update(ml.have, ml.want, Confidence(ml.conf), false)
  508. }
  509. }
  510. // Add entries for possible canonicalizations. This is an optimization to
  511. // ensure that only one map lookup needs to be done at runtime per desired tag.
  512. // First we match deprecated equivalents. If they are perfect equivalents
  513. // (their canonicalization simply substitutes a different language code, but
  514. // nothing else), the match confidence is Exact, otherwise it is High.
  515. for i, lm := range langAliasMap {
  516. if lm.from == _sh {
  517. continue
  518. }
  519. // If deprecated codes match and there is no fiddling with the script or
  520. // or region, we consider it an exact match.
  521. conf := Exact
  522. if langAliasTypes[i] != langMacro {
  523. if !isExactEquivalent(langID(lm.from)) {
  524. conf = High
  525. }
  526. update(lm.to, lm.from, conf, true)
  527. }
  528. update(lm.from, lm.to, conf, true)
  529. }
  530. return m
  531. }
  532. // getBest gets the best matching tag in m for any of the given tags, taking into
  533. // account the order of preference of the given tags.
  534. func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
  535. best := bestMatch{}
  536. for _, w := range want {
  537. var max Tag
  538. // Check for exact match first.
  539. h := m.index[w.lang]
  540. if w.lang != 0 {
  541. // Base language is defined.
  542. if h == nil {
  543. continue
  544. }
  545. for i := range h.exact {
  546. have := h.exact[i]
  547. if have.tag.equalsRest(w) {
  548. return have, w, Exact
  549. }
  550. }
  551. max, _ = w.canonicalize(Legacy | Deprecated)
  552. max, _ = addTags(max)
  553. } else {
  554. // Base language is not defined.
  555. if h != nil {
  556. for i := range h.exact {
  557. have := h.exact[i]
  558. if have.tag.equalsRest(w) {
  559. return have, w, Exact
  560. }
  561. }
  562. }
  563. if w.script == 0 && w.region == 0 {
  564. // We skip all tags matching und for approximate matching, including
  565. // private tags.
  566. continue
  567. }
  568. max, _ = addTags(w)
  569. if h = m.index[max.lang]; h == nil {
  570. continue
  571. }
  572. }
  573. // Check for match based on maximized tag.
  574. for i := range h.max {
  575. have := h.max[i]
  576. best.update(have, w, max.script, max.region)
  577. if best.conf == Exact {
  578. for have.nextMax != 0 {
  579. have = h.max[have.nextMax]
  580. best.update(have, w, max.script, max.region)
  581. }
  582. return best.have, best.want, High
  583. }
  584. }
  585. }
  586. if best.conf <= No {
  587. if len(want) != 0 {
  588. return nil, want[0], No
  589. }
  590. return nil, Tag{}, No
  591. }
  592. return best.have, best.want, best.conf
  593. }
  594. // bestMatch accumulates the best match so far.
  595. type bestMatch struct {
  596. have *haveTag
  597. want Tag
  598. conf Confidence
  599. // Cached results from applying tie-breaking rules.
  600. origLang bool
  601. origReg bool
  602. regDist uint8
  603. origScript bool
  604. parentDist uint8 // 255 if have is not an ancestor of want tag.
  605. }
  606. // update updates the existing best match if the new pair is considered to be a
  607. // better match.
  608. // To determine if the given pair is a better match, it first computes the rough
  609. // confidence level. If this surpasses the current match, it will replace it and
  610. // update the tie-breaker rule cache. If there is a tie, it proceeds with applying
  611. // a series of tie-breaker rules. If there is no conclusive winner after applying
  612. // the tie-breaker rules, it leaves the current match as the preferred match.
  613. func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID) {
  614. // Bail if the maximum attainable confidence is below that of the current best match.
  615. c := have.conf
  616. if c < m.conf {
  617. return
  618. }
  619. if have.maxScript != maxScript {
  620. // There is usually very little comprehension between different scripts.
  621. // In a few cases there may still be Low comprehension. This possibility is
  622. // pre-computed and stored in have.altScript.
  623. if Low < m.conf || have.altScript != maxScript {
  624. return
  625. }
  626. c = Low
  627. } else if have.maxRegion != maxRegion {
  628. // There is usually a small difference between languages across regions.
  629. // We use the region distance (below) to disambiguate between equal matches.
  630. if High < c {
  631. c = High
  632. }
  633. }
  634. // We store the results of the computations of the tie-breaker rules along
  635. // with the best match. There is no need to do the checks once we determine
  636. // we have a winner, but we do still need to do the tie-breaker computations.
  637. // We use "beaten" to keep track if we still need to do the checks.
  638. beaten := false // true if the new pair defeats the current one.
  639. if c != m.conf {
  640. if c < m.conf {
  641. return
  642. }
  643. beaten = true
  644. }
  645. // Tie-breaker rules:
  646. // We prefer if the pre-maximized language was specified and identical.
  647. origLang := have.tag.lang == tag.lang && tag.lang != 0
  648. if !beaten && m.origLang != origLang {
  649. if m.origLang {
  650. return
  651. }
  652. beaten = true
  653. }
  654. // We prefer if the pre-maximized region was specified and identical.
  655. origReg := have.tag.region == tag.region && tag.region != 0
  656. if !beaten && m.origReg != origReg {
  657. if m.origReg {
  658. return
  659. }
  660. beaten = true
  661. }
  662. // Next we prefer smaller distances between regions, as defined by regionDist.
  663. regDist := regionDist(have.maxRegion, maxRegion, tag.lang)
  664. if !beaten && m.regDist != regDist {
  665. if regDist > m.regDist {
  666. return
  667. }
  668. beaten = true
  669. }
  670. // Next we prefer if the pre-maximized script was specified and identical.
  671. origScript := have.tag.script == tag.script && tag.script != 0
  672. if !beaten && m.origScript != origScript {
  673. if m.origScript {
  674. return
  675. }
  676. beaten = true
  677. }
  678. // Finally we prefer tags which have a closer parent relationship.
  679. parentDist := parentDistance(have.tag.region, tag)
  680. if !beaten && m.parentDist != parentDist {
  681. if parentDist > m.parentDist {
  682. return
  683. }
  684. beaten = true
  685. }
  686. // Update m to the newly found best match.
  687. if beaten {
  688. m.have = have
  689. m.want = tag
  690. m.conf = c
  691. m.origLang = origLang
  692. m.origReg = origReg
  693. m.origScript = origScript
  694. m.regDist = regDist
  695. m.parentDist = parentDist
  696. }
  697. }
  698. // parentDistance returns the number of times Parent must be called before the
  699. // regions match. It is assumed that it has already been checked that lang and
  700. // script are identical. If haveRegion does not occur in the ancestor chain of
  701. // tag, it returns 255.
  702. func parentDistance(haveRegion regionID, tag Tag) uint8 {
  703. p := tag.Parent()
  704. d := uint8(1)
  705. for haveRegion != p.region {
  706. if p.region == 0 {
  707. return 255
  708. }
  709. p = p.Parent()
  710. d++
  711. }
  712. return d
  713. }
  714. // regionDist wraps regionDistance with some exceptions to the algorithmic distance.
  715. func regionDist(a, b regionID, lang langID) uint8 {
  716. if lang == _en {
  717. // Two variants of non-US English are close to each other, regardless of distance.
  718. if a != _US && b != _US {
  719. return 2
  720. }
  721. }
  722. return uint8(regionDistance(a, b))
  723. }
  724. // regionDistance computes the distance between two regions based on the
  725. // distance in the graph of region containments as defined in CLDR. It iterates
  726. // over increasingly inclusive sets of groups, represented as bit vectors, until
  727. // the source bit vector has bits in common with the destination vector.
  728. func regionDistance(a, b regionID) int {
  729. if a == b {
  730. return 0
  731. }
  732. p, q := regionInclusion[a], regionInclusion[b]
  733. if p < nRegionGroups {
  734. p, q = q, p
  735. }
  736. set := regionInclusionBits
  737. if q < nRegionGroups && set[p]&(1<<q) != 0 {
  738. return 1
  739. }
  740. d := 2
  741. for goal := set[q]; set[p]&goal == 0; p = regionInclusionNext[p] {
  742. d++
  743. }
  744. return d
  745. }
  746. func (t Tag) variants() string {
  747. if t.pVariant == 0 {
  748. return ""
  749. }
  750. return t.str[t.pVariant:t.pExt]
  751. }
  752. // variantOrPrivateTagStr returns variants or private use tags.
  753. func (t Tag) variantOrPrivateTagStr() string {
  754. if t.pExt > 0 {
  755. return t.str[t.pVariant:t.pExt]
  756. }
  757. return t.str[t.pVariant:]
  758. }
  759. // equalsRest compares everything except the language.
  760. func (a Tag) equalsRest(b Tag) bool {
  761. // TODO: don't include extensions in this comparison. To do this efficiently,
  762. // though, we should handle private tags separately.
  763. return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
  764. }
  765. // isExactEquivalent returns true if canonicalizing the language will not alter
  766. // the script or region of a tag.
  767. func isExactEquivalent(l langID) bool {
  768. for _, o := range notEquivalent {
  769. if o == l {
  770. return false
  771. }
  772. }
  773. return true
  774. }
  775. var notEquivalent []langID
  776. func init() {
  777. // Create a list of all languages for which canonicalization may alter the
  778. // script or region.
  779. for _, lm := range langAliasMap {
  780. tag := Tag{lang: langID(lm.from)}
  781. if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
  782. notEquivalent = append(notEquivalent, langID(lm.from))
  783. }
  784. }
  785. }