language.go 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go gen_common.go -output tables.go
  5. //go:generate go run gen_index.go
  6. package language
  7. // TODO: Remove above NOTE after:
  8. // - verifying that tables are dropped correctly (most notably matcher tables).
  9. import (
  10. "errors"
  11. "fmt"
  12. "strings"
  13. )
  14. const (
  15. // maxCoreSize is the maximum size of a BCP 47 tag without variants and
  16. // extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
  17. maxCoreSize = 12
  18. // max99thPercentileSize is a somewhat arbitrary buffer size that presumably
  19. // is large enough to hold at least 99% of the BCP 47 tags.
  20. max99thPercentileSize = 32
  21. // maxSimpleUExtensionSize is the maximum size of a -u extension with one
  22. // key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
  23. maxSimpleUExtensionSize = 14
  24. )
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
//
// The language, script and region subtags are stored as compact numeric IDs.
// Variants and extensions, when present, are kept only in textual form inside
// str and are located through the pVariant and pExt offsets.
type Tag struct {
	lang   langID
	region regionID
	// TODO: we will soon run out of positions for script. Idea: instead of
	// storing lang, region, and script codes, store only the compact index and
	// have a lookup table from this code to its expansion. This greatly speeds
	// up table lookup and speeds up the common variant cases.
	// This will also immediately free up 3 extra bytes. Also, the pVariant
	// field can now be moved to the lookup table, as the compact index uniquely
	// determines the offset of a possible variant.
	script   scriptID
	pVariant byte   // offset in str, includes preceding '-'
	pExt     uint16 // offset of first extension, includes preceding '-'

	// str is the string representation of the Tag. It will only be used if the
	// tag has variants or extensions.
	str string
}
  45. // Make is a convenience wrapper for Parse that omits the error.
  46. // In case of an error, a sensible default is returned.
  47. func Make(s string) Tag {
  48. return Default.Make(s)
  49. }
  50. // Make is a convenience wrapper for c.Parse that omits the error.
  51. // In case of an error, a sensible default is returned.
  52. func (c CanonType) Make(s string) Tag {
  53. t, _ := c.Parse(s)
  54. return t
  55. }
  56. // Raw returns the raw base language, script and region, without making an
  57. // attempt to infer their values.
  58. func (t Tag) Raw() (b Base, s Script, r Region) {
  59. return Base{t.lang}, Script{t.script}, Region{t.region}
  60. }
  61. // equalTags compares language, script and region subtags only.
  62. func (t Tag) equalTags(a Tag) bool {
  63. return t.lang == a.lang && t.script == a.script && t.region == a.region
  64. }
  65. // IsRoot returns true if t is equal to language "und".
  66. func (t Tag) IsRoot() bool {
  67. if int(t.pVariant) < len(t.str) {
  68. return false
  69. }
  70. return t.equalTags(und)
  71. }
  72. // private reports whether the Tag consists solely of a private use tag.
  73. func (t Tag) private() bool {
  74. return t.str != "" && t.pVariant == 0
  75. }
// CanonType can be used to enable or disable various types of canonicalization.
// It is a bit set: the individual flags below may be combined with |.
type CanonType int

const (
	// Replace deprecated base languages with their preferred replacements.
	DeprecatedBase CanonType = 1 << iota
	// Replace deprecated scripts with their preferred replacements.
	DeprecatedScript
	// Replace deprecated regions with their preferred replacements.
	DeprecatedRegion
	// Remove redundant scripts.
	SuppressScript
	// Normalize legacy encodings. This includes legacy languages defined in
	// CLDR as well as bibliographic codes defined in ISO-639.
	Legacy
	// Map the dominant language of a macro language group to the macro language
	// subtag. For example cmn -> zh.
	Macro
	// The CLDR flag should be used if full compatibility with CLDR is required.
	// There are a few cases where language.Tag may differ from CLDR. To follow all
	// of CLDR's suggestions, use All|CLDR.
	CLDR

	// Raw can be used to Compose or Parse without Canonicalization.
	Raw CanonType = 0

	// Replace all deprecated tags with their preferred replacements.
	Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion

	// All canonicalizations recommended by BCP 47.
	BCP47 = Deprecated | SuppressScript

	// All canonicalizations. Note that this does not include the CLDR flag.
	All = BCP47 | Legacy | Macro

	// Default is the canonicalization used by Parse, Make and Compose. To
	// preserve as much information as possible, canonicalizations that remove
	// potentially valuable information are not included. The Matcher is
	// designed to recognize similar tags that would be the same if
	// they were canonicalized using All.
	Default = Deprecated | Legacy

	// canonLang is the set of flags that can alter the base language; it is
	// used to decide whether the language normalization step needs to run.
	canonLang = DeprecatedBase | Legacy | Macro

	// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
//
// Only the lang, script and region fields of the returned copy are updated;
// t.str is not regenerated here. The steps run in a fixed order: script
// suppression, base language normalization, deprecated script replacement and
// finally deprecated region replacement. Each step runs only when the
// corresponding flag is set in c.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
	if c == Raw {
		return t, false
	}
	changed := false
	if c&SuppressScript != 0 {
		// Drop the script when it equals the suppress-script entry recorded
		// for the language. Only languages below langNoIndexOffset have an
		// entry in the table.
		if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
			t.script = 0
			changed = true
		}
	}
	if c&canonLang != 0 {
		// The loop repeats only after a deprecated-language replacement, so
		// that the replacement itself can be normalized further.
		for {
			if l, aliasType := normLang(t.lang); l != t.lang {
				switch aliasType {
				case langLegacy:
					if c&Legacy != 0 {
						// "sh" without an explicit script gets Latn before the
						// language is replaced.
						if t.lang == _sh && t.script == 0 {
							t.script = _Latn
						}
						t.lang = l
						changed = true
					}
				case langMacro:
					if c&Macro != 0 {
						// We deviate here from CLDR. The mapping "nb" -> "no"
						// qualifies as a typical Macro language mapping.  However,
						// for legacy reasons, CLDR maps "no", the macro language
						// code for Norwegian, to the dominant variant "nb". This
						// change is currently under consideration for CLDR as well.
						// See http://unicode.org/cldr/trac/ticket/2698 and also
						// http://unicode.org/cldr/trac/ticket/1790 for some of the
						// practical implications. TODO: this check could be removed
						// if CLDR adopts this change.
						if c&CLDR == 0 || t.lang != _nb {
							changed = true
							t.lang = l
						}
					}
				case langDeprecated:
					if c&DeprecatedBase != 0 {
						// "mo" without an explicit region gets MD before the
						// language is replaced.
						if t.lang == _mo && t.region == 0 {
							t.region = _MD
						}
						t.lang = l
						changed = true
						// Other canonicalization types may still apply.
						continue
					}
				}
			} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
				// In CLDR mode with Legacy enabled, "no" is mapped to "nb".
				t.lang = _nb
				changed = true
			}
			break
		}
	}
	if c&DeprecatedScript != 0 {
		// Qaai is the only script replaced here; it becomes Zinh.
		if t.script == _Qaai {
			changed = true
			t.script = _Zinh
		}
	}
	if c&DeprecatedRegion != 0 {
		// normRegion returns 0 when there is no replacement for the region.
		if r := normRegion(t.region); r != 0 {
			changed = true
			t.region = r
		}
	}
	return t, changed
}
  187. // Canonicalize returns the canonicalized equivalent of the tag.
  188. func (c CanonType) Canonicalize(t Tag) (Tag, error) {
  189. t, changed := t.canonicalize(c)
  190. if changed {
  191. t.remakeString()
  192. }
  193. return t, nil
  194. }
  195. // Confidence indicates the level of certainty for a given return value.
  196. // For example, Serbian may be written in Cyrillic or Latin script.
  197. // The confidence level indicates whether a value was explicitly specified,
  198. // whether it is typically the only possible value, or whether there is
  199. // an ambiguity.
  200. type Confidence int
  201. const (
  202. No Confidence = iota // full confidence that there was no match
  203. Low // most likely value picked out of a set of alternatives
  204. High // value is generally assumed to be the correct match
  205. Exact // exact match or explicitly specified value
  206. )
  207. var confName = []string{"No", "Low", "High", "Exact"}
  208. func (c Confidence) String() string {
  209. return confName[c]
  210. }
// remakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
//
// Tags with an empty str have no variants or extensions and need no update.
// Otherwise the part of str from pVariant on is kept, a fresh core
// (language, script, region) is generated in front of it, and both offsets
// are shifted by the change in core length.
func (t *Tag) remakeString() {
	if t.str == "" {
		return
	}
	// extra is everything after the core, without its leading '-'.
	extra := t.str[t.pVariant:]
	if t.pVariant > 0 {
		extra = extra[1:]
	}
	// If the core now equals that of "und" and the remainder starts with
	// "x-", the string becomes the remainder alone, with both offsets at zero.
	if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
		t.str = extra
		t.pVariant = 0
		t.pExt = 0
		return
	}
	var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
	b := buf[:t.genCoreBytes(buf[:])]
	if extra != "" {
		// diff is how far the variant/extension section moved because the
		// length of the core changed.
		diff := len(b) - int(t.pVariant)
		b = append(b, '-')
		b = append(b, extra...)
		t.pVariant = uint8(int(t.pVariant) + diff)
		t.pExt = uint16(int(t.pExt) + diff)
	} else {
		// Nothing follows the core: both offsets point at the end of the string.
		t.pVariant = uint8(len(b))
		t.pExt = uint16(len(b))
	}
	t.str = string(b)
}
  242. // genCoreBytes writes a string for the base languages, script and region tags
  243. // to the given buffer and returns the number of bytes written. It will never
  244. // write more than maxCoreSize bytes.
  245. func (t *Tag) genCoreBytes(buf []byte) int {
  246. n := t.lang.stringToBuf(buf[:])
  247. if t.script != 0 {
  248. n += copy(buf[n:], "-")
  249. n += copy(buf[n:], t.script.String())
  250. }
  251. if t.region != 0 {
  252. n += copy(buf[n:], "-")
  253. n += copy(buf[n:], t.region.String())
  254. }
  255. return n
  256. }
  257. // String returns the canonical string representation of the language tag.
  258. func (t Tag) String() string {
  259. if t.str != "" {
  260. return t.str
  261. }
  262. if t.script == 0 && t.region == 0 {
  263. return t.lang.String()
  264. }
  265. buf := [maxCoreSize]byte{}
  266. return string(buf[:t.genCoreBytes(buf[:])])
  267. }
  268. // Base returns the base language of the language tag. If the base language is
  269. // unspecified, an attempt will be made to infer it from the context.
  270. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  271. func (t Tag) Base() (Base, Confidence) {
  272. if t.lang != 0 {
  273. return Base{t.lang}, Exact
  274. }
  275. c := High
  276. if t.script == 0 && !(Region{t.region}).IsCountry() {
  277. c = Low
  278. }
  279. if tag, err := addTags(t); err == nil && tag.lang != 0 {
  280. return Base{tag.lang}, c
  281. }
  282. return Base{0}, No
  283. }
  284. // Script infers the script for the language tag. If it was not explicitly given, it will infer
  285. // a most likely candidate.
  286. // If more than one script is commonly used for a language, the most likely one
  287. // is returned with a low confidence indication. For example, it returns (Cyrl, Low)
  288. // for Serbian.
  289. // If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
  290. // as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
  291. // common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
  292. // See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
  293. // unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
  294. // Note that an inferred script is never guaranteed to be the correct one. Latin is
  295. // almost exclusively used for Afrikaans, but Arabic has been used for some texts
  296. // in the past. Also, the script that is commonly used may change over time.
  297. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  298. func (t Tag) Script() (Script, Confidence) {
  299. if t.script != 0 {
  300. return Script{t.script}, Exact
  301. }
  302. sc, c := scriptID(_Zzzz), No
  303. if t.lang < langNoIndexOffset {
  304. if scr := scriptID(suppressScript[t.lang]); scr != 0 {
  305. // Note: it is not always the case that a language with a suppress
  306. // script value is only written in one script (e.g. kk, ms, pa).
  307. if t.region == 0 {
  308. return Script{scriptID(scr)}, High
  309. }
  310. sc, c = scr, High
  311. }
  312. }
  313. if tag, err := addTags(t); err == nil {
  314. if tag.script != sc {
  315. sc, c = tag.script, Low
  316. }
  317. } else {
  318. t, _ = (Deprecated | Macro).Canonicalize(t)
  319. if tag, err := addTags(t); err == nil && tag.script != sc {
  320. sc, c = tag.script, Low
  321. }
  322. }
  323. return Script{sc}, c
  324. }
  325. // Region returns the region for the language tag. If it was not explicitly given, it will
  326. // infer a most likely candidate from the context.
  327. // It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
  328. func (t Tag) Region() (Region, Confidence) {
  329. if t.region != 0 {
  330. return Region{t.region}, Exact
  331. }
  332. if t, err := addTags(t); err == nil {
  333. return Region{t.region}, Low // TODO: differentiate between high and low.
  334. }
  335. t, _ = (Deprecated | Macro).Canonicalize(t)
  336. if tag, err := addTags(t); err == nil {
  337. return Region{tag.region}, Low
  338. }
  339. return Region{_ZZ}, No // TODO: return world instead of undetermined?
  340. }
  341. // Variant returns the variants specified explicitly for this language tag.
  342. // or nil if no variant was specified.
  343. func (t Tag) Variants() []Variant {
  344. v := []Variant{}
  345. if int(t.pVariant) < int(t.pExt) {
  346. for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
  347. x, str = nextToken(str)
  348. v = append(v, Variant{x})
  349. }
  350. }
  351. return v
  352. }
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
//
// In outline: a tag with variants or extensions has those stripped; a tag
// with a region is looked up in the parents table and otherwise loses its
// region; a language-script pair goes to the bare language when the script is
// the one inferred for that language, and to "und" otherwise; everything else
// has "und" as its parent.
func (t Tag) Parent() Tag {
	if t.str != "" {
		// Strip the variants and extensions.
		t, _ = Raw.Compose(t.Raw())
		// For a language-script pair, also drop the script if it is the one
		// that would be inferred for the bare language.
		if t.region == 0 && t.script != 0 && t.lang != 0 {
			base, _ := addTags(Tag{lang: t.lang})
			if base.script == t.script {
				return Tag{lang: t.lang}
			}
		}
		return t
	}
	if t.lang != 0 {
		if t.region != 0 {
			// maxScript is the explicit script, or the inferred one if the tag
			// has none.
			maxScript := t.script
			if maxScript == 0 {
				max, _ := addTags(t)
				maxScript = max.script
			}

			// Look for an entry in the parents table that matches language,
			// script and region.
			for i := range parents {
				if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
					for _, r := range parents[i].fromRegion {
						if regionID(r) == t.region {
							return Tag{
								lang:   t.lang,
								script: scriptID(parents[i].script),
								region: regionID(parents[i].toRegion),
							}
						}
					}
				}
			}

			// Strip the script if it is the default one.
			base, _ := addTags(Tag{lang: t.lang})
			if base.script != maxScript {
				return Tag{lang: t.lang, script: maxScript}
			}
			return Tag{lang: t.lang}
		} else if t.script != 0 {
			// The parent for a base-script pair with a non-default script is
			// "und" instead of the base language.
			base, _ := addTags(Tag{lang: t.lang})
			if base.script != t.script {
				return und
			}
			return Tag{lang: t.lang}
		}
	}
	return und
}
  406. // returns token t and the rest of the string.
  407. func nextToken(s string) (t, tail string) {
  408. p := strings.Index(s[1:], "-")
  409. if p == -1 {
  410. return s[1:], ""
  411. }
  412. p++
  413. return s[1:p], s[p:]
  414. }
  415. // Extension is a single BCP 47 extension.
  416. type Extension struct {
  417. s string
  418. }
  419. // String returns the string representation of the extension, including the
  420. // type tag.
  421. func (e Extension) String() string {
  422. return e.s
  423. }
  424. // ParseExtension parses s as an extension and returns it on success.
  425. func ParseExtension(s string) (e Extension, err error) {
  426. scan := makeScannerString(s)
  427. var end int
  428. if n := len(scan.token); n != 1 {
  429. return Extension{}, errSyntax
  430. }
  431. scan.toLower(0, len(scan.b))
  432. end = parseExtension(&scan)
  433. if end != len(s) {
  434. return Extension{}, errSyntax
  435. }
  436. return Extension{string(scan.b)}, nil
  437. }
  438. // Type returns the one-byte extension type of e. It returns 0 for the zero
  439. // exception.
  440. func (e Extension) Type() byte {
  441. if e.s == "" {
  442. return 0
  443. }
  444. return e.s[0]
  445. }
  446. // Tokens returns the list of tokens of e.
  447. func (e Extension) Tokens() []string {
  448. return strings.Split(e.s, "-")
  449. }
  450. // Extension returns the extension of type x for tag t. It will return
  451. // false for ok if t does not have the requested extension. The returned
  452. // extension will be invalid in this case.
  453. func (t Tag) Extension(x byte) (ext Extension, ok bool) {
  454. for i := int(t.pExt); i < len(t.str)-1; {
  455. var ext string
  456. i, ext = getExtension(t.str, i)
  457. if ext[0] == x {
  458. return Extension{ext}, true
  459. }
  460. }
  461. return Extension{}, false
  462. }
  463. // Extensions returns all extensions of t.
  464. func (t Tag) Extensions() []Extension {
  465. e := []Extension{}
  466. for i := int(t.pExt); i < len(t.str)-1; {
  467. var ext string
  468. i, ext = getExtension(t.str, i)
  469. e = append(e, Extension{ext})
  470. }
  471. return e
  472. }
  473. // TypeForKey returns the type associated with the given key, where key and type
  474. // are of the allowed values defined for the Unicode locale extension ('u') in
  475. // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
  476. // TypeForKey will traverse the inheritance chain to get the correct value.
  477. func (t Tag) TypeForKey(key string) string {
  478. if start, end, _ := t.findTypeForKey(key); end != start {
  479. return t.str[start:end]
  480. }
  481. return ""
  482. }
// Errors returned by SetTypeForKey.
var (
	// errPrivateUse is returned when the tag consists solely of a private use
	// tag.
	errPrivateUse = errors.New("cannot set a key on a private use tag")
	// errInvalidArguments is returned when the key or the value has an
	// unacceptable length.
	errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse if t is a private use tag, errInvalidArguments if
// key is not 2 bytes long or a non-empty value is not 3 to 8 bytes long, and
// the scanner's error if the generated key-type pair does not parse as an
// extension. On error the tag is returned unchanged.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
	if t.private() {
		return t, errPrivateUse
	}
	if len(key) != 2 {
		return t, errInvalidArguments
	}

	// Remove the setting if value is "".
	if value == "" {
		start, end, _ := t.findTypeForKey(key)
		if start != end {
			// Remove key tag and leading '-'.
			start -= 4

			// Remove a possible empty extension.
			if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
				start -= 2
			}
			if start == int(t.pVariant) && end == len(t.str) {
				// Nothing but the core is left, so the string is no longer
				// needed.
				t.str = ""
				t.pVariant, t.pExt = 0, 0
			} else {
				t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
			}
		}
		return t, nil
	}

	if len(value) < 3 || len(value) > 8 {
		return t, errInvalidArguments
	}

	var (
		// buf has room for a maximal core followed by a maximal single-pair
		// -u extension.
		buf    [maxCoreSize + maxSimpleUExtensionSize]byte
		uStart int // start of the -u extension.
	)

	// Generate the tag string if needed.
	if t.str == "" {
		uStart = t.genCoreBytes(buf[:])
		buf[uStart] = '-'
		uStart++
	}

	// Create new key-type pair and parse it to verify.
	b := buf[uStart:]
	copy(b, "u-")
	copy(b[2:], key)
	b[4] = '-'
	b = b[:5+copy(b[5:], value)]
	scan := makeScanner(b)
	if parseExtensions(&scan); scan.err != nil {
		return t, scan.err
	}

	// Assemble the replacement string.
	if t.str == "" {
		// The tag had no string yet: variants and extensions both start at
		// the '-' that follows the core.
		t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
		t.str = string(buf[:uStart+len(b)])
	} else {
		s := t.str
		start, end, hasExt := t.findTypeForKey(key)
		if start == end {
			// The key is not present yet: insert the pair at the returned
			// position. If a -u extension already exists, the "u-" prefix of
			// the generated pair is dropped.
			if hasExt {
				b = b[2:]
			}
			t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
		} else {
			// The key is present: replace its type in place.
			t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
		}
	}
	return t, nil
}
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. In the latter case start and end are equal. The hasExt return
// value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
	p := int(t.pExt)
	// Nothing to search if the key is malformed or the tag has no extensions.
	if len(key) != 2 || p == len(t.str) || p == 0 {
		return p, p, false
	}
	s := t.str

	// Find the correct extension.
	for p++; s[p] != 'u'; p++ {
		if s[p] > 'u' {
			// This extension sorts after 'u': the insertion point is the
			// hyphen in front of it.
			p--
			return p, p, false
		}
		if p = nextExtension(s, p); p == len(s) {
			return len(s), len(s), false
		}
	}
	// Proceed to the hyphen following the extension name.
	p++

	// curKey is the key currently being processed.
	curKey := ""

	// Iterate over keys until we get the end of a section.
	for {
		// p points to the hyphen preceding the current token.
		if p3 := p + 3; s[p3] == '-' {
			// Found a key.
			// Check whether we just processed the key that was requested.
			if curKey == key {
				return start, p, true
			}
			// Set to the next key and continue scanning type tokens.
			curKey = s[p+1 : p3]
			if curKey > key {
				// Keys beyond this point compare greater than the requested
				// one: this is the insertion point.
				return p, p, true
			}
			// Start of the type token sequence.
			start = p + 4
			// A type is at least 3 characters long.
			p += 7 // 4 + 3
		} else {
			// Attribute or type, which is at least 3 characters long.
			p += 4
		}
		// p points past the third character of a type or attribute.
		max := p + 5 // maximum length of token plus hyphen.
		if len(s) < max {
			max = len(s)
		}
		for ; p < max && s[p] != '-'; p++ {
		}
		// Bail if we have exhausted all tokens or if the next token starts
		// a new extension.
		if p == len(s) || s[p+2] == '-' {
			if curKey == key {
				return start, p, true
			}
			return p, p, true
		}
	}
}
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
	// TODO: perhaps give more frequent tags a lower index.
	// TODO: we could make the indexes stable. This will exclude some
	//       possibilities for optimization, so don't do this quite yet.
	b, s, r := t.Raw()
	if len(t.str) > 0 {
		if strings.HasPrefix(t.str, "x-") {
			// We have no entries for user-defined tags.
			return 0, false
		}
		if uint16(t.pVariant) != t.pExt {
			// There are no tags with variants and an u-va type.
			if t.TypeForKey("va") != "" {
				return 0, false
			}
			// Rebuild the tag with its variants only, dropping all extensions.
			t, _ = Raw.Compose(b, s, r, t.Variants())
		} else if _, ok := t.Extension('u'); ok {
			// Strip all but the 'va' entry.
			variant := t.TypeForKey("va")
			t, _ = Raw.Compose(b, s, r)
			t, _ = t.SetTypeForKey("va", variant)
		}
		if len(t.str) > 0 {
			// We have some variants.
			// Tags that still carry a string are matched against the
			// specialTags list; their index is the list position plus one.
			for i, s := range specialTags {
				if s == t {
					return i + 1, true
				}
			}
			return 0, false
		}
	}
	// No variants specified: just compare core components.
	// The key has the form lllssrrr, where l, s, and r are nibbles for
	// respectively the langID, scriptID, and regionID.
	key := uint32(b.langID) << (8 + 12)
	key |= uint32(s.scriptID) << 12
	key |= uint32(r.regionID)
	x, ok := coreTags[key]
	return int(x), ok
}
  668. // Base is an ISO 639 language code, used for encoding the base language
  669. // of a language tag.
  670. type Base struct {
  671. langID
  672. }
  673. // ParseBase parses a 2- or 3-letter ISO 639 code.
  674. // It returns a ValueError if s is a well-formed but unknown language identifier
  675. // or another error if another error occurred.
  676. func ParseBase(s string) (Base, error) {
  677. if n := len(s); n < 2 || 3 < n {
  678. return Base{}, errSyntax
  679. }
  680. var buf [3]byte
  681. l, err := getLangID(buf[:copy(buf[:], s)])
  682. return Base{l}, err
  683. }
  684. // Script is a 4-letter ISO 15924 code for representing scripts.
  685. // It is idiomatically represented in title case.
  686. type Script struct {
  687. scriptID
  688. }
  689. // ParseScript parses a 4-letter ISO 15924 code.
  690. // It returns a ValueError if s is a well-formed but unknown script identifier
  691. // or another error if another error occurred.
  692. func ParseScript(s string) (Script, error) {
  693. if len(s) != 4 {
  694. return Script{}, errSyntax
  695. }
  696. var buf [4]byte
  697. sc, err := getScriptID(script, buf[:copy(buf[:], s)])
  698. return Script{sc}, err
  699. }
  700. // Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
  701. type Region struct {
  702. regionID
  703. }
  704. // EncodeM49 returns the Region for the given UN M.49 code.
  705. // It returns an error if r is not a valid code.
  706. func EncodeM49(r int) (Region, error) {
  707. rid, err := getRegionM49(r)
  708. return Region{rid}, err
  709. }
  710. // ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
  711. // It returns a ValueError if s is a well-formed but unknown region identifier
  712. // or another error if another error occurred.
  713. func ParseRegion(s string) (Region, error) {
  714. if n := len(s); n < 2 || 3 < n {
  715. return Region{}, errSyntax
  716. }
  717. var buf [3]byte
  718. r, err := getRegionID(buf[:copy(buf[:], s)])
  719. return Region{r}, err
  720. }
  721. // IsCountry returns whether this region is a country or autonomous area. This
  722. // includes non-standard definitions from CLDR.
  723. func (r Region) IsCountry() bool {
  724. if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
  725. return false
  726. }
  727. return true
  728. }
  729. // IsGroup returns whether this region defines a collection of regions. This
  730. // includes non-standard definitions from CLDR.
  731. func (r Region) IsGroup() bool {
  732. if r.regionID == 0 {
  733. return false
  734. }
  735. return int(regionInclusion[r.regionID]) < len(regionContainment)
  736. }
  737. // Contains returns whether Region c is contained by Region r. It returns true
  738. // if c == r.
  739. func (r Region) Contains(c Region) bool {
  740. return r.regionID.contains(c.regionID)
  741. }
  742. func (r regionID) contains(c regionID) bool {
  743. if r == c {
  744. return true
  745. }
  746. g := regionInclusion[r]
  747. if g >= nRegionGroups {
  748. return false
  749. }
  750. m := regionContainment[g]
  751. d := regionInclusion[c]
  752. b := regionInclusionBits[d]
  753. // A contained country may belong to multiple disjoint groups. Matching any
  754. // of these indicates containment. If the contained region is a group, it
  755. // must strictly be a subset.
  756. if d >= nRegionGroups {
  757. return b&m != 0
  758. }
  759. return b&^m == 0
  760. }
  761. var errNoTLD = errors.New("language: region is not a valid ccTLD")
  762. // TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
  763. // In all other cases it returns either the region itself or an error.
  764. //
  765. // This method may return an error for a region for which there exists a
  766. // canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
  767. // region will already be canonicalized it was obtained from a Tag that was
  768. // obtained using any of the default methods.
  769. func (r Region) TLD() (Region, error) {
  770. // See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
  771. // difference between ISO 3166-1 and IANA ccTLD.
  772. if r.regionID == _GB {
  773. r = Region{_UK}
  774. }
  775. if (r.typ() & ccTLD) == 0 {
  776. return Region{}, errNoTLD
  777. }
  778. return r, nil
  779. }
  780. // Canonicalize returns the region or a possible replacement if the region is
  781. // deprecated. It will not return a replacement for deprecated regions that
  782. // are split into multiple regions.
  783. func (r Region) Canonicalize() Region {
  784. if cr := normRegion(r.regionID); cr != 0 {
  785. return Region{cr}
  786. }
  787. return r
  788. }
  789. // Variant represents a registered variant of a language as defined by BCP 47.
  790. type Variant struct {
  791. variant string
  792. }
  793. // ParseVariant parses and returns a Variant. An error is returned if s is not
  794. // a valid variant.
  795. func ParseVariant(s string) (Variant, error) {
  796. s = strings.ToLower(s)
  797. if _, ok := variantIndex[s]; ok {
  798. return Variant{s}, nil
  799. }
  800. return Variant{}, mkErrInvalid([]byte(s))
  801. }
  802. // String returns the string representation of the variant.
  803. func (v Variant) String() string {
  804. return v.variant
  805. }