maketables.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

  5. package main
  6. import (
  7. "bufio"
  8. "fmt"
  9. "log"
  10. "net/http"
  11. "sort"
  12. "strings"
  13. "unicode/utf8"
  14. "golang.org/x/text/encoding"
  15. "golang.org/x/text/internal/gen"
  16. )
  17. const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
  18. "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
  19. ` !"#$%&'()*+,-./0123456789:;<=>?` +
  20. `@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
  21. "`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
  22. var encodings = []struct {
  23. name string
  24. mib string
  25. comment string
  26. varName string
  27. replacement byte
  28. mapping string
  29. }{
  30. {
  31. "IBM Code Page 437",
  32. "PC8CodePage437",
  33. "",
  34. "CodePage437",
  35. encoding.ASCIISub,
  36. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
  37. },
  38. {
  39. "IBM Code Page 850",
  40. "PC850Multilingual",
  41. "",
  42. "CodePage850",
  43. encoding.ASCIISub,
  44. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
  45. },
  46. {
  47. "IBM Code Page 852",
  48. "PCp852",
  49. "",
  50. "CodePage852",
  51. encoding.ASCIISub,
  52. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
  53. },
  54. {
  55. "IBM Code Page 855",
  56. "IBM855",
  57. "",
  58. "CodePage855",
  59. encoding.ASCIISub,
  60. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
  61. },
  62. {
  63. "Windows Code Page 858", // PC latin1 with Euro
  64. "IBM00858",
  65. "",
  66. "CodePage858",
  67. encoding.ASCIISub,
  68. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
  69. },
  70. {
  71. "IBM Code Page 860",
  72. "IBM860",
  73. "",
  74. "CodePage860",
  75. encoding.ASCIISub,
  76. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
  77. },
  78. {
  79. "IBM Code Page 862",
  80. "PC862LatinHebrew",
  81. "",
  82. "CodePage862",
  83. encoding.ASCIISub,
  84. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
  85. },
  86. {
  87. "IBM Code Page 863",
  88. "IBM863",
  89. "",
  90. "CodePage863",
  91. encoding.ASCIISub,
  92. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
  93. },
  94. {
  95. "IBM Code Page 865",
  96. "IBM865",
  97. "",
  98. "CodePage865",
  99. encoding.ASCIISub,
  100. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
  101. },
  102. {
  103. "IBM Code Page 866",
  104. "IBM866",
  105. "",
  106. "CodePage866",
  107. encoding.ASCIISub,
  108. "http://encoding.spec.whatwg.org/index-ibm866.txt",
  109. },
  110. {
  111. "ISO 8859-1",
  112. "ISOLatin1",
  113. "",
  114. "ISO8859_1",
  115. encoding.ASCIISub,
  116. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
  117. },
  118. {
  119. "ISO 8859-2",
  120. "ISOLatin2",
  121. "",
  122. "ISO8859_2",
  123. encoding.ASCIISub,
  124. "http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
  125. },
  126. {
  127. "ISO 8859-3",
  128. "ISOLatin3",
  129. "",
  130. "ISO8859_3",
  131. encoding.ASCIISub,
  132. "http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
  133. },
  134. {
  135. "ISO 8859-4",
  136. "ISOLatin4",
  137. "",
  138. "ISO8859_4",
  139. encoding.ASCIISub,
  140. "http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
  141. },
  142. {
  143. "ISO 8859-5",
  144. "ISOLatinCyrillic",
  145. "",
  146. "ISO8859_5",
  147. encoding.ASCIISub,
  148. "http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
  149. },
  150. {
  151. "ISO 8859-6",
  152. "ISOLatinArabic",
  153. "",
  154. "ISO8859_6,ISO8859_6E,ISO8859_6I",
  155. encoding.ASCIISub,
  156. "http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
  157. },
  158. {
  159. "ISO 8859-7",
  160. "ISOLatinGreek",
  161. "",
  162. "ISO8859_7",
  163. encoding.ASCIISub,
  164. "http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
  165. },
  166. {
  167. "ISO 8859-8",
  168. "ISOLatinHebrew",
  169. "",
  170. "ISO8859_8,ISO8859_8E,ISO8859_8I",
  171. encoding.ASCIISub,
  172. "http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
  173. },
  174. {
  175. "ISO 8859-10",
  176. "ISOLatin6",
  177. "",
  178. "ISO8859_10",
  179. encoding.ASCIISub,
  180. "http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
  181. },
  182. {
  183. "ISO 8859-13",
  184. "ISO885913",
  185. "",
  186. "ISO8859_13",
  187. encoding.ASCIISub,
  188. "http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
  189. },
  190. {
  191. "ISO 8859-14",
  192. "ISO885914",
  193. "",
  194. "ISO8859_14",
  195. encoding.ASCIISub,
  196. "http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
  197. },
  198. {
  199. "ISO 8859-15",
  200. "ISO885915",
  201. "",
  202. "ISO8859_15",
  203. encoding.ASCIISub,
  204. "http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
  205. },
  206. {
  207. "ISO 8859-16",
  208. "ISO885916",
  209. "",
  210. "ISO8859_16",
  211. encoding.ASCIISub,
  212. "http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
  213. },
  214. {
  215. "KOI8-R",
  216. "KOI8R",
  217. "",
  218. "KOI8R",
  219. encoding.ASCIISub,
  220. "http://encoding.spec.whatwg.org/index-koi8-r.txt",
  221. },
  222. {
  223. "KOI8-U",
  224. "KOI8U",
  225. "",
  226. "KOI8U",
  227. encoding.ASCIISub,
  228. "http://encoding.spec.whatwg.org/index-koi8-u.txt",
  229. },
  230. {
  231. "Macintosh",
  232. "Macintosh",
  233. "",
  234. "Macintosh",
  235. encoding.ASCIISub,
  236. "http://encoding.spec.whatwg.org/index-macintosh.txt",
  237. },
  238. {
  239. "Macintosh Cyrillic",
  240. "MacintoshCyrillic",
  241. "",
  242. "MacintoshCyrillic",
  243. encoding.ASCIISub,
  244. "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
  245. },
  246. {
  247. "Windows 874",
  248. "Windows874",
  249. "",
  250. "Windows874",
  251. encoding.ASCIISub,
  252. "http://encoding.spec.whatwg.org/index-windows-874.txt",
  253. },
  254. {
  255. "Windows 1250",
  256. "Windows1250",
  257. "",
  258. "Windows1250",
  259. encoding.ASCIISub,
  260. "http://encoding.spec.whatwg.org/index-windows-1250.txt",
  261. },
  262. {
  263. "Windows 1251",
  264. "Windows1251",
  265. "",
  266. "Windows1251",
  267. encoding.ASCIISub,
  268. "http://encoding.spec.whatwg.org/index-windows-1251.txt",
  269. },
  270. {
  271. "Windows 1252",
  272. "Windows1252",
  273. "",
  274. "Windows1252",
  275. encoding.ASCIISub,
  276. "http://encoding.spec.whatwg.org/index-windows-1252.txt",
  277. },
  278. {
  279. "Windows 1253",
  280. "Windows1253",
  281. "",
  282. "Windows1253",
  283. encoding.ASCIISub,
  284. "http://encoding.spec.whatwg.org/index-windows-1253.txt",
  285. },
  286. {
  287. "Windows 1254",
  288. "Windows1254",
  289. "",
  290. "Windows1254",
  291. encoding.ASCIISub,
  292. "http://encoding.spec.whatwg.org/index-windows-1254.txt",
  293. },
  294. {
  295. "Windows 1255",
  296. "Windows1255",
  297. "",
  298. "Windows1255",
  299. encoding.ASCIISub,
  300. "http://encoding.spec.whatwg.org/index-windows-1255.txt",
  301. },
  302. {
  303. "Windows 1256",
  304. "Windows1256",
  305. "",
  306. "Windows1256",
  307. encoding.ASCIISub,
  308. "http://encoding.spec.whatwg.org/index-windows-1256.txt",
  309. },
  310. {
  311. "Windows 1257",
  312. "Windows1257",
  313. "",
  314. "Windows1257",
  315. encoding.ASCIISub,
  316. "http://encoding.spec.whatwg.org/index-windows-1257.txt",
  317. },
  318. {
  319. "Windows 1258",
  320. "Windows1258",
  321. "",
  322. "Windows1258",
  323. encoding.ASCIISub,
  324. "http://encoding.spec.whatwg.org/index-windows-1258.txt",
  325. },
  326. {
  327. "X-User-Defined",
  328. "XUserDefined",
  329. "It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
  330. "XUserDefined",
  331. encoding.ASCIISub,
  332. ascii +
  333. "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
  334. "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
  335. "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
  336. "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
  337. "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
  338. "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
  339. "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
  340. "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
  341. "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
  342. "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
  343. "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
  344. "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
  345. "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
  346. "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
  347. "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
  348. "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
  349. },
  350. }
  351. func getWHATWG(url string) string {
  352. res, err := http.Get(url)
  353. if err != nil {
  354. log.Fatalf("%q: Get: %v", url, err)
  355. }
  356. defer res.Body.Close()
  357. mapping := make([]rune, 128)
  358. for i := range mapping {
  359. mapping[i] = '\ufffd'
  360. }
  361. scanner := bufio.NewScanner(res.Body)
  362. for scanner.Scan() {
  363. s := strings.TrimSpace(scanner.Text())
  364. if s == "" || s[0] == '#' {
  365. continue
  366. }
  367. x, y := 0, 0
  368. if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
  369. log.Fatalf("could not parse %q", s)
  370. }
  371. if x < 0 || 128 <= x {
  372. log.Fatalf("code %d is out of range", x)
  373. }
  374. if 0x80 <= y && y < 0xa0 {
  375. // We diverge from the WHATWG spec by mapping control characters
  376. // in the range [0x80, 0xa0) to U+FFFD.
  377. continue
  378. }
  379. mapping[x] = rune(y)
  380. }
  381. return ascii + string(mapping)
  382. }
  383. func getUCM(url string) string {
  384. res, err := http.Get(url)
  385. if err != nil {
  386. log.Fatalf("%q: Get: %v", url, err)
  387. }
  388. defer res.Body.Close()
  389. mapping := make([]rune, 256)
  390. for i := range mapping {
  391. mapping[i] = '\ufffd'
  392. }
  393. charsFound := 0
  394. scanner := bufio.NewScanner(res.Body)
  395. for scanner.Scan() {
  396. s := strings.TrimSpace(scanner.Text())
  397. if s == "" || s[0] == '#' {
  398. continue
  399. }
  400. var c byte
  401. var r rune
  402. if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
  403. continue
  404. }
  405. mapping[c] = r
  406. charsFound++
  407. }
  408. if charsFound < 200 {
  409. log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
  410. }
  411. return string(mapping)
  412. }
  413. func main() {
  414. mibs := map[string]bool{}
  415. all := []string{}
  416. w := gen.NewCodeWriter()
  417. defer w.WriteGoFile("tables.go", "charmap")
  418. printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
  419. printf("import (\n")
  420. printf("\t\"golang.org/x/text/encoding\"\n")
  421. printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
  422. printf(")\n\n")
  423. for _, e := range encodings {
  424. varNames := strings.Split(e.varName, ",")
  425. all = append(all, varNames...)
  426. varName := varNames[0]
  427. switch {
  428. case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
  429. e.mapping = getWHATWG(e.mapping)
  430. case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
  431. e.mapping = getUCM(e.mapping)
  432. }
  433. asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
  434. if asciiSuperset {
  435. low = 0x80
  436. }
  437. lvn := 1
  438. if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
  439. lvn = 3
  440. }
  441. lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
  442. printf("// %s is the %s encoding.\n", varName, e.name)
  443. if e.comment != "" {
  444. printf("//\n// %s\n", e.comment)
  445. }
  446. printf("var %s encoding.Encoding = &%s\n\nvar %s = charmap{\nname: %q,\n",
  447. varName, lowerVarName, lowerVarName, e.name)
  448. if mibs[e.mib] {
  449. log.Fatalf("MIB type %q declared multiple times.", e.mib)
  450. }
  451. printf("mib: identifier.%s,\n", e.mib)
  452. printf("asciiSuperset: %t,\n", asciiSuperset)
  453. printf("low: 0x%02x,\n", low)
  454. printf("replacement: 0x%02x,\n", e.replacement)
  455. printf("decode: [256]utf8Enc{\n")
  456. i, backMapping := 0, map[rune]byte{}
  457. for _, c := range e.mapping {
  458. if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
  459. backMapping[c] = byte(i)
  460. }
  461. var buf [8]byte
  462. n := utf8.EncodeRune(buf[:], c)
  463. if n > 3 {
  464. panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
  465. }
  466. printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
  467. if i%2 == 1 {
  468. printf("\n")
  469. }
  470. i++
  471. }
  472. printf("},\n")
  473. printf("encode: [256]uint32{\n")
  474. encode := make([]uint32, 0, 256)
  475. for c, i := range backMapping {
  476. encode = append(encode, uint32(i)<<24|uint32(c))
  477. }
  478. sort.Sort(byRune(encode))
  479. for len(encode) < cap(encode) {
  480. encode = append(encode, encode[len(encode)-1])
  481. }
  482. for i, enc := range encode {
  483. printf("0x%08x,", enc)
  484. if i%8 == 7 {
  485. printf("\n")
  486. }
  487. }
  488. printf("},\n}\n")
  489. // Add an estimate of the size of a single charmap{} struct value, which
  490. // includes two 256 elem arrays of 4 bytes and some extra fields, which
  491. // align to 3 uint64s on 64-bit architectures.
  492. w.Size += 2*4*256 + 3*8
  493. }
  494. // TODO: add proper line breaking.
  495. printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
  496. }
  497. type byRune []uint32
  498. func (b byRune) Len() int { return len(b) }
  499. func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
  500. func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }