ianaindex.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go
  5. // Package ianaindex maps names to Encodings as specified by the IANA registry.
  6. // This includes both the MIME and IANA names.
  7. //
  8. // See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
  9. // more details.
  10. package ianaindex
  11. import (
  12. "errors"
  13. "sort"
  14. "strings"
  15. "golang.org/x/text/encoding"
  16. "golang.org/x/text/encoding/charmap"
  17. "golang.org/x/text/encoding/internal/identifier"
  18. "golang.org/x/text/encoding/japanese"
  19. "golang.org/x/text/encoding/korean"
  20. "golang.org/x/text/encoding/simplifiedchinese"
  21. "golang.org/x/text/encoding/traditionalchinese"
  22. "golang.org/x/text/encoding/unicode"
  23. )
  24. // TODO: remove the "Status... incomplete" in the package doc comment.
  25. // TODO: allow users to specify their own aliases?
  26. // TODO: allow users to specify their own indexes?
  27. // TODO: allow canonicalizing names
  28. // NOTE: only use these top-level variables if we can get the linker to drop
  29. // the indexes when they are not used. Make them a function or perhaps only
  30. // support MIME otherwise.
  31. var (
  32. // MIME is an index to map MIME names.
  33. MIME *Index = mime
  34. // IANA is an index that supports all names and aliases using IANA names as
  35. // the canonical identifier.
  36. IANA *Index = iana
  37. // MIB is an index that associates the MIB display name with an Encoding.
  38. MIB *Index = mib
  39. mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
  40. iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
  41. mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
  42. )
  43. // Index maps names registered by IANA to Encodings.
  44. // Currently different Indexes only differ in the names they return for
  45. // encodings. In the future they may also differ in supported aliases.
  46. type Index struct {
  47. names func(i int) string
  48. toMIB []identifier.MIB // Sorted slice of supported MIBs
  49. alias map[string]int
  50. enc []encoding.Encoding
  51. }
  52. var (
  53. errInvalidName = errors.New("ianaindex: invalid encoding name")
  54. errUnknown = errors.New("ianaindex: unknown Encoding")
  55. errUnsupported = errors.New("ianaindex: unsupported Encoding")
  56. )
  57. // Encoding returns an Encoding for IANA-registered names. Matching is
  58. // case-insensitive.
  59. //
  60. // If the provided name doesn't match a IANA-registered charset, an error is
  61. // returned. If the name matches a IANA-registered charset but isn't supported,
  62. // a nil encoding and a nil error are returned.
  63. func (x *Index) Encoding(name string) (encoding.Encoding, error) {
  64. name = strings.TrimSpace(name)
  65. // First try without lowercasing (possibly creating an allocation).
  66. i, ok := x.alias[name]
  67. if !ok {
  68. i, ok = x.alias[strings.ToLower(name)]
  69. if !ok {
  70. return nil, errInvalidName
  71. }
  72. }
  73. return x.enc[i], nil
  74. }
  75. // Name reports the canonical name of the given Encoding. It will return an
  76. // error if the e is not associated with a known encoding scheme.
  77. func (x *Index) Name(e encoding.Encoding) (string, error) {
  78. id, ok := e.(identifier.Interface)
  79. if !ok {
  80. return "", errUnknown
  81. }
  82. mib, _ := id.ID()
  83. if mib == 0 {
  84. return "", errUnknown
  85. }
  86. v := findMIB(x.toMIB, mib)
  87. if v == -1 {
  88. return "", errUnsupported
  89. }
  90. return x.names(v), nil
  91. }
  92. // TODO: the coverage of this index is rather spotty. Allowing users to set
  93. // encodings would allow:
  94. // - users to increase coverage
// - allow a partially loaded set of encodings in case the user doesn't need
// them all.
  97. // - write an OS-specific wrapper for supported encodings and set them.
  98. // The exact definition of Set depends a bit on if and how we want to let users
  99. // write their own Encoding implementations. Also, it is not possible yet to
  100. // only partially load the encodings without doing some refactoring. Until this
  101. // is solved, we might as well not support Set.
  102. // // Set sets the e to be used for the encoding scheme identified by name. Only
  103. // // canonical names may be used. An empty name assigns e to its internally
  104. // // associated encoding scheme.
  105. // func (x *Index) Set(name string, e encoding.Encoding) error {
  106. // panic("TODO: implement")
  107. // }
  108. func findMIB(x []identifier.MIB, mib identifier.MIB) int {
  109. i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
  110. if i < len(x) && x[i] == mib {
  111. return i
  112. }
  113. return -1
  114. }
  115. const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
  116. func mimeName(x int) string {
  117. n := ianaNames[x]
  118. // See gen.go for a description of the encoding.
  119. if n[0] <= maxMIMENameLen {
  120. return n[1:n[0]]
  121. }
  122. return n
  123. }
  124. func ianaName(x int) string {
  125. n := ianaNames[x]
  126. // See gen.go for a description of the encoding.
  127. if n[0] <= maxMIMENameLen {
  128. return n[n[0]:]
  129. }
  130. return n
  131. }
  132. func mibName(x int) string {
  133. return mibNames[x]
  134. }
  135. var encodings = [numIANA]encoding.Encoding{
  136. enc3: asciiEnc,
  137. enc106: unicode.UTF8,
  138. enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
  139. enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
  140. enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
  141. enc2028: charmap.CodePage037,
  142. enc2011: charmap.CodePage437,
  143. enc2009: charmap.CodePage850,
  144. enc2010: charmap.CodePage852,
  145. enc2046: charmap.CodePage855,
  146. enc2089: charmap.CodePage858,
  147. enc2048: charmap.CodePage860,
  148. enc2013: charmap.CodePage862,
  149. enc2050: charmap.CodePage863,
  150. enc2052: charmap.CodePage865,
  151. enc2086: charmap.CodePage866,
  152. enc2102: charmap.CodePage1047,
  153. enc2091: charmap.CodePage1140,
  154. enc4: charmap.ISO8859_1,
  155. enc5: charmap.ISO8859_2,
  156. enc6: charmap.ISO8859_3,
  157. enc7: charmap.ISO8859_4,
  158. enc8: charmap.ISO8859_5,
  159. enc9: charmap.ISO8859_6,
  160. enc81: charmap.ISO8859_6E,
  161. enc82: charmap.ISO8859_6I,
  162. enc10: charmap.ISO8859_7,
  163. enc11: charmap.ISO8859_8,
  164. enc84: charmap.ISO8859_8E,
  165. enc85: charmap.ISO8859_8I,
  166. enc12: charmap.ISO8859_9,
  167. enc13: charmap.ISO8859_10,
  168. enc109: charmap.ISO8859_13,
  169. enc110: charmap.ISO8859_14,
  170. enc111: charmap.ISO8859_15,
  171. enc112: charmap.ISO8859_16,
  172. enc2084: charmap.KOI8R,
  173. enc2088: charmap.KOI8U,
  174. enc2027: charmap.Macintosh,
  175. enc2109: charmap.Windows874,
  176. enc2250: charmap.Windows1250,
  177. enc2251: charmap.Windows1251,
  178. enc2252: charmap.Windows1252,
  179. enc2253: charmap.Windows1253,
  180. enc2254: charmap.Windows1254,
  181. enc2255: charmap.Windows1255,
  182. enc2256: charmap.Windows1256,
  183. enc2257: charmap.Windows1257,
  184. enc2258: charmap.Windows1258,
  185. enc18: japanese.EUCJP,
  186. enc39: japanese.ISO2022JP,
  187. enc17: japanese.ShiftJIS,
  188. enc38: korean.EUCKR,
  189. enc114: simplifiedchinese.GB18030,
  190. enc113: simplifiedchinese.GBK,
  191. enc2085: simplifiedchinese.HZGB2312,
  192. enc2026: traditionalchinese.Big5,
  193. }