util.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. package parse
  2. import (
  3. "bytes"
  4. "fmt"
  5. "strconv"
  6. "unicode"
  7. )
  // Copy returns a freshly allocated slice holding the same bytes as src.
  // The result does not share memory with src and is non-nil even when src is
  // nil or empty.
  func Copy(src []byte) (dst []byte) {
  	return append(make([]byte, 0, len(src)), src...)
  }
  // ToLower lowercases the ASCII letters A-Z in src in place and returns src.
  // All other bytes, including the bytes of multi-byte UTF-8 sequences, are
  // left untouched.
  func ToLower(src []byte) []byte {
  	for i := 0; i < len(src); i++ {
  		if 'A' <= src[i] && src[i] <= 'Z' {
  			src[i] += 'a' - 'A'
  		}
  	}
  	return src
  }
  // EqualFold reports whether s equals targetLower when the ASCII letters A-Z in
  // s are folded to lowercase. targetLower must already be lowercase: its bytes
  // are compared as they are and never folded.
  func EqualFold(s, targetLower []byte) bool {
  	if len(s) != len(targetLower) {
  		return false
  	}
  	for i := range s {
  		got, want := s[i], targetLower[i]
  		if got == want {
  			continue
  		}
  		// The only other acceptable case is an uppercase letter in s whose
  		// lowercase form is the wanted byte.
  		if 'A' <= got && got <= 'Z' && got+('a'-'A') == want {
  			continue
  		}
  		return false
  	}
  	return true
  }
  // Printable returns a human-readable representation of r: the character
  // itself when unicode.IsGraphic reports true, a hexadecimal byte such as 0x0A
  // for other values below 128, and the U+XXXX notation for everything else.
  func Printable(r rune) string {
  	switch {
  	case unicode.IsGraphic(r):
  		return fmt.Sprintf("%c", r)
  	case r < 128:
  		return fmt.Sprintf("0x%02X", r)
  	default:
  		return fmt.Sprintf("%U", r)
  	}
  }
  // whitespaceTable is indexed by byte value and is true for the five bytes
  // treated as whitespace: tab, new line, form feed, carriage return and space.
  // Every other entry, including all non-ASCII bytes, is false.
  var whitespaceTable = [256]bool{
  	'\t': true,
  	'\n': true,
  	'\f': true,
  	'\r': true,
  	' ':  true,
  }

  // IsWhitespace returns true for space, \n, \r, \t, \f.
  func IsWhitespace(c byte) bool {
  	return whitespaceTable[c]
  }
  // newlineTable is indexed by byte value and is true only for new line and
  // carriage return.
  var newlineTable = [256]bool{
  	'\n': true,
  	'\r': true,
  }

  // IsNewline returns true for \n, \r.
  func IsNewline(c byte) bool {
  	return newlineTable[c]
  }
  125. // IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
  126. func IsAllWhitespace(b []byte) bool {
  127. for _, c := range b {
  128. if !IsWhitespace(c) {
  129. return false
  130. }
  131. }
  132. return true
  133. }
  134. // TrimWhitespace removes any leading and trailing whitespace characters.
  135. func TrimWhitespace(b []byte) []byte {
  136. n := len(b)
  137. start := n
  138. for i := 0; i < n; i++ {
  139. if !IsWhitespace(b[i]) {
  140. start = i
  141. break
  142. }
  143. }
  144. end := n
  145. for i := n - 1; i >= start; i-- {
  146. if !IsWhitespace(b[i]) {
  147. end = i + 1
  148. break
  149. }
  150. }
  151. return b[start:end]
  152. }
  153. // ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
  154. func ReplaceMultipleWhitespace(b []byte) []byte {
  155. j, k := 0, 0 // j is write position, k is start of next text section
  156. for i := 0; i < len(b); i++ {
  157. if IsWhitespace(b[i]) {
  158. start := i
  159. newline := IsNewline(b[i])
  160. i++
  161. for ; i < len(b) && IsWhitespace(b[i]); i++ {
  162. if IsNewline(b[i]) {
  163. newline = true
  164. }
  165. }
  166. if newline {
  167. b[start] = '\n'
  168. } else {
  169. b[start] = ' '
  170. }
  171. if 1 < i-start { // more than one whitespace
  172. if j == 0 {
  173. j = start + 1
  174. } else {
  175. j += copy(b[j:], b[k:start+1])
  176. }
  177. k = i
  178. }
  179. }
  180. }
  181. if j == 0 {
  182. return b
  183. } else if j == 1 { // only if starts with whitespace
  184. b[k-1] = b[0]
  185. return b[k-1:]
  186. } else if k < len(b) {
  187. j += copy(b[j:], b[k:])
  188. }
  189. return b[:j]
  190. }
  // replaceEntities rewrites the character reference that starts at b[i],
  // editing b in place. The caller must guarantee that b[i] == '&' and that
  // i+3 < len(b).
  //
  // Three forms are recognised, all of which must end in a semicolon to be
  // replaced:
  //   - hexadecimal (&#x41;): values below 128 become the raw byte, values below
  //     10000 are rewritten as a decimal reference, larger values are left as
  //     written;
  //   - decimal (&#65;): only values below 128 are replaced, by the raw byte;
  //   - named (&amp;): the name between '&' and ';' is looked up in entitiesMap.
  //
  // When the replacement is a single byte that has an entry in revEntitiesMap,
  // that entry is written instead of the raw byte (presumably these are bytes
  // that must stay escaped; confirm against the callers' maps). A replacement
  // that is longer than the reference it would replace is never applied.
  //
  // It returns the possibly shortened slice and the index of the last byte that
  // has been handled, so that the next iteration can safely do i++ to continue
  // and not miss any entities.
  func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
  	// Longest HTML entity: CounterClockwiseContourIntegral.
  	const MaxEntityLength = 31
  	// Hexadecimal references at or above this value are left as written.
  	const maxHexValue = 10000

  	var r []byte
  	j := i + 1
  	if b[j] == '#' {
  		j++
  		if b[j] == 'x' {
  			j++
  			c := 0
  			// Stop as soon as c reaches the limit: such a reference is not
  			// replaced anyway, and accumulating further digits would overflow
  			// c and make a huge value look like a small one.
  			for ; j < len(b) && c < maxHexValue && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
  				if b[j] <= '9' {
  					c = c<<4 + int(b[j]-'0')
  				} else if b[j] <= 'F' {
  					c = c<<4 + int(b[j]-'A') + 10
  				} else if b[j] <= 'f' {
  					c = c<<4 + int(b[j]-'a') + 10
  				}
  			}
  			if j <= i+3 || maxHexValue <= c {
  				// No digits at all, or a value that is not rewritten.
  				return b, j - 1
  			}
  			if c < 128 {
  				r = []byte{byte(c)}
  			} else {
  				r = append(r, '&', '#')
  				r = strconv.AppendInt(r, int64(c), 10)
  				r = append(r, ';')
  			}
  		} else {
  			c := 0
  			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
  				c = c*10 + int(b[j]-'0')
  			}
  			if j <= i+2 || 128 <= c {
  				// No digits at all, or not an ASCII value.
  				return b, j - 1
  			}
  			r = []byte{byte(c)}
  		}
  	} else {
  		// Named reference: scan to the semicolon, but no further than the
  		// longest known name.
  		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
  		}
  		if j <= i+1 || len(b) <= j {
  			return b, j - 1
  		}

  		var ok bool
  		r, ok = entitiesMap[string(b[i+1:j])]
  		if !ok {
  			return b, j
  		}
  	}

  	// j is at the semicolon, if there is one.
  	n := j + 1 - i
  	if j < len(b) && b[j] == ';' && 2 < n {
  		if len(r) == 1 {
  			if q, ok := revEntitiesMap[r[0]]; ok {
  				if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
  					// Already written in the preferred form.
  					return b, j
  				}
  				r = q
  			} else if r[0] == '&' {
  				// check if for example &amp; is followed by something that could potentially be an entity
  				k := j + 1
  				if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') {
  					return b, k
  				}
  			}
  		}
  		if n < len(r) {
  			// The replacement does not fit in the space of the reference;
  			// writing it would run past the end of b. Keep the reference.
  			return b, j
  		}

  		copy(b[i:], r)
  		copy(b[i+len(r):], b[j+1:])
  		b = b[:len(b)-n+len(r)]
  		return b, i + len(r) - 1
  	}
  	return b, i
  }
  266. // ReplaceEntities replaces all occurrences of entites (such as &quot;) to their respective unencoded bytes.
  267. func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
  268. for i := 0; i < len(b); i++ {
  269. if b[i] == '&' && i+3 < len(b) {
  270. b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
  271. }
  272. }
  273. return b
  274. }
  275. // ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
  276. func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
  277. j, k := 0, 0 // j is write position, k is start of next text section
  278. for i := 0; i < len(b); i++ {
  279. if IsWhitespace(b[i]) {
  280. start := i
  281. newline := IsNewline(b[i])
  282. i++
  283. for ; i < len(b) && IsWhitespace(b[i]); i++ {
  284. if IsNewline(b[i]) {
  285. newline = true
  286. }
  287. }
  288. if newline {
  289. b[start] = '\n'
  290. } else {
  291. b[start] = ' '
  292. }
  293. if 1 < i-start { // more than one whitespace
  294. if j == 0 {
  295. j = start + 1
  296. } else {
  297. j += copy(b[j:], b[k:start+1])
  298. }
  299. k = i
  300. }
  301. }
  302. if i+3 < len(b) && b[i] == '&' {
  303. b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
  304. }
  305. }
  306. if j == 0 {
  307. return b
  308. } else if j == 1 { // only if starts with whitespace
  309. b[k-1] = b[0]
  310. return b[k-1:]
  311. } else if k < len(b) {
  312. j += copy(b[j:], b[k:])
  313. }
  314. return b[:j]
  315. }
  // URLEncodingTable is a charmap for which characters need escaping in the URL
  // encoding scheme. An entry is true when the byte must be escaped: that is
  // every byte except the ASCII letters and digits and the marks ! ' ( ) * - . _ ~
  var URLEncodingTable = func() (table [256]bool) {
  	const unescaped = "0123456789" +
  		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
  		"abcdefghijklmnopqrstuvwxyz" +
  		"!'()*-._~"

  	// Escape everything by default, then clear the bytes that may stay as is.
  	for i := range table {
  		table[i] = true
  	}
  	for i := 0; i < len(unescaped); i++ {
  		table[unescaped[i]] = false
  	}
  	return table
  }()
  // DataURIEncodingTable is a charmap for which characters need escaping in the
  // Data URI encoding scheme. An entry is true when the byte must be escaped.
  //
  // Escaped are the non-printable characters, all non-ASCII bytes, space and
  // %, #, &. IE11 additionally requires encoding of \, [, ], ", <, >, `, {, },
  // |, ^, which is not required by Chrome, Firefox, Opera, Edge, Safari or
  // Yandex; these are escaped as well. To pass the HTML validator, restricted
  // URL characters must be escaped: non-printable characters, space, <, >, #,
  // %, ".
  var DataURIEncodingTable = func() (table [256]bool) {
  	const unescaped = "0123456789" +
  		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
  		"abcdefghijklmnopqrstuvwxyz" +
  		"!$'()*+,-./:;=?@_~"

  	// Escape everything by default, then clear the bytes that may stay as is.
  	for i := range table {
  		table[i] = true
  	}
  	for i := 0; i < len(unescaped); i++ {
  		table[unescaped[i]] = false
  	}
  	return table
  }()
  // EncodeURL encodes bytes using the URL encoding scheme: every byte c for
  // which table[c] is true is replaced by %XX, where XX is the value of c in
  // uppercase hexadecimal. The bytes written for an escape are never examined
  // against table themselves.
  //
  // The encoding is done in b's own memory when its capacity allows; otherwise
  // a larger slice is allocated. When nothing needs escaping b is returned
  // as is.
  func EncodeURL(b []byte, table [256]bool) []byte {
  	// Count first, so that the slice is grown exactly once.
  	escapes := 0
  	for _, c := range b {
  		if table[c] {
  			escapes++
  		}
  	}
  	if escapes == 0 {
  		return b
  	}

  	const upperHex = "0123456789ABCDEF"
  	r := len(b) - 1 // read position: last byte of the input
  	b = append(b, make([]byte, 2*escapes)...)

  	// Fill from the back. The write position w stays ahead of r by two bytes
  	// for every escape still to be written, so no unread byte is overwritten.
  	// Once r catches up with w the remaining bytes are already in place.
  	for w := len(b) - 1; r < w; r-- {
  		c := b[r]
  		if !table[c] {
  			b[w] = c
  			w--
  			continue
  		}
  		b[w-2] = '%'
  		b[w-1] = upperHex[c>>4]
  		b[w] = upperHex[c&15]
  		w -= 3
  	}
  	return b
  }
  // DecodeURL decodes an URL encoded using the URL encoding scheme. Every %XX
  // with two hexadecimal digits (either case) and a value below 128 is replaced
  // by the byte it stands for, and every literal + becomes a space. Escapes that
  // are incomplete, malformed or have a value of 128 and above are left as
  // written. A decoded byte is never decoded again.
  //
  // The decoding is done in place in a single pass; the returned slice shares
  // its memory with b and may be shorter.
  func DecodeURL(b []byte) []byte {
  	w := 0 // write position; never ahead of the read position r
  	for r := 0; r < len(b); r++ {
  		c := b[r]
  		if c == '+' {
  			c = ' '
  		} else if c == '%' && r+2 < len(b) {
  			v, valid := 0, true
  			for _, d := range b[r+1 : r+3] {
  				switch {
  				case '0' <= d && d <= '9':
  					v = v<<4 + int(d-'0')
  				case 'A' <= d && d <= 'F':
  					v = v<<4 + int(d-'A') + 10
  				case 'a' <= d && d <= 'f':
  					v = v<<4 + int(d-'a') + 10
  				default:
  					valid = false
  				}
  			}
  			if valid && v < 128 {
  				c = byte(v)
  				r += 2 // the two digits are consumed
  			}
  		}
  		b[w] = c
  		w++
  	}
  	return b[:w]
  }