123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481 |
- package parse
- import (
- "bytes"
- "fmt"
- "strconv"
- "unicode"
- )
// Copy returns a newly allocated slice holding the same bytes as src.
// The result never shares memory with src and is non-nil even for empty input.
func Copy(src []byte) (dst []byte) {
	dst = append(make([]byte, 0, len(src)), src...)
	return dst
}
// ToLower lowercases the ASCII letters A-Z of src in place and returns src.
// All other bytes are left untouched.
func ToLower(src []byte) []byte {
	const caseOffset = 'a' - 'A'
	for i := range src {
		if 'A' <= src[i] && src[i] <= 'Z' {
			src[i] += caseOffset
		}
	}
	return src
}
// EqualFold reports whether s equals targetLower when the ASCII letters A-Z
// in s are folded to lowercase. targetLower must already be lowercase; it is
// compared as-is.
func EqualFold(s, targetLower []byte) bool {
	if len(s) != len(targetLower) {
		return false
	}
	for i := range s {
		got, want := s[i], targetLower[i]
		if got == want {
			continue
		}
		// The only other accepted match is an uppercase letter whose
		// lowercase form equals the target byte.
		isUpper := 'A' <= got && got <= 'Z'
		if !isUpper || got+('a'-'A') != want {
			return false
		}
	}
	return true
}
// Printable returns a human-readable representation of r: the character
// itself when it is graphic, a 0xNN hexadecimal byte for other runes below
// 128, and the U+NNNN notation for everything else.
func Printable(r rune) string {
	switch {
	case unicode.IsGraphic(r):
		return fmt.Sprintf("%c", r)
	case r < 128:
		return fmt.Sprintf("0x%02X", r)
	default:
		return fmt.Sprintf("%U", r)
	}
}
// whitespaceTable marks the bytes treated as whitespace. Entries that are not
// listed default to false.
var whitespaceTable = [256]bool{
	'\t': true, // tab
	'\n': true, // new line
	'\f': true, // form feed
	'\r': true, // carriage return
	' ':  true, // space
}

// IsWhitespace returns true for space, \n, \r, \t, \f.
func IsWhitespace(c byte) bool {
	return whitespaceTable[c]
}
// newlineTable marks the bytes treated as line breaks. Entries that are not
// listed default to false.
var newlineTable = [256]bool{
	'\n': true, // new line
	'\r': true, // carriage return
}

// IsNewline returns true for \n, \r.
func IsNewline(c byte) bool {
	return newlineTable[c]
}
- // IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
- func IsAllWhitespace(b []byte) bool {
- for _, c := range b {
- if !IsWhitespace(c) {
- return false
- }
- }
- return true
- }
- // TrimWhitespace removes any leading and trailing whitespace characters.
- func TrimWhitespace(b []byte) []byte {
- n := len(b)
- start := n
- for i := 0; i < n; i++ {
- if !IsWhitespace(b[i]) {
- start = i
- break
- }
- }
- end := n
- for i := n - 1; i >= start; i-- {
- if !IsWhitespace(b[i]) {
- end = i + 1
- break
- }
- }
- return b[start:end]
- }
- // ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
- func ReplaceMultipleWhitespace(b []byte) []byte {
- j, k := 0, 0 // j is write position, k is start of next text section
- for i := 0; i < len(b); i++ {
- if IsWhitespace(b[i]) {
- start := i
- newline := IsNewline(b[i])
- i++
- for ; i < len(b) && IsWhitespace(b[i]); i++ {
- if IsNewline(b[i]) {
- newline = true
- }
- }
- if newline {
- b[start] = '\n'
- } else {
- b[start] = ' '
- }
- if 1 < i-start { // more than one whitespace
- if j == 0 {
- j = start + 1
- } else {
- j += copy(b[j:], b[k:start+1])
- }
- k = i
- }
- }
- }
- if j == 0 {
- return b
- } else if j == 1 { // only if starts with whitespace
- b[k-1] = b[0]
- return b[k-1:]
- } else if k < len(b) {
- j += copy(b[j:], b[k:])
- }
- return b[:j]
- }
// replaceEntities rewrites the entity that starts at b[i] in place, assuming
// that b[i] == '&' and that i+3 < len(b).
//
// Numeric references (&#NN; and &#xHH;) below 128 are replaced by the byte
// they denote; hexadecimal references from 128 up to 9999 are rewritten in
// decimal form; anything larger is left untouched. Named entities are looked
// up in entitiesMap. When the replacement is a single byte that has an entry
// in revEntitiesMap, that entry is written instead (or the input is kept when
// it already equals that entry).
//
// It returns the possibly shortened slice and the index of the last byte that
// was handled, so that the caller can safely do i++ to continue without
// missing any entities.
func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
	const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral
	const maxHexValue = 10000  // hexadecimal references at or above this value are not rewritten
	var r []byte
	j := i + 1
	if b[j] == '#' {
		j++
		if b[j] == 'x' {
			j++
			c := 0
			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
				if maxHexValue <= c {
					// Already out of range: keep consuming digits but stop
					// accumulating, otherwise a long digit string overflows c
					// and may wrap around to a small, seemingly valid value.
					continue
				}
				if b[j] <= '9' {
					c = c<<4 + int(b[j]-'0')
				} else if b[j] <= 'F' {
					c = c<<4 + int(b[j]-'A') + 10
				} else if b[j] <= 'f' {
					c = c<<4 + int(b[j]-'a') + 10
				}
			}
			if j <= i+3 || maxHexValue <= c {
				// No digits at all, or a value too large to rewrite.
				return b, j - 1
			}
			if c < 128 {
				r = []byte{byte(c)}
			} else {
				// Non-ASCII values stay a reference, rewritten in decimal form.
				r = append(r, '&', '#')
				r = strconv.AppendInt(r, int64(c), 10)
				r = append(r, ';')
			}
		} else {
			c := 0
			// The c < 128 condition stops the scan as soon as the value is out of range.
			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
				c = c*10 + int(b[j]-'0')
			}
			if j <= i+2 || 128 <= c {
				return b, j - 1
			}
			r = []byte{byte(c)}
		}
	} else {
		// Named entity: scan up to the semicolon, bounded by the longest known name.
		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
		}
		if j <= i+1 || len(b) <= j {
			return b, j - 1
		}

		var ok bool
		r, ok = entitiesMap[string(b[i+1:j])]
		if !ok {
			return b, j
		}
	}

	// j is at semicolon
	n := j + 1 - i // length of the entity including '&' and ';'
	if j < len(b) && b[j] == ';' && 2 < n {
		if len(r) == 1 {
			if q, ok := revEntitiesMap[r[0]]; ok {
				if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
					// The input is already the preferred spelling.
					return b, j
				}
				r = q
			} else if r[0] == '&' {
				// check if for example &amp; is followed by something that could potentially be an entity
				k := j + 1
				if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') {
					return b, k
				}
			}
		}

		// Write the replacement over the entity and close the gap behind it.
		copy(b[i:], r)
		copy(b[i+len(r):], b[j+1:])
		b = b[:len(b)-n+len(r)]
		return b, i + len(r) - 1
	}
	return b, i
}
- // ReplaceEntities replaces all occurrences of entites (such as ") to their respective unencoded bytes.
- func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
- for i := 0; i < len(b); i++ {
- if b[i] == '&' && i+3 < len(b) {
- b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
- }
- }
- return b
- }
- // ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
- func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
- j, k := 0, 0 // j is write position, k is start of next text section
- for i := 0; i < len(b); i++ {
- if IsWhitespace(b[i]) {
- start := i
- newline := IsNewline(b[i])
- i++
- for ; i < len(b) && IsWhitespace(b[i]); i++ {
- if IsNewline(b[i]) {
- newline = true
- }
- }
- if newline {
- b[start] = '\n'
- } else {
- b[start] = ' '
- }
- if 1 < i-start { // more than one whitespace
- if j == 0 {
- j = start + 1
- } else {
- j += copy(b[j:], b[k:start+1])
- }
- k = i
- }
- }
- if i+3 < len(b) && b[i] == '&' {
- b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
- }
- }
- if j == 0 {
- return b
- } else if j == 1 { // only if starts with whitespace
- b[k-1] = b[0]
- return b[k-1:]
- } else if k < len(b) {
- j += copy(b[j:], b[k:])
- }
- return b[:j]
- }
// URLEncodingTable is a charmap for which characters need escaping in the URL encoding scheme.
// Flagged are the control characters, space, DEL, every non-ASCII byte and
// the characters " # $ % & + , / : ; < = > ? @ [ \ ] ^ ` { | }.
var URLEncodingTable = func() [256]bool {
	var table [256]bool
	for c := range table {
		// control characters and space, DEL, and non-ASCII
		table[c] = c <= ' ' || 0x7F <= c
	}
	for _, c := range []byte("\"#$%&+,/:;<=>?@[\\]^`{|}") {
		table[c] = true
	}
	return table
}()
// DataURIEncodingTable is a charmap for which characters need escaping in the Data URI encoding scheme.
// Escape only non-printable characters, unicode and %, #, &.
// IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex.
// To pass the HTML validator, restricted URL characters must be escaped: non-printable characters, space, <, >, #, %, ".
var DataURIEncodingTable = func() [256]bool {
	var table [256]bool
	for c := range table {
		// control characters and space, DEL, and non-ASCII
		table[c] = c <= ' ' || 0x7F <= c
	}
	for _, c := range []byte("\"#%&<>[\\]^`{|}") {
		table[c] = true
	}
	return table
}()
// EncodeURL encodes bytes using the URL encoding scheme: every byte c for
// which table[c] is true is replaced by %XY, where XY is the uppercase
// hexadecimal value of c. The escape sequences themselves are never
// re-examined, so the table may flag any byte, including '%' and hex digits.
//
// The result is built by appending to b, so it reuses b's backing array when
// that has enough capacity. b is returned unchanged when nothing needs escaping.
func EncodeURL(b []byte, table [256]bool) []byte {
	const hexDigits = "0123456789ABCDEF"

	escapes := 0
	for _, c := range b {
		if table[c] {
			escapes++
		}
	}
	if escapes == 0 {
		return b
	}

	// Grow once by two bytes per escape, then fill from the back. The write
	// position never falls behind the read position, so no unread byte is
	// overwritten.
	src := len(b) - 1
	b = append(b, make([]byte, 2*escapes)...)
	for dst := len(b) - 1; 0 <= src; src-- {
		c := b[src]
		if table[c] {
			b[dst-2], b[dst-1], b[dst] = '%', hexDigits[c>>4], hexDigits[c&15]
			dst -= 3
		} else {
			b[dst] = c
			dst--
		}
	}
	return b
}
// DecodeURL decodes a URL that was encoded using the URL encoding scheme. It
// works in place: a %XY sequence whose value is below 128 becomes that single
// byte and every '+' becomes a space. Malformed sequences and sequences that
// encode values of 128 and above are left as they are.
func DecodeURL(b []byte) []byte {
	// hexValue returns the numeric value of a hexadecimal digit.
	hexValue := func(c byte) (int, bool) {
		switch {
		case '0' <= c && c <= '9':
			return int(c - '0'), true
		case 'a' <= c && c <= 'f':
			return int(c-'a') + 10, true
		case 'A' <= c && c <= 'F':
			return int(c-'A') + 10, true
		}
		return 0, false
	}

	for i := 0; i < len(b); i++ {
		switch {
		case b[i] == '%' && i+2 < len(b):
			hi, hiOK := hexValue(b[i+1])
			lo, loOK := hexValue(b[i+2])
			if !hiOK || !loOK {
				continue
			}
			if value := hi<<4 + lo; value < 128 {
				b[i] = byte(value)
				// Drop the two hex digits by moving the tail over them.
				copy(b[i+1:], b[i+3:])
				b = b[:len(b)-2]
			}
		case b[i] == '+':
			b[i] = ' '
		}
	}
	return b
}
|