parse.go 8.1 KB


  1. package html
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "strings"
  7. "github.com/tdewolff/parse/v2"
  8. "github.com/tdewolff/parse/v2/css"
  9. )
  10. type AST struct {
  11. Children []*Tag
  12. Text []byte
  13. }
  14. func (ast *AST) String() string {
  15. sb := strings.Builder{}
  16. for i, child := range ast.Children {
  17. if i != 0 {
  18. sb.WriteString("\n")
  19. }
  20. sb.WriteString(child.ASTString())
  21. }
  22. return sb.String()
  23. }
  // Attr is a single attribute of a tag. Val is stored without its surrounding
  // quotes.
  type Attr struct {
  	Key, Val []byte
  }

  // String formats the attribute as key="val". A double quote inside the value
  // is written as &quot; so that the result stays a well-formed attribute even
  // when the value was originally single-quoted and contains double quotes.
  func (attr *Attr) String() string {
  	val := attr.Val
  	if bytes.IndexByte(val, '"') != -1 {
  		val = bytes.ReplaceAll(val, []byte(`"`), []byte("&quot;"))
  	}
  	return fmt.Sprintf(`%s="%s"`, attr.Key, val)
  }
  30. type Tag struct {
  31. Root *AST
  32. Parent *Tag
  33. Prev, Next *Tag
  34. Children []*Tag
  35. Index int
  36. Name []byte
  37. Attrs []Attr
  38. textStart, textEnd int
  39. }
  40. func (tag *Tag) getAttr(key []byte) ([]byte, bool) {
  41. for _, attr := range tag.Attrs {
  42. if bytes.Equal(key, attr.Key) {
  43. return attr.Val, true
  44. }
  45. }
  46. return nil, false
  47. }
  48. func (tag *Tag) GetAttr(key string) (string, bool) {
  49. val, ok := tag.getAttr([]byte(key))
  50. return string(val), ok
  51. }
  52. func (tag *Tag) Text() string {
  53. return string(tag.Root.Text[tag.textStart:tag.textEnd])
  54. }
  55. func (tag *Tag) String() string {
  56. sb := strings.Builder{}
  57. sb.WriteString("<")
  58. sb.Write(tag.Name)
  59. for _, attr := range tag.Attrs {
  60. sb.WriteString(" ")
  61. sb.WriteString(attr.String())
  62. }
  63. sb.WriteString(">")
  64. return sb.String()
  65. }
  66. func (tag *Tag) ASTString() string {
  67. sb := strings.Builder{}
  68. sb.WriteString(tag.String())
  69. for _, child := range tag.Children {
  70. sb.WriteString("\n ")
  71. s := child.ASTString()
  72. s = strings.ReplaceAll(s, "\n", "\n ")
  73. sb.WriteString(s)
  74. }
  75. return sb.String()
  76. }
  77. func Parse(r *parse.Input) (*AST, error) {
  78. ast := &AST{}
  79. root := &Tag{}
  80. cur := root
  81. l := NewLexer(r)
  82. for {
  83. tt, data := l.Next()
  84. switch tt {
  85. case ErrorToken:
  86. if err := l.Err(); err != io.EOF {
  87. return nil, err
  88. }
  89. ast.Children = root.Children
  90. return ast, nil
  91. case TextToken:
  92. ast.Text = append(ast.Text, data...)
  93. case StartTagToken:
  94. child := &Tag{
  95. Root: ast,
  96. Parent: cur,
  97. Index: len(cur.Children),
  98. Name: l.Text(),
  99. textStart: len(ast.Text),
  100. }
  101. if 0 < len(cur.Children) {
  102. child.Prev = cur.Children[len(cur.Children)-1]
  103. child.Prev.Next = child
  104. }
  105. cur.Children = append(cur.Children, child)
  106. cur = child
  107. case AttributeToken:
  108. val := l.AttrVal()
  109. if 0 < len(val) && (val[0] == '"' || val[0] == '\'') {
  110. val = val[1 : len(val)-1]
  111. }
  112. cur.Attrs = append(cur.Attrs, Attr{l.AttrKey(), val})
  113. case StartTagCloseToken:
  114. if voidTags[string(cur.Name)] {
  115. cur.textEnd = len(ast.Text)
  116. cur = cur.Parent
  117. }
  118. case EndTagToken, StartTagVoidToken:
  119. start := cur
  120. for start != root && !bytes.Equal(l.Text(), start.Name) {
  121. start = start.Parent
  122. }
  123. if start == root {
  124. // ignore
  125. } else {
  126. parent := start.Parent
  127. for cur != parent {
  128. cur.textEnd = len(ast.Text)
  129. cur = cur.Parent
  130. }
  131. }
  132. }
  133. }
  134. }
  135. func (ast *AST) Query(s string) (*Tag, error) {
  136. sel, err := ParseSelector(s)
  137. if err != nil {
  138. return nil, err
  139. }
  140. for _, child := range ast.Children {
  141. if match := child.query(sel); match != nil {
  142. return match, nil
  143. }
  144. }
  145. return nil, nil
  146. }
  147. func (tag *Tag) query(sel selector) *Tag {
  148. if sel.AppliesTo(tag) {
  149. return tag
  150. }
  151. for _, child := range tag.Children {
  152. if match := child.query(sel); match != nil {
  153. return match
  154. }
  155. }
  156. return nil
  157. }
  158. func (ast *AST) QueryAll(s string) ([]*Tag, error) {
  159. sel, err := ParseSelector(s)
  160. if err != nil {
  161. return nil, err
  162. }
  163. matches := []*Tag{}
  164. for _, child := range ast.Children {
  165. child.queryAll(&matches, sel)
  166. }
  167. return matches, nil
  168. }
  169. func (tag *Tag) queryAll(matches *[]*Tag, sel selector) {
  170. if sel.AppliesTo(tag) {
  171. *matches = append(*matches, tag)
  172. }
  173. for _, child := range tag.Children {
  174. child.queryAll(matches, sel)
  175. }
  176. }
  // attrSelector is a single attribute condition of a selector.
  type attrSelector struct {
  	// op is the comparison to perform: 0 only requires the attribute to be
  	// present, the others are '=', '~' and '|'.
  	op byte

  	// attr is the attribute name and val the value it is compared against.
  	attr []byte
  	val  []byte
  }
  182. func (sel attrSelector) AppliesTo(tag *Tag) bool {
  183. val, ok := tag.getAttr(sel.attr)
  184. if !ok {
  185. return false
  186. }
  187. switch sel.op {
  188. case 0:
  189. return true
  190. case '=':
  191. return bytes.Equal(val, sel.val)
  192. case '~':
  193. if 0 < len(sel.val) {
  194. vals := bytes.Split(val, []byte(" "))
  195. for _, val := range vals {
  196. if bytes.Equal(val, sel.val) {
  197. return true
  198. }
  199. }
  200. }
  201. case '|':
  202. return bytes.Equal(val, sel.val) || bytes.HasPrefix(val, append(sel.val, '-'))
  203. }
  204. return false
  205. }
  206. func (attr attrSelector) String() string {
  207. sb := strings.Builder{}
  208. sb.Write(attr.attr)
  209. if attr.op != 0 {
  210. sb.WriteByte(attr.op)
  211. if attr.op != '=' {
  212. sb.WriteByte('=')
  213. }
  214. sb.WriteByte('"')
  215. sb.Write(attr.val)
  216. sb.WriteByte('"')
  217. }
  218. return sb.String()
  219. }
  220. type selectorNode struct {
  221. typ []byte // is * for universal
  222. attrs []attrSelector
  223. op byte // space or >, last is NULL
  224. }
  225. func (sel selectorNode) AppliesTo(tag *Tag) bool {
  226. if 0 < len(sel.typ) && !bytes.Equal(sel.typ, []byte("*")) && !bytes.Equal(sel.typ, tag.Name) {
  227. return false
  228. }
  229. for _, attr := range sel.attrs {
  230. if !attr.AppliesTo(tag) {
  231. return false
  232. }
  233. }
  234. return true
  235. }
  236. func (sel selectorNode) String() string {
  237. sb := strings.Builder{}
  238. sb.Write(sel.typ)
  239. for _, attr := range sel.attrs {
  240. if bytes.Equal(attr.attr, []byte("id")) && attr.op == '=' {
  241. sb.WriteByte('#')
  242. sb.Write(attr.val)
  243. } else if bytes.Equal(attr.attr, []byte("class")) && attr.op == '~' {
  244. sb.WriteByte('.')
  245. sb.Write(attr.val)
  246. } else {
  247. sb.WriteByte('[')
  248. sb.WriteString(attr.String())
  249. sb.WriteByte(']')
  250. }
  251. }
  252. if sel.op != 0 {
  253. sb.WriteByte(' ')
  254. sb.WriteByte(sel.op)
  255. sb.WriteByte(' ')
  256. }
  257. return sb.String()
  258. }
  259. type token struct {
  260. tt css.TokenType
  261. data []byte
  262. }
  263. type selector []selectorNode
  264. func ParseSelector(s string) (selector, error) {
  265. ts := []token{}
  266. l := css.NewLexer(parse.NewInputString(s))
  267. for {
  268. tt, data := l.Next()
  269. if tt == css.ErrorToken {
  270. if err := l.Err(); err != io.EOF {
  271. return selector{}, err
  272. }
  273. break
  274. }
  275. ts = append(ts, token{
  276. tt: tt,
  277. data: data,
  278. })
  279. }
  280. sel := selector{}
  281. node := selectorNode{}
  282. for i := 0; i < len(ts); i++ {
  283. t := ts[i]
  284. if 0 < i && (t.tt == css.WhitespaceToken || t.tt == css.DelimToken && t.data[0] == '>') {
  285. if t.tt == css.DelimToken {
  286. node.op = '>'
  287. } else {
  288. node.op = ' '
  289. }
  290. sel = append(sel, node)
  291. node = selectorNode{}
  292. } else if t.tt == css.IdentToken || t.tt == css.DelimToken && t.data[0] == '*' {
  293. node.typ = t.data
  294. } else if t.tt == css.DelimToken && (t.data[0] == '.' || t.data[0] == '#') && i+1 < len(ts) && ts[i+1].tt == css.IdentToken {
  295. if t.data[0] == '#' {
  296. node.attrs = append(node.attrs, attrSelector{op: '=', attr: []byte("id"), val: ts[i+1].data})
  297. } else {
  298. node.attrs = append(node.attrs, attrSelector{op: '~', attr: []byte("class"), val: ts[i+1].data})
  299. }
  300. i++
  301. } else if t.tt == css.DelimToken && t.data[0] == '[' && i+2 < len(ts) && ts[i+1].tt == css.IdentToken && ts[i+2].tt == css.DelimToken {
  302. if ts[i+2].data[0] == ']' {
  303. node.attrs = append(node.attrs, attrSelector{op: 0, attr: ts[i+1].data})
  304. i += 2
  305. } else if i+4 < len(ts) && ts[i+3].tt == css.IdentToken && ts[i+4].tt == css.DelimToken && ts[i+4].data[0] == ']' {
  306. node.attrs = append(node.attrs, attrSelector{op: ts[i+2].data[0], attr: ts[i+1].data, val: ts[i+3].data})
  307. i += 4
  308. }
  309. }
  310. }
  311. sel = append(sel, node)
  312. return sel, nil
  313. }
  314. func (sels selector) AppliesTo(tag *Tag) bool {
  315. if len(sels) == 0 {
  316. return true
  317. } else if !sels[len(sels)-1].AppliesTo(tag) {
  318. return false
  319. }
  320. tag = tag.Parent
  321. isel := len(sels) - 2
  322. for 0 <= isel && tag != nil {
  323. switch sels[isel].op {
  324. case ' ':
  325. for tag != nil {
  326. if sels[isel].AppliesTo(tag) {
  327. break
  328. }
  329. tag = tag.Parent
  330. }
  331. case '>':
  332. if !sels[isel].AppliesTo(tag) {
  333. return false
  334. }
  335. tag = tag.Parent
  336. default:
  337. return false
  338. }
  339. isel--
  340. }
  341. return len(sels) != 0 && isel == -1
  342. }
  343. func (sels selector) String() string {
  344. if len(sels) == 0 {
  345. return ""
  346. }
  347. sb := strings.Builder{}
  348. for _, sel := range sels {
  349. sb.WriteString(sel.String())
  350. }
  351. return sb.String()[1:]
  352. }
  // voidTags is the set of tag names that Parse closes as soon as their start
  // tag ends, because they have neither content nor an end tag.
  var voidTags = map[string]bool{
  	"area": true, "base": true, "br": true, "col": true,
  	"embed": true, "hr": true, "img": true, "input": true,
  	"link": true, "meta": true, "source": true, "track": true,
  	"wbr": true,
  }