buffer.go 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. package html
  2. import (
  3. "github.com/tdewolff/parse/v2"
  4. "github.com/tdewolff/parse/v2/html"
  5. )
  6. // Token is a single token unit with an attribute value (if given) and hash of the data.
  7. type Token struct {
  8. html.TokenType
  9. Hash Hash
  10. Data []byte
  11. Text []byte
  12. AttrVal []byte
  13. Traits traits
  14. Offset int
  15. HasTemplate bool
  16. }
  17. // TokenBuffer is a buffer that allows for token look-ahead.
  18. type TokenBuffer struct {
  19. r *parse.Input
  20. l *html.Lexer
  21. buf []Token
  22. pos int
  23. attrBuffer []*Token
  24. }
  25. // NewTokenBuffer returns a new TokenBuffer.
  26. func NewTokenBuffer(r *parse.Input, l *html.Lexer) *TokenBuffer {
  27. return &TokenBuffer{
  28. r: r,
  29. l: l,
  30. buf: make([]Token, 0, 8),
  31. }
  32. }
  33. func (z *TokenBuffer) read(t *Token) {
  34. t.Offset = z.r.Offset()
  35. t.TokenType, t.Data = z.l.Next()
  36. t.Text = z.l.Text()
  37. t.HasTemplate = z.l.HasTemplate()
  38. if t.TokenType == html.AttributeToken {
  39. t.Offset += 1 + len(t.Text) + 1
  40. t.AttrVal = z.l.AttrVal()
  41. if 1 < len(t.AttrVal) && (t.AttrVal[0] == '"' || t.AttrVal[0] == '\'') {
  42. t.Offset++
  43. t.AttrVal = t.AttrVal[1 : len(t.AttrVal)-1] // quotes will be readded in attribute loop if necessary
  44. }
  45. t.Hash = ToHash(t.Text)
  46. t.Traits = attrMap[t.Hash]
  47. } else if t.TokenType == html.StartTagToken || t.TokenType == html.EndTagToken {
  48. t.AttrVal = nil
  49. t.Hash = ToHash(t.Text)
  50. t.Traits = tagMap[t.Hash] // zero if not exist
  51. } else {
  52. t.AttrVal = nil
  53. t.Hash = 0
  54. t.Traits = 0
  55. }
  56. }
  57. // Peek returns the ith element and possibly does an allocation.
  58. // Peeking past an error will panic.
  59. func (z *TokenBuffer) Peek(pos int) *Token {
  60. pos += z.pos
  61. if pos >= len(z.buf) {
  62. if len(z.buf) > 0 && z.buf[len(z.buf)-1].TokenType == html.ErrorToken {
  63. return &z.buf[len(z.buf)-1]
  64. }
  65. c := cap(z.buf)
  66. d := len(z.buf) - z.pos
  67. p := pos - z.pos + 1 // required peek length
  68. var buf []Token
  69. if 2*p > c {
  70. buf = make([]Token, 0, 2*c+p)
  71. } else {
  72. buf = z.buf
  73. }
  74. copy(buf[:d], z.buf[z.pos:])
  75. buf = buf[:p]
  76. pos -= z.pos
  77. for i := d; i < p; i++ {
  78. z.read(&buf[i])
  79. if buf[i].TokenType == html.ErrorToken {
  80. buf = buf[:i+1]
  81. pos = i
  82. break
  83. }
  84. }
  85. z.pos, z.buf = 0, buf
  86. }
  87. return &z.buf[pos]
  88. }
  89. // Shift returns the first element and advances position.
  90. func (z *TokenBuffer) Shift() *Token {
  91. if z.pos >= len(z.buf) {
  92. t := &z.buf[:1][0]
  93. z.read(t)
  94. return t
  95. }
  96. t := &z.buf[z.pos]
  97. z.pos++
  98. return t
  99. }
  100. // Attributes extracts the gives attribute hashes from a tag.
  101. // It returns in the same order pointers to the requested token data or nil.
  102. func (z *TokenBuffer) Attributes(hashes ...Hash) []*Token {
  103. n := 0
  104. for {
  105. if t := z.Peek(n); t.TokenType != html.AttributeToken {
  106. break
  107. }
  108. n++
  109. }
  110. if len(hashes) > cap(z.attrBuffer) {
  111. z.attrBuffer = make([]*Token, len(hashes))
  112. } else {
  113. z.attrBuffer = z.attrBuffer[:len(hashes)]
  114. for i := range z.attrBuffer {
  115. z.attrBuffer[i] = nil
  116. }
  117. }
  118. for i := z.pos; i < z.pos+n; i++ {
  119. attr := &z.buf[i]
  120. for j, hash := range hashes {
  121. if hash == attr.Hash {
  122. z.attrBuffer[j] = attr
  123. }
  124. }
  125. }
  126. return z.attrBuffer
  127. }