  1. // Package xml is an XML1.0 lexer following the specifications at http://www.w3.org/TR/xml/.
  2. package xml
  3. import (
  4. "strconv"
  5. "github.com/tdewolff/parse/v2"
  6. )
// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken                // <!-- ... -->
	DOCTYPEToken                // <!DOCTYPE ... >
	CDATAToken                  // <![CDATA[ ... ]]>
	StartTagToken               // <name
	StartTagPIToken             // <?name (processing instruction)
	StartTagCloseToken          // >
	StartTagCloseVoidToken      // />
	StartTagClosePIToken        // ?>
	EndTagToken                 // </name>
	AttributeToken              // name="value"
	TextToken                   // character data between tags
)
  24. // String returns the string representation of a TokenType.
  25. func (tt TokenType) String() string {
  26. switch tt {
  27. case ErrorToken:
  28. return "Error"
  29. case CommentToken:
  30. return "Comment"
  31. case DOCTYPEToken:
  32. return "DOCTYPE"
  33. case CDATAToken:
  34. return "CDATA"
  35. case StartTagToken:
  36. return "StartTag"
  37. case StartTagPIToken:
  38. return "StartTagPI"
  39. case StartTagCloseToken:
  40. return "StartTagClose"
  41. case StartTagCloseVoidToken:
  42. return "StartTagCloseVoid"
  43. case StartTagClosePIToken:
  44. return "StartTagClosePI"
  45. case EndTagToken:
  46. return "EndTag"
  47. case AttributeToken:
  48. return "Attribute"
  49. case TextToken:
  50. return "Text"
  51. }
  52. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  53. }
  54. ////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
	r       *parse.Input
	err     error  // sticky lexer error (e.g. unexpected NUL); see Err
	inTag   bool   // true while positioned between a tag name and its closing '>'
	text    []byte // textual content of the last token; see Text
	attrVal []byte // raw value (incl. quotes) of the last AttributeToken; see AttrVal
}
  63. // NewLexer returns a new Lexer for a given io.Reader.
  64. func NewLexer(r *parse.Input) *Lexer {
  65. return &Lexer{
  66. r: r,
  67. }
  68. }
  69. // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
  70. func (l *Lexer) Err() error {
  71. if l.err != nil {
  72. return l.err
  73. }
  74. return l.r.Err()
  75. }
// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
func (l *Lexer) Text() []byte {
	return l.text
}
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// The returned bytes include the surrounding quotes when the value was quoted.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	var c byte
	if l.inTag {
		// Inside a start tag: emit attribute tokens until the closing
		// delimiter ('>', '/>' or '?>') is reached.
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 {
			// Peek yields 0 both at EOF and for a literal NUL byte;
			// only the latter is reported as a lexer error here.
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return ErrorToken, nil
		} else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') {
			// Not at a tag-closing delimiter: the next bytes form an attribute.
			return AttributeToken, l.shiftAttribute()
		}
		// Tag is closing: discard the whitespace consumed above so the
		// close token's lexeme contains only the delimiter itself.
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagCloseVoidToken, l.r.Shift()
		} else if c == '?' {
			l.r.Move(2)
			return StartTagClosePIToken, l.r.Shift()
		} else {
			l.r.Move(1)
			return StartTagCloseToken, l.r.Shift()
		}
	}
	// Outside a tag: accumulate character data until markup or EOF.
	for {
		c = l.r.Peek(0)
		if c == '<' {
			if l.r.Pos() > 0 {
				// Flush the character data gathered before this markup.
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			c = l.r.Peek(1)
			if c == '/' {
				l.r.Move(2)
				return EndTagToken, l.shiftEndTag()
			} else if c == '!' {
				l.r.Move(2)
				if l.at('-', '-') {
					l.r.Move(2)
					return CommentToken, l.shiftCommentText()
				} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
					l.r.Move(7)
					return CDATAToken, l.shiftCDATAText()
				} else if l.at('D', 'O', 'C', 'T', 'Y', 'P', 'E') {
					l.r.Move(7)
					return DOCTYPEToken, l.shiftDOCTYPEText()
				}
				// "<!" with no recognized construct: back up and fall
				// through so '<' is treated as an ordinary start tag.
				l.r.Move(-2)
			} else if c == '?' {
				l.r.Move(2)
				l.inTag = true
				return StartTagPIToken, l.shiftStartTag()
			}
			l.r.Move(1)
			l.inTag = true
			return StartTagToken, l.shiftStartTag()
		} else if c == 0 {
			if l.r.Pos() > 0 {
				// EOF (or NUL) terminates pending character data first.
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
  163. ////////////////////////////////////////////////////////////////
  164. // The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html
  165. func (l *Lexer) shiftDOCTYPEText() []byte {
  166. inString := false
  167. inBrackets := false
  168. for {
  169. c := l.r.Peek(0)
  170. if c == '"' {
  171. inString = !inString
  172. } else if (c == '[' || c == ']') && !inString {
  173. inBrackets = (c == '[')
  174. } else if c == '>' && !inString && !inBrackets {
  175. l.text = l.r.Lexeme()[9:]
  176. l.r.Move(1)
  177. return l.r.Shift()
  178. } else if c == 0 {
  179. l.text = l.r.Lexeme()[9:]
  180. return l.r.Shift()
  181. }
  182. l.r.Move(1)
  183. }
  184. }
  185. func (l *Lexer) shiftCDATAText() []byte {
  186. for {
  187. c := l.r.Peek(0)
  188. if c == ']' && l.r.Peek(1) == ']' && l.r.Peek(2) == '>' {
  189. l.text = l.r.Lexeme()[9:]
  190. l.r.Move(3)
  191. return l.r.Shift()
  192. } else if c == 0 {
  193. l.text = l.r.Lexeme()[9:]
  194. return l.r.Shift()
  195. }
  196. l.r.Move(1)
  197. }
  198. }
  199. func (l *Lexer) shiftCommentText() []byte {
  200. for {
  201. c := l.r.Peek(0)
  202. if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
  203. l.text = l.r.Lexeme()[4:]
  204. l.r.Move(3)
  205. return l.r.Shift()
  206. } else if c == 0 {
  207. return l.r.Shift()
  208. }
  209. l.r.Move(1)
  210. }
  211. }
  212. func (l *Lexer) shiftStartTag() []byte {
  213. nameStart := l.r.Pos()
  214. for {
  215. if c := l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
  216. break
  217. }
  218. l.r.Move(1)
  219. }
  220. l.text = l.r.Lexeme()[nameStart:]
  221. return l.r.Shift()
  222. }
// shiftAttribute consumes one attribute, optionally with an '='-separated
// value, and returns the full lexeme. l.text is set to the attribute name
// and l.attrVal to the raw value (including surrounding quotes), or nil
// when the attribute has no value.
func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			l.r.Move(1)
			continue
		}
		break
	}
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c = l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if c == 0 {
					// EOF inside a quoted value: stop without the closing quote.
					break
				}
				l.r.Move(1)
				if c == '\t' || c == '\n' || c == '\r' {
					// Normalize whitespace inside quoted values to a
					// space, mutating the input buffer in place.
					l.r.Lexeme()[l.r.Pos()-1] = ' '
				}
			}
		} else { // attribute value unquoted state
			for {
				if c = l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		// No '=': rewind past the whitespace consumed after the name so
		// it is not included in this attribute's lexeme.
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	return l.r.Shift()
}
  282. func (l *Lexer) shiftEndTag() []byte {
  283. for {
  284. c := l.r.Peek(0)
  285. if c == '>' {
  286. l.text = l.r.Lexeme()[2:]
  287. l.r.Move(1)
  288. break
  289. } else if c == 0 {
  290. l.text = l.r.Lexeme()[2:]
  291. break
  292. }
  293. l.r.Move(1)
  294. }
  295. end := len(l.text)
  296. for end > 0 {
  297. if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
  298. end--
  299. continue
  300. }
  301. break
  302. }
  303. l.text = l.text[:end]
  304. return l.r.Shift()
  305. }
  306. ////////////////////////////////////////////////////////////////
  307. func (l *Lexer) at(b ...byte) bool {
  308. for i, c := range b {
  309. if l.r.Peek(i) != c {
  310. return false
  311. }
  312. }
  313. return true
  314. }