lex.go
// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
package html

import (
	"strconv"

	"github.com/tdewolff/parse/v2"
)
// TokenType determines the type of token, e.g. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)
// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	case StartTagToken:
		return "StartTag"
	case StartTagCloseToken:
		return "StartTagClose"
	case StartTagVoidToken:
		return "StartTagVoid"
	case EndTagToken:
		return "EndTag"
	case AttributeToken:
		return "Attribute"
	case TextToken:
		return "Text"
	case SvgToken:
		return "Svg"
	case MathToken:
		return "Math"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}
////////////////////////////////////////////////////////////////

// Template delimiter pairs for common templating languages, for use with NewTemplateLexer.
var GoTemplate = [2]string{"{{", "}}"}
var HandlebarsTemplate = [2]string{"{{", "}}"}
var MustacheTemplate = [2]string{"{{", "}}"}
var EJSTemplate = [2]string{"<%", "%>"}
var ASPTemplate = [2]string{"<%", "%>"}
var PHPTemplate = [2]string{"<?", "?>"}
// Lexer is the state for the lexer.
type Lexer struct {
	r         *parse.Input
	tmplBegin []byte
	tmplEnd   []byte
	err       error

	rawTag  Hash
	inTag   bool
	text    []byte
	attrVal []byte
	hasTmpl bool
}
// NewLexer returns a new Lexer for a given parse.Input.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// NewTemplateLexer returns a new Lexer for a given parse.Input that additionally recognizes the given template delimiters, such as GoTemplate or PHPTemplate.
func NewTemplateLexer(r *parse.Input, tmpl [2]string) *Lexer {
	return &Lexer{
		r:         r,
		tmplBegin: []byte(tmpl[0]),
		tmplEnd:   []byte(tmpl[1]),
	}
}
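
// A minimal usage sketch (not part of the original source): lex a document by
// calling Next in a loop until it returns ErrorToken, then inspect Err.
// parse.NewInputString comes from github.com/tdewolff/parse/v2; on normal
// termination Err reports io.EOF.
//
//	l := NewLexer(parse.NewInputString(`<span class="user">John</span>`))
//	for {
//		tt, data := l.Next()
//		if tt == ErrorToken {
//			// l.Err() is io.EOF when the whole input was consumed
//			break
//		}
//		_ = data // raw bytes of the token, e.g. `<span` or `John`
//	}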
// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
func (l *Lexer) Err() error {
	if l.err != nil {
		return l.err
	}
	return l.r.Err()
}

// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
func (l *Lexer) Text() []byte {
	return l.text
}

// AttrKey returns the attribute key when an AttributeToken was returned from Next.
func (l *Lexer) AttrKey() []byte {
	return l.text
}

// AttrVal returns the attribute value when an AttributeToken was returned from Next.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}

// HasTemplate returns true if the token value contains a template.
func (l *Lexer) HasTemplate() bool {
	return l.hasTmpl
}
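
// Template sketch (illustrative, not part of the original source): with a
// template-aware lexer the delimited expression is kept inside the token, and
// HasTemplate reports whether the last token contained one.
//
//	l := NewTemplateLexer(parse.NewInputString(`<a href={{.URL}}>x</a>`), GoTemplate)
//	// After Next returns the AttributeToken for href, l.HasTemplate() is true
//	// and l.AttrVal() holds the raw value including the {{.URL}} expression.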
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	l.hasTmpl = false
	var c byte
	if l.inTag {
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 && l.r.Err() != nil {
			return ErrorToken, nil
		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
			return AttributeToken, l.shiftAttribute()
		}
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagVoidToken, l.r.Shift()
		}
		l.r.Move(1)
		return StartTagCloseToken, l.r.Shift()
	}

	if l.rawTag != 0 {
		if rawText := l.shiftRawText(); 0 < len(rawText) {
			l.text = rawText
			l.rawTag = 0
			return TextToken, rawText
		}
		l.rawTag = 0
	}

	for {
		c = l.r.Peek(0)
		if c == '<' {
			c = l.r.Peek(1)
			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
			if !isEndTag && (c < 'a' || 'z' < c) && (c < 'A' || 'Z' < c) && c != '!' && c != '?' {
				// not a tag
			} else if 0 < l.r.Pos() {
				// return the currently buffered TextToken so that we can return the tag in the next iteration
				l.text = l.r.Shift()
				return TextToken, l.text
			} else if isEndTag {
				l.r.Move(2)
				// only end tags that are not followed by > or EOF arrive here
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					return CommentToken, l.shiftBogusComment()
				}
				return EndTagToken, l.shiftEndTag()
			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				l.r.Move(1)
				l.inTag = true
				return l.shiftStartTag()
			} else if c == '!' {
				l.r.Move(2)
				return l.readMarkup()
			} else if c == '?' {
				l.r.Move(1)
				return CommentToken, l.shiftBogusComment()
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else if c == 0 && l.r.Err() != nil {
			if 0 < l.r.Pos() {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
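
// For orientation (an illustrative sketch, not part of the original source), an
// input such as `<p id="x">Hi</p>` is expected to yield roughly this sequence of
// tokens from Next:
//
//	StartTagToken       `<p`
//	AttributeToken      ` id="x"`   (AttrKey "id", AttrVal `"x"` including quotes)
//	StartTagCloseToken  `>`
//	TextToken           `Hi`
//	EndTagToken         `</p>`
//	ErrorToken          (Err() == io.EOF)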
////////////////////////////////////////////////////////////////

// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html

func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					mark := l.r.Pos()
					l.r.Move(2)
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						l.r.Rewind(mark)
						return l.r.Shift()
					}
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					l.r.Move(4)
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3)
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									inScript = true
								} else {
									if !inScript {
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
				l.r.Move(len(l.tmplBegin))
				l.moveTemplate()
				l.hasTmpl = true
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}
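
// Raw-text sketch (illustrative, not part of the original source): for raw-text
// elements the content is emitted as a single TextToken that stops right before
// the matching end tag, so `<script>if (a<b) x()</script>` is expected to give a
// StartTagToken and StartTagCloseToken for `<script` and `>`, one TextToken
// holding `if (a<b) x()`, and then the EndTagToken for `</script>`.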
func (l *Lexer) readMarkup() (TokenType, []byte) {
	if l.at('-', '-') {
		l.r.Move(2)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[4:]
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(3)
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '!', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(4)
				return CommentToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
		l.r.Move(7)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[9:]
				return TextToken, l.r.Shift()
			} else if l.at(']', ']', '>') {
				l.text = l.r.Lexeme()[9:]
				l.r.Move(3)
				return TextToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else {
		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
			l.r.Move(7)
			if l.r.Peek(0) == ' ' {
				l.r.Move(1)
			}
			for {
				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
					l.text = l.r.Lexeme()[9:]
					if c == '>' {
						l.r.Move(1)
					}
					return DoctypeToken, l.r.Shift()
				}
				l.r.Move(1)
			}
		}
	}
	return CommentToken, l.shiftBogusComment()
}
func (l *Lexer) shiftBogusComment() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			return l.r.Shift()
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}
func (l *Lexer) shiftStartTag() (TokenType, []byte) {
	for {
		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	l.text = parse.ToLower(l.r.Lexeme()[1:])
	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
		if h == Svg || h == Math {
			data := l.shiftXML(h)
			if l.err != nil {
				return ErrorToken, nil
			}
			l.inTag = false
			if h == Svg {
				return SvgToken, data
			}
			return MathToken, data
		}
		l.rawTag = h
	}
	return StartTagToken, l.r.Shift()
}
func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			continue
		}
		break
	}
	nameHasTmpl := l.hasTmpl
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c := l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
					l.r.Move(len(l.tmplBegin))
					l.moveTemplate()
					l.hasTmpl = true
				} else if c == 0 && l.r.Err() != nil {
					break
				} else {
					l.r.Move(1)
				}
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else { // attribute value unquoted state
			for {
				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	if !nameHasTmpl {
		l.text = parse.ToLower(l.text)
	}
	return l.r.Shift()
}
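
// Attribute sketch (illustrative, not part of the original source): the data
// returned for an AttributeToken includes any leading whitespace and the `=`,
// while AttrKey holds the lower-cased name and AttrVal the raw value with its
// surrounding quotes. For ` class="big"` that would be AttrKey() == "class"
// and AttrVal() == `"big"`.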
func (l *Lexer) shiftEndTag() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			break
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			break
		}
		l.r.Move(1)
	}

	end := len(l.text)
	for end > 0 {
		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			end--
			continue
		}
		break
	}
	l.text = l.text[:end]
	return parse.ToLower(l.r.Shift())
}
// shiftXML parses the content of an svg or math tag according to the XML 1.1 specifications, including the tag itself.
// So far we have already parsed `<svg` or `<math`.
func (l *Lexer) shiftXML(rawTag Hash) []byte {
	inQuote := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inQuote = !inQuote
			l.r.Move(1)
		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
			mark := l.r.Pos()
			l.r.Move(2)
			for {
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					break
				}
				l.r.Move(1)
			}
			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
				break
			}
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		} else {
			l.r.Move(1)
		}
	}

	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.r.Move(1)
			break
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		}
		l.r.Move(1)
	}
	return l.r.Shift()
}
// moveTemplate consumes input up to and including the closing template delimiter
// (the opening delimiter has already been consumed by the caller), skipping over
// quoted strings inside the template expression.
func (l *Lexer) moveTemplate() {
	for {
		if c := l.r.Peek(0); l.at(l.tmplEnd...) || c == 0 && l.r.Err() != nil {
			if c != 0 {
				l.r.Move(len(l.tmplEnd))
			}
			break
		} else if c == '"' || c == '\'' {
			l.r.Move(1)
			escape := false
			for {
				if c2 := l.r.Peek(0); !escape && c2 == c || c2 == 0 && l.r.Err() != nil {
					if c2 != 0 {
						l.r.Move(1)
					}
					break
				} else if c2 == '\\' {
					escape = !escape
				} else {
					escape = false
				}
				l.r.Move(1)
			}
		} else {
			l.r.Move(1)
		}
	}
}
////////////////////////////////////////////////////////////////

// at returns true if the upcoming bytes equal b.
func (l *Lexer) at(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c {
			return false
		}
	}
	return true
}

// atCaseInsensitive returns true if the upcoming bytes equal b ignoring ASCII case; b is expected to be lower-case.
func (l *Lexer) atCaseInsensitive(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
			return false
		}
	}
	return true
}