lex.go

// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
package html

import (
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, e.g. a start tag or an attribute.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	case StartTagToken:
		return "StartTag"
	case StartTagCloseToken:
		return "StartTagClose"
	case StartTagVoidToken:
		return "StartTagVoid"
	case EndTagToken:
		return "EndTag"
	case AttributeToken:
		return "Attribute"
	case TextToken:
		return "Text"
	case SvgToken:
		return "Svg"
	case MathToken:
		return "Math"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Template delimiter pairs for common templating languages.
var GoTemplate = [2]string{"{{", "}}"}
var HandlebarsTemplate = [2]string{"{{", "}}"}
var MustacheTemplate = [2]string{"{{", "}}"}
var EJSTemplate = [2]string{"<%", "%>"}
var ASPTemplate = [2]string{"<%", "%>"}
var PHPTemplate = [2]string{"<?", "?>"}

// Lexer is the state for the lexer.
type Lexer struct {
	r         *parse.Input
	tmplBegin []byte
	tmplEnd   []byte
	err       error
	rawTag    Hash
	inTag     bool
	text      []byte
	attrVal   []byte
	hasTmpl   bool
}

// NewLexer returns a new Lexer for a given io.Reader.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// NewTemplateLexer returns a new Lexer for a given io.Reader that also recognizes the given template delimiters, such as GoTemplate.
func NewTemplateLexer(r *parse.Input, tmpl [2]string) *Lexer {
	return &Lexer{
		r:         r,
		tmplBegin: []byte(tmpl[0]),
		tmplEnd:   []byte(tmpl[1]),
	}
}
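
// exampleLex is a minimal usage sketch of the basic lexing loop: call Next
// until ErrorToken, then inspect Err, which is io.EOF once the whole input was
// consumed. Illustration only; parse.NewInputString is assumed to be available
// from github.com/tdewolff/parse/v2.
func exampleLex(src string) []TokenType {
	l := NewLexer(parse.NewInputString(src))
	var types []TokenType
	for {
		tt, _ := l.Next()
		if tt == ErrorToken {
			// Err() distinguishes normal EOF from real lexing errors.
			break
		}
		types = append(types, tt)
	}
	return types
}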

// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
func (l *Lexer) Err() error {
	if l.err != nil {
		return l.err
	}
	return l.r.Err()
}

// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
func (l *Lexer) Text() []byte {
	return l.text
}

// AttrKey returns the attribute key when an AttributeToken was returned from Next.
func (l *Lexer) AttrKey() []byte {
	return l.text
}

// AttrVal returns the attribute value when an AttributeToken was returned from Next.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}

// HasTemplate returns true if the token value contains a template.
func (l *Lexer) HasTemplate() bool {
	return l.hasTmpl
}
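
// exampleLexTemplate is a usage sketch of template-aware lexing: with the
// GoTemplate delimiters, {{...}} spans are skipped as opaque template content
// and HasTemplate reports whether the last token contained one. Illustration
// only; parse.NewInputString is assumed from github.com/tdewolff/parse/v2.
func exampleLexTemplate() (key, val []byte, hasTmpl bool) {
	l := NewTemplateLexer(parse.NewInputString(`<a href="{{ .URL }}">link</a>`), GoTemplate)
	for {
		tt, _ := l.Next()
		if tt == ErrorToken {
			return nil, nil, false
		}
		if tt == AttributeToken {
			// AttrKey and AttrVal are only meaningful right after an AttributeToken.
			return l.AttrKey(), l.AttrVal(), l.HasTemplate()
		}
	}
}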

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	l.hasTmpl = false

	var c byte
	if l.inTag {
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 && l.r.Err() != nil {
			return ErrorToken, nil
		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
			return AttributeToken, l.shiftAttribute()
		}
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagVoidToken, l.r.Shift()
		}
		l.r.Move(1)
		return StartTagCloseToken, l.r.Shift()
	}

	if l.rawTag != 0 {
		if rawText := l.shiftRawText(); 0 < len(rawText) {
			l.text = rawText
			l.rawTag = 0
			return TextToken, rawText
		}
		l.rawTag = 0
	}

	for {
		c = l.r.Peek(0)
		if c == '<' {
			c = l.r.Peek(1)
			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
			if !isEndTag && (c < 'a' || 'z' < c) && (c < 'A' || 'Z' < c) && c != '!' && c != '?' {
				// not a tag
				l.r.Move(1)
			} else if 0 < l.r.Pos() {
				// return the currently buffered text token so that we can return the tag in the next iteration
				l.text = l.r.Shift()
				return TextToken, l.text
			} else if isEndTag {
				l.r.Move(2)
				// only end tags that are not followed by > or EOF arrive here
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					return CommentToken, l.shiftBogusComment()
				}
				return EndTagToken, l.shiftEndTag()
			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				l.r.Move(1)
				l.inTag = true
				return l.shiftStartTag()
			} else if c == '!' {
				l.r.Move(2)
				return l.readMarkup()
			} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
				l.r.Move(len(l.tmplBegin))
				l.moveTemplate()
				l.hasTmpl = true
			} else if c == '?' {
				l.r.Move(1)
				return CommentToken, l.shiftBogusComment()
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else if c == 0 && l.r.Err() != nil {
			if 0 < l.r.Pos() {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			return ErrorToken, nil
		} else {
			l.r.Move(1)
		}
	}
}

////////////////////////////////////////////////////////////////

// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html

func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					mark := l.r.Pos()
					l.r.Move(2)
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						l.r.Rewind(mark)
						return l.r.Shift()
					}
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					l.r.Move(4)
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3)
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									inScript = true
								} else {
									if !inScript {
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
				l.r.Move(len(l.tmplBegin))
				l.moveTemplate()
				l.hasTmpl = true
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}

func (l *Lexer) readMarkup() (TokenType, []byte) {
	if l.at('-', '-') {
		l.r.Move(2)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[4:]
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(3)
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '!', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(4)
				return CommentToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
		l.r.Move(7)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[9:]
				return TextToken, l.r.Shift()
			} else if l.at(']', ']', '>') {
				l.text = l.r.Lexeme()[9:]
				l.r.Move(3)
				return TextToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else {
		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
			l.r.Move(7)
			if l.r.Peek(0) == ' ' {
				l.r.Move(1)
			}
			for {
				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
					l.text = l.r.Lexeme()[9:]
					if c == '>' {
						l.r.Move(1)
					}
					return DoctypeToken, l.r.Shift()
				}
				l.r.Move(1)
			}
		}
	}
	return CommentToken, l.shiftBogusComment()
}

func (l *Lexer) shiftBogusComment() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			return l.r.Shift()
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}

func (l *Lexer) shiftStartTag() (TokenType, []byte) {
	for {
		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	l.text = parse.ToLower(l.r.Lexeme()[1:])
	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
		if h == Svg || h == Math {
			data := l.shiftXML(h)
			if l.err != nil {
				return ErrorToken, nil
			}
			l.inTag = false
			if h == Svg {
				return SvgToken, data
			}
			return MathToken, data
		}
		l.rawTag = h
	}
	return StartTagToken, l.r.Shift()
}

func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			continue
		}
		break
	}
	nameHasTmpl := l.hasTmpl
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c := l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
					l.r.Move(len(l.tmplBegin))
					l.moveTemplate()
					l.hasTmpl = true
				} else if c == 0 && l.r.Err() != nil {
					break
				} else {
					l.r.Move(1)
				}
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else { // attribute value unquoted state
			for {
				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	if !nameHasTmpl {
		l.text = parse.ToLower(l.text)
	}
	return l.r.Shift()
}

func (l *Lexer) shiftEndTag() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			break
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			break
		}
		l.r.Move(1)
	}

	end := len(l.text)
	for end > 0 {
		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			end--
			continue
		}
		break
	}
	l.text = l.text[:end]
	return parse.ToLower(l.r.Shift())
}

// shiftXML parses the content of an svg or math tag according to the XML 1.1 specifications, including the tag itself.
// So far we have already parsed `<svg` or `<math`.
func (l *Lexer) shiftXML(rawTag Hash) []byte {
	inQuote := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inQuote = !inQuote
			l.r.Move(1)
		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
			mark := l.r.Pos()
			l.r.Move(2)
			for {
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					break
				}
				l.r.Move(1)
			}
			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
				break
			}
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		} else {
			l.r.Move(1)
		}
	}

	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.r.Move(1)
			break
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		}
		l.r.Move(1)
	}
	return l.r.Shift()
}
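
// exampleLexSVG is a usage sketch showing that embedded SVG (and likewise
// MathML) is consumed as a whole by shiftXML, so the entire <svg>...</svg>
// element comes back as a single SvgToken. Illustration only;
// parse.NewInputString is assumed from github.com/tdewolff/parse/v2.
func exampleLexSVG() []byte {
	l := NewLexer(parse.NewInputString(`<p><svg viewBox="0 0 1 1"><rect/></svg></p>`))
	for {
		tt, data := l.Next()
		if tt == SvgToken || tt == ErrorToken {
			return data // for SvgToken, data spans the complete svg element
		}
	}
}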

func (l *Lexer) moveTemplate() {
	for {
		if c := l.r.Peek(0); c == 0 && l.r.Err() != nil {
			return
		} else if l.at(l.tmplEnd...) {
			l.r.Move(len(l.tmplEnd))
			return
		} else if c == '"' || c == '\'' {
			l.r.Move(1)
			escape := false
			for {
				if c2 := l.r.Peek(0); c2 == 0 && l.r.Err() != nil {
					return
				} else if !escape && c2 == c {
					l.r.Move(1)
					break
				} else if c2 == '\\' {
					escape = !escape
				} else {
					escape = false
				}
				l.r.Move(1)
			}
		} else {
			l.r.Move(1)
		}
	}
}

////////////////////////////////////////////////////////////////

// at returns true if the upcoming bytes equal b.
func (l *Lexer) at(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c {
			return false
		}
	}
	return true
}

// atCaseInsensitive returns true if the upcoming bytes match b case-insensitively, where b is given in lowercase.
func (l *Lexer) atCaseInsensitive(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
			return false
		}
	}
	return true
}