lex.go 14 KB

  1. // Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
  2. package css
  3. // TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early
  4. import (
  5. "bytes"
  6. "io"
  7. "strconv"
  8. "github.com/tdewolff/parse/v2"
  9. )
  10. // TokenType determines the type of token, eg. a number or a semicolon.
  11. type TokenType uint32
  12. // TokenType values.
  13. const (
  14. ErrorToken TokenType = iota // extra token when errors occur
  15. IdentToken
  16. FunctionToken // rgb( rgba( ...
  17. AtKeywordToken // @abc
  18. HashToken // #abc
  19. StringToken
  20. BadStringToken
  21. URLToken
  22. BadURLToken
  23. DelimToken // any unmatched character
  24. NumberToken // 5
  25. PercentageToken // 5%
  26. DimensionToken // 5em
  27. UnicodeRangeToken // U+554A
  28. IncludeMatchToken // ~=
  29. DashMatchToken // |=
  30. PrefixMatchToken // ^=
  31. SuffixMatchToken // $=
  32. SubstringMatchToken // *=
  33. ColumnToken // ||
  34. WhitespaceToken // space \t \r \n \f
  35. CDOToken // <!--
  36. CDCToken // -->
  37. ColonToken // :
  38. SemicolonToken // ;
  39. CommaToken // ,
  40. LeftBracketToken // [
  41. RightBracketToken // ]
  42. LeftParenthesisToken // (
  43. RightParenthesisToken // )
  44. LeftBraceToken // {
  45. RightBraceToken // }
  46. CommentToken // extra token for comments
  47. EmptyToken
  48. CustomPropertyNameToken
  49. CustomPropertyValueToken
  50. )
  51. // String returns the string representation of a TokenType.
  52. func (tt TokenType) String() string {
  53. switch tt {
  54. case ErrorToken:
  55. return "Error"
  56. case IdentToken:
  57. return "Ident"
  58. case FunctionToken:
  59. return "Function"
  60. case AtKeywordToken:
  61. return "AtKeyword"
  62. case HashToken:
  63. return "Hash"
  64. case StringToken:
  65. return "String"
  66. case BadStringToken:
  67. return "BadString"
  68. case URLToken:
  69. return "URL"
  70. case BadURLToken:
  71. return "BadURL"
  72. case DelimToken:
  73. return "Delim"
  74. case NumberToken:
  75. return "Number"
  76. case PercentageToken:
  77. return "Percentage"
  78. case DimensionToken:
  79. return "Dimension"
  80. case UnicodeRangeToken:
  81. return "UnicodeRange"
  82. case IncludeMatchToken:
  83. return "IncludeMatch"
  84. case DashMatchToken:
  85. return "DashMatch"
  86. case PrefixMatchToken:
  87. return "PrefixMatch"
  88. case SuffixMatchToken:
  89. return "SuffixMatch"
  90. case SubstringMatchToken:
  91. return "SubstringMatch"
  92. case ColumnToken:
  93. return "Column"
  94. case WhitespaceToken:
  95. return "Whitespace"
  96. case CDOToken:
  97. return "CDO"
  98. case CDCToken:
  99. return "CDC"
  100. case ColonToken:
  101. return "Colon"
  102. case SemicolonToken:
  103. return "Semicolon"
  104. case CommaToken:
  105. return "Comma"
  106. case LeftBracketToken:
  107. return "LeftBracket"
  108. case RightBracketToken:
  109. return "RightBracket"
  110. case LeftParenthesisToken:
  111. return "LeftParenthesis"
  112. case RightParenthesisToken:
  113. return "RightParenthesis"
  114. case LeftBraceToken:
  115. return "LeftBrace"
  116. case RightBraceToken:
  117. return "RightBrace"
  118. case CommentToken:
  119. return "Comment"
  120. case EmptyToken:
  121. return "Empty"
  122. case CustomPropertyNameToken:
  123. return "CustomPropertyName"
  124. case CustomPropertyValueToken:
  125. return "CustomPropertyValue"
  126. }
  127. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  128. }
  129. ////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
	r *parse.Input // input buffer; every consume* helper peeks and advances through it
}
// NewLexer returns a new Lexer for a given parse.Input.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
// It forwards the error state of the underlying input.
func (l *Lexer) Err() error {
	return l.r.Err()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
// It dispatches on the first byte; most cases try one or more consume* helpers
// in order and fall through to a single-byte DelimToken when none match.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		// coalesce a whole run of whitespace into one token
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		// tried in this order: number (-5), ident-like (-moz-...),
		// CDC (-->), custom property name (--x)
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		// unicode-range (U+xxxx) is tried before an identifier starting with u
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
		// zero byte with a pending error means the input is exhausted
		if l.r.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}
	// can't be rune because consumeIdentlike consumes that as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}
  234. ////////////////////////////////////////////////////////////////
  235. /*
  236. The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
  237. */
  238. func (l *Lexer) consumeByte(c byte) bool {
  239. if l.r.Peek(0) == c {
  240. l.r.Move(1)
  241. return true
  242. }
  243. return false
  244. }
  245. func (l *Lexer) consumeComment() bool {
  246. if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
  247. return false
  248. }
  249. l.r.Move(2)
  250. for {
  251. c := l.r.Peek(0)
  252. if c == 0 && l.r.Err() != nil {
  253. break
  254. } else if c == '*' && l.r.Peek(1) == '/' {
  255. l.r.Move(2)
  256. return true
  257. }
  258. l.r.Move(1)
  259. }
  260. return true
  261. }
  262. func (l *Lexer) consumeNewline() bool {
  263. c := l.r.Peek(0)
  264. if c == '\n' || c == '\f' {
  265. l.r.Move(1)
  266. return true
  267. } else if c == '\r' {
  268. if l.r.Peek(1) == '\n' {
  269. l.r.Move(2)
  270. } else {
  271. l.r.Move(1)
  272. }
  273. return true
  274. }
  275. return false
  276. }
  277. func (l *Lexer) consumeWhitespace() bool {
  278. c := l.r.Peek(0)
  279. if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
  280. l.r.Move(1)
  281. return true
  282. }
  283. return false
  284. }
  285. func (l *Lexer) consumeDigit() bool {
  286. c := l.r.Peek(0)
  287. if c >= '0' && c <= '9' {
  288. l.r.Move(1)
  289. return true
  290. }
  291. return false
  292. }
  293. func (l *Lexer) consumeHexDigit() bool {
  294. c := l.r.Peek(0)
  295. if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
  296. l.r.Move(1)
  297. return true
  298. }
  299. return false
  300. }
  301. func (l *Lexer) consumeEscape() bool {
  302. if l.r.Peek(0) != '\\' {
  303. return false
  304. }
  305. mark := l.r.Pos()
  306. l.r.Move(1)
  307. if l.consumeNewline() {
  308. l.r.Rewind(mark)
  309. return false
  310. } else if l.consumeHexDigit() {
  311. for k := 1; k < 6; k++ {
  312. if !l.consumeHexDigit() {
  313. break
  314. }
  315. }
  316. l.consumeWhitespace()
  317. return true
  318. } else {
  319. c := l.r.Peek(0)
  320. if c >= 0xC0 {
  321. _, n := l.r.PeekRune(0)
  322. l.r.Move(n)
  323. return true
  324. } else if c == 0 && l.r.Err() != nil {
  325. l.r.Rewind(mark)
  326. return false
  327. }
  328. }
  329. l.r.Move(1)
  330. return true
  331. }
  332. func (l *Lexer) consumeIdentToken() bool {
  333. mark := l.r.Pos()
  334. if l.r.Peek(0) == '-' {
  335. l.r.Move(1)
  336. }
  337. c := l.r.Peek(0)
  338. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
  339. if c != '\\' || !l.consumeEscape() {
  340. l.r.Rewind(mark)
  341. return false
  342. }
  343. } else {
  344. l.r.Move(1)
  345. }
  346. for {
  347. c := l.r.Peek(0)
  348. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  349. if c != '\\' || !l.consumeEscape() {
  350. break
  351. }
  352. } else {
  353. l.r.Move(1)
  354. }
  355. }
  356. return true
  357. }
  358. // support custom variables, https://www.w3.org/TR/css-variables-1/
  359. func (l *Lexer) consumeCustomVariableToken() bool {
  360. // expect to be on a '-'
  361. l.r.Move(1)
  362. if l.r.Peek(0) != '-' {
  363. l.r.Move(-1)
  364. return false
  365. }
  366. if !l.consumeIdentToken() {
  367. l.r.Move(-1)
  368. return false
  369. }
  370. return true
  371. }
  372. func (l *Lexer) consumeAtKeywordToken() bool {
  373. // expect to be on an '@'
  374. l.r.Move(1)
  375. if !l.consumeIdentToken() {
  376. l.r.Move(-1)
  377. return false
  378. }
  379. return true
  380. }
  381. func (l *Lexer) consumeHashToken() bool {
  382. // expect to be on a '#'
  383. mark := l.r.Pos()
  384. l.r.Move(1)
  385. c := l.r.Peek(0)
  386. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  387. if c != '\\' || !l.consumeEscape() {
  388. l.r.Rewind(mark)
  389. return false
  390. }
  391. } else {
  392. l.r.Move(1)
  393. }
  394. for {
  395. c := l.r.Peek(0)
  396. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  397. if c != '\\' || !l.consumeEscape() {
  398. break
  399. }
  400. } else {
  401. l.r.Move(1)
  402. }
  403. }
  404. return true
  405. }
// consumeNumberToken consumes a CSS number: an optional sign, an integer part
// and/or a fractional part, and an optional exponent. A trailing '.' or 'e'
// that is not followed by digits is left for the next token. On failure the
// position is rewound and false is returned.
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	// integer part (may be absent for numbers like .5)
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			// fractional part
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			// neither integer nor fractional digits: not a number
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		// no digits at all (e.g. a bare sign)
		l.r.Rewind(mark)
		return false
	}
	// optional exponent; mark is reused so only the exponent is backed out
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}
// consumeUnicodeRangeToken consumes a unicode-range token such as U+554A,
// U+0-7F or U+4??. Each hex-digit group must be one to six characters long.
// On failure the position is rewound and false is returned.
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)
	// consume up to 6 hexDigits
	k := 0
	for l.consumeHexDigit() {
		k++
	}
	// either a minus or a question mark or the end is expected
	if l.consumeByte('-') {
		// range form: the first group must be 1-6 digits
		if k == 0 || 6 < k {
			l.r.Rewind(mark)
			return false
		}
		// consume another up to 6 hexDigits
		if l.consumeHexDigit() {
			k = 1 // k now counts the second group, validated below
			for l.consumeHexDigit() {
				k++
			}
		} else {
			// '-' with no digits after it is not a range
			l.r.Rewind(mark)
			return false
		}
	} else if l.consumeByte('?') {
		// could be filled up to 6 characters with question marks or else regular hexDigits
		k++
		for l.consumeByte('?') {
			k++
		}
	}
	// the (last) group must be 1-6 characters long
	if k == 0 || 6 < k {
		l.r.Rewind(mark)
		return false
	}
	return true
}
  493. func (l *Lexer) consumeColumnToken() bool {
  494. if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
  495. l.r.Move(2)
  496. return true
  497. }
  498. return false
  499. }
  500. func (l *Lexer) consumeCDOToken() bool {
  501. if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
  502. l.r.Move(4)
  503. return true
  504. }
  505. return false
  506. }
  507. func (l *Lexer) consumeCDCToken() bool {
  508. if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
  509. l.r.Move(3)
  510. return true
  511. }
  512. return false
  513. }
  514. ////////////////////////////////////////////////////////////////
  515. // consumeMatch consumes any MatchToken.
  516. func (l *Lexer) consumeMatch() TokenType {
  517. if l.r.Peek(1) == '=' {
  518. switch l.r.Peek(0) {
  519. case '~':
  520. l.r.Move(2)
  521. return IncludeMatchToken
  522. case '|':
  523. l.r.Move(2)
  524. return DashMatchToken
  525. case '^':
  526. l.r.Move(2)
  527. return PrefixMatchToken
  528. case '$':
  529. l.r.Move(2)
  530. return SuffixMatchToken
  531. case '*':
  532. l.r.Move(2)
  533. return SubstringMatchToken
  534. }
  535. }
  536. return ErrorToken
  537. }
  538. // consumeBracket consumes any bracket token.
  539. func (l *Lexer) consumeBracket() TokenType {
  540. switch l.r.Peek(0) {
  541. case '(':
  542. l.r.Move(1)
  543. return LeftParenthesisToken
  544. case ')':
  545. l.r.Move(1)
  546. return RightParenthesisToken
  547. case '[':
  548. l.r.Move(1)
  549. return LeftBracketToken
  550. case ']':
  551. l.r.Move(1)
  552. return RightBracketToken
  553. case '{':
  554. l.r.Move(1)
  555. return LeftBraceToken
  556. case '}':
  557. l.r.Move(1)
  558. return RightBraceToken
  559. }
  560. return ErrorToken
  561. }
  562. // consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
  563. func (l *Lexer) consumeNumeric() TokenType {
  564. if l.consumeNumberToken() {
  565. if l.consumeByte('%') {
  566. return PercentageToken
  567. } else if l.consumeIdentToken() {
  568. return DimensionToken
  569. }
  570. return NumberToken
  571. }
  572. return ErrorToken
  573. }
  574. // consumeString consumes a string and may return BadStringToken when a newline is encountered.
  575. func (l *Lexer) consumeString() TokenType {
  576. // assume to be on " or '
  577. delim := l.r.Peek(0)
  578. l.r.Move(1)
  579. for {
  580. c := l.r.Peek(0)
  581. if c == 0 && l.r.Err() != nil {
  582. break
  583. } else if c == '\n' || c == '\r' || c == '\f' {
  584. l.r.Move(1)
  585. return BadStringToken
  586. } else if c == delim {
  587. l.r.Move(1)
  588. break
  589. } else if c == '\\' {
  590. if !l.consumeEscape() {
  591. // either newline or EOF after backslash
  592. l.r.Move(1)
  593. l.consumeNewline()
  594. }
  595. } else {
  596. l.r.Move(1)
  597. }
  598. }
  599. return StringToken
  600. }
  601. func (l *Lexer) consumeUnquotedURL() bool {
  602. for {
  603. c := l.r.Peek(0)
  604. if c == 0 && l.r.Err() != nil || c == ')' {
  605. break
  606. } else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
  607. if c != '\\' || !l.consumeEscape() {
  608. return false
  609. }
  610. } else {
  611. l.r.Move(1)
  612. }
  613. }
  614. return true
  615. }
  616. // consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
  617. func (l *Lexer) consumeRemnantsBadURL() {
  618. for {
  619. if l.consumeByte(')') || l.r.Err() != nil {
  620. break
  621. } else if !l.consumeEscape() {
  622. l.r.Move(1)
  623. }
  624. }
  625. }
// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken.
// After an identifier, a '(' turns it into a function; if the identifier is
// "url" (case-insensitive), the quoted or unquoted URL body is consumed as
// well, yielding URLToken or BadURLToken.
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			// backslashes are stripped before comparing so escaped spellings of url still match
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)
		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			// quoted URL body
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { // if unquoted URL fails due to encountering whitespace, continue
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		// a missing ')' is tolerated only at EOF
		if !l.consumeByte(')') && l.r.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}