// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
package css

// TODO: \uFFFD replacement character for NULL bytes in strings for example, or at least don't end the string early

import (
	"bytes"
	"io"
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, e.g. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	IdentToken
	FunctionToken  // rgb( rgba( ...
	AtKeywordToken // @abc
	HashToken      // #abc
	StringToken
	BadStringToken
	URLToken
	BadURLToken
	DelimToken          // any unmatched character
	NumberToken         // 5
	PercentageToken     // 5%
	DimensionToken      // 5em
	UnicodeRangeToken   // U+554A
	IncludeMatchToken   // ~=
	DashMatchToken      // |=
	PrefixMatchToken    // ^=
	SuffixMatchToken    // $=
	SubstringMatchToken // *=
	ColumnToken         // ||
	WhitespaceToken     // space \t \r \n \f
	CDOToken            // <!--
	CDCToken            // -->
	ColonToken          // :
	SemicolonToken      // ;
	CommaToken          // ,
	LeftBracketToken    // [
	RightBracketToken   // ]
	LeftParenthesisToken  // (
	RightParenthesisToken // )
	LeftBraceToken        // {
	RightBraceToken       // }
	CommentToken          // extra token for comments
	EmptyToken
	CustomPropertyNameToken
	CustomPropertyValueToken
)

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case IdentToken:
		return "Ident"
	case FunctionToken:
		return "Function"
	case AtKeywordToken:
		return "AtKeyword"
	case HashToken:
		return "Hash"
	case StringToken:
		return "String"
	case BadStringToken:
		return "BadString"
	case URLToken:
		return "URL"
	case BadURLToken:
		return "BadURL"
	case DelimToken:
		return "Delim"
	case NumberToken:
		return "Number"
	case PercentageToken:
		return "Percentage"
	case DimensionToken:
		return "Dimension"
	case UnicodeRangeToken:
		return "UnicodeRange"
	case IncludeMatchToken:
		return "IncludeMatch"
	case DashMatchToken:
		return "DashMatch"
	case PrefixMatchToken:
		return "PrefixMatch"
	case SuffixMatchToken:
		return "SuffixMatch"
	case SubstringMatchToken:
		return "SubstringMatch"
	case ColumnToken:
		return "Column"
	case WhitespaceToken:
		return "Whitespace"
	case CDOToken:
		return "CDO"
	case CDCToken:
		return "CDC"
	case ColonToken:
		return "Colon"
	case SemicolonToken:
		return "Semicolon"
	case CommaToken:
		return "Comma"
	case LeftBracketToken:
		return "LeftBracket"
	case RightBracketToken:
		return "RightBracket"
	case LeftParenthesisToken:
		return "LeftParenthesis"
	case RightParenthesisToken:
		return "RightParenthesis"
	case LeftBraceToken:
		return "LeftBrace"
	case RightBraceToken:
		return "RightBrace"
	case CommentToken:
		return "Comment"
	case EmptyToken:
		return "Empty"
	case CustomPropertyNameToken:
		return "CustomPropertyName"
	case CustomPropertyValueToken:
		return "CustomPropertyValue"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r *parse.Input
}

// NewLexer returns a new Lexer for the given parse.Input.
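//
// A minimal usage sketch (a hypothetical example, assuming parse.NewInputString
// from github.com/tdewolff/parse/v2 to wrap the input; ErrorToken signals both
// errors and io.EOF, which can be distinguished via Err()):
//
//	l := NewLexer(parse.NewInputString("a { color: red; }"))
//	for {
//		tt, data := l.Next()
//		if tt == ErrorToken {
//			break
//		}
//		fmt.Println(tt, string(data))
//	}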
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// Err returns the error encountered during lexing; this is often io.EOF, but other errors may be returned as well.
func (l *Lexer) Err() error {
	return l.r.Err()
}

// Next returns the next Token. It returns ErrorToken when an error was encountered; use Err() to retrieve the error.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
		if l.r.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}

	// cannot be the start of a multi-byte rune, because consumeIdentlike would already have consumed it as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}

////////////////////////////////////////////////////////////////

/*
The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
*/

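// consumeByte consumes the next byte if it equals c and reports whether it did.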
func (l *Lexer) consumeByte(c byte) bool {
	if l.r.Peek(0) == c {
		l.r.Move(1)
		return true
	}
	return false
}

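// consumeComment consumes a comment of the form /* ... */; an unterminated comment runs to the end of the input and is still accepted.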
func (l *Lexer) consumeComment() bool {
	if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
		return false
	}
	l.r.Move(2)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			break
		} else if c == '*' && l.r.Peek(1) == '/' {
			l.r.Move(2)
			return true
		}
		l.r.Move(1)
	}
	return true
}

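// consumeNewline consumes a single newline, where \n, \f, \r and \r\n each count as one newline.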
func (l *Lexer) consumeNewline() bool {
	c := l.r.Peek(0)
	if c == '\n' || c == '\f' {
		l.r.Move(1)
		return true
	} else if c == '\r' {
		if l.r.Peek(1) == '\n' {
			l.r.Move(2)
		} else {
			l.r.Move(1)
		}
		return true
	}
	return false
}

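// consumeWhitespace consumes a single whitespace byte: space, \t, \n, \r or \f.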
func (l *Lexer) consumeWhitespace() bool {
	c := l.r.Peek(0)
	if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
		l.r.Move(1)
		return true
	}
	return false
}

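// consumeDigit consumes a single decimal digit.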
func (l *Lexer) consumeDigit() bool {
	c := l.r.Peek(0)
	if c >= '0' && c <= '9' {
		l.r.Move(1)
		return true
	}
	return false
}

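// consumeHexDigit consumes a single hexadecimal digit.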
func (l *Lexer) consumeHexDigit() bool {
	c := l.r.Peek(0)
	if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
		l.r.Move(1)
		return true
	}
	return false
}

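// consumeEscape consumes a backslash escape: either one to six hexadecimal digits followed by an optional whitespace, or any single character (or rune) that is not a newline.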
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			l.r.Rewind(mark)
			return false
		}
	}
	l.r.Move(1)
	return true
}

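// consumeIdentToken consumes an identifier: an optional leading '-', a name-start character or escape, and any number of following name characters or escapes.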
func (l *Lexer) consumeIdentToken() bool {
	mark := l.r.Pos()
	if l.r.Peek(0) == '-' {
		l.r.Move(1)
	}
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeCustomVariableToken consumes a custom property name such as --var; see https://www.w3.org/TR/css-variables-1/.
func (l *Lexer) consumeCustomVariableToken() bool {
	// expect to be on a '-'
	l.r.Move(1)
	if l.r.Peek(0) != '-' {
		l.r.Move(-1)
		return false
	}
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

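// consumeAtKeywordToken consumes an at-keyword such as @media; the lexer is expected to be positioned on the '@'.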
func (l *Lexer) consumeAtKeywordToken() bool {
	// expect to be on an '@'
	l.r.Move(1)
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

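// consumeHashToken consumes a hash token such as #id or #fff.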
func (l *Lexer) consumeHashToken() bool {
	// expect to be on a '#'
	mark := l.r.Pos()
	l.r.Move(1)
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

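// consumeNumberToken consumes a number: an optional sign, an integer and/or fractional part, and an optional exponent.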
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to the next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}

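// consumeUnicodeRangeToken consumes a unicode-range token such as U+554A, U+4?? or U+0025-00FF, with at most six hexadecimal digits or question marks per part.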
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)

	// consume up to 6 hexDigits
	k := 0
	for l.consumeHexDigit() {
		k++
	}

	// either a minus or a question mark or the end is expected
	if l.consumeByte('-') {
		if k == 0 || 6 < k {
			l.r.Rewind(mark)
			return false
		}

		// consume another up to 6 hexDigits
		if l.consumeHexDigit() {
			k = 1
			for l.consumeHexDigit() {
				k++
			}
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if l.consumeByte('?') {
		// could be filled up to 6 characters with question marks or else regular hexDigits
		k++
		for l.consumeByte('?') {
			k++
		}
	}
	if k == 0 || 6 < k {
		l.r.Rewind(mark)
		return false
	}
	return true
}

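// consumeColumnToken consumes the column combinator ||.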
func (l *Lexer) consumeColumnToken() bool {
	if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
		l.r.Move(2)
		return true
	}
	return false
}

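// consumeCDOToken consumes the CDO token <!--.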
func (l *Lexer) consumeCDOToken() bool {
	if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
		l.r.Move(4)
		return true
	}
	return false
}

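// consumeCDCToken consumes the CDC token -->.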
func (l *Lexer) consumeCDCToken() bool {
	if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
		l.r.Move(3)
		return true
	}
	return false
}

////////////////////////////////////////////////////////////////

// consumeMatch consumes any MatchToken.
func (l *Lexer) consumeMatch() TokenType {
	if l.r.Peek(1) == '=' {
		switch l.r.Peek(0) {
		case '~':
			l.r.Move(2)
			return IncludeMatchToken
		case '|':
			l.r.Move(2)
			return DashMatchToken
		case '^':
			l.r.Move(2)
			return PrefixMatchToken
		case '$':
			l.r.Move(2)
			return SuffixMatchToken
		case '*':
			l.r.Move(2)
			return SubstringMatchToken
		}
	}
	return ErrorToken
}

// consumeBracket consumes any bracket token.
func (l *Lexer) consumeBracket() TokenType {
	switch l.r.Peek(0) {
	case '(':
		l.r.Move(1)
		return LeftParenthesisToken
	case ')':
		l.r.Move(1)
		return RightParenthesisToken
	case '[':
		l.r.Move(1)
		return LeftBracketToken
	case ']':
		l.r.Move(1)
		return RightBracketToken
	case '{':
		l.r.Move(1)
		return LeftBraceToken
	case '}':
		l.r.Move(1)
		return RightBraceToken
	}
	return ErrorToken
}

// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
func (l *Lexer) consumeNumeric() TokenType {
	if l.consumeNumberToken() {
		if l.consumeByte('%') {
			return PercentageToken
		} else if l.consumeIdentToken() {
			return DimensionToken
		}
		return NumberToken
	}
	return ErrorToken
}

// consumeString consumes a string and may return BadStringToken when a newline is encountered.
func (l *Lexer) consumeString() TokenType {
	// assume to be on " or '
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			break
		} else if c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			return BadStringToken
		} else if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			if !l.consumeEscape() {
				// either newline or EOF after backslash
				l.r.Move(1)
				l.consumeNewline()
			}
		} else {
			l.r.Move(1)
		}
	}
	return StringToken
}

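// consumeUnquotedURL consumes the contents of an unquoted url(), stopping before the closing parenthesis; it fails on quotes, whitespace, control characters or an invalid escape.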
func (l *Lexer) consumeUnquotedURL() bool {
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil || c == ')' {
			break
		} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
			if c != '\\' || !l.consumeEscape() {
				return false
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeRemnantsBadURL consumes the remaining bytes of a BadURLToken so that normal tokenization may continue.
func (l *Lexer) consumeRemnantsBadURL() {
	for {
		if l.consumeByte(')') || l.r.Err() != nil {
			break
		} else if !l.consumeEscape() {
			l.r.Move(1)
		}
	}
}

// consumeIdentlike consumes IdentToken, FunctionToken or URLToken.
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)

		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { // if unquoted URL fails due to encountering whitespace, continue
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		if !l.consumeByte(')') && l.r.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}