lex.go

// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
package html

import (
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, e.g. a start tag or an attribute.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	case StartTagToken:
		return "StartTag"
	case StartTagCloseToken:
		return "StartTagClose"
	case StartTagVoidToken:
		return "StartTagVoid"
	case EndTagToken:
		return "EndTag"
	case AttributeToken:
		return "Attribute"
	case TextToken:
		return "Text"
	case SvgToken:
		return "Svg"
	case MathToken:
		return "Math"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Template delimiter pairs for common templating languages.
var GoTemplate = [2]string{"{{", "}}"}
var HandlebarsTemplate = [2]string{"{{", "}}"}
var MustacheTemplate = [2]string{"{{", "}}"}
var EJSTemplate = [2]string{"<%", "%>"}
var ASPTemplate = [2]string{"<%", "%>"}
var PHPTemplate = [2]string{"<?", "?>"}

// Lexer is the state for the lexer.
type Lexer struct {
	r         *parse.Input
	tmplBegin []byte
	tmplEnd   []byte
	err       error
	rawTag    Hash
	inTag     bool
	text      []byte
	attrVal   []byte
	hasTmpl   bool
}

// NewLexer returns a new Lexer for a given io.Reader.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// NewTemplateLexer returns a new Lexer for a given io.Reader that also recognizes the given template delimiters, such as GoTemplate.
func NewTemplateLexer(r *parse.Input, tmpl [2]string) *Lexer {
	return &Lexer{
		r:         r,
		tmplBegin: []byte(tmpl[0]),
		tmplEnd:   []byte(tmpl[1]),
	}
}
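
// exampleLex is a minimal usage sketch of the basic lexing loop: call Next
// until ErrorToken, then inspect Err, which is io.EOF once the whole input was
// consumed. Illustration only; parse.NewInputString is assumed to be available
// from github.com/tdewolff/parse/v2.
func exampleLex(src string) []TokenType {
	l := NewLexer(parse.NewInputString(src))
	var types []TokenType
	for {
		tt, _ := l.Next()
		if tt == ErrorToken {
			// Err() distinguishes normal EOF from real lexing errors.
			break
		}
		types = append(types, tt)
	}
	return types
}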

// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
func (l *Lexer) Err() error {
	if l.err != nil {
		return l.err
	}
	return l.r.Err()
}

// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
func (l *Lexer) Text() []byte {
	return l.text
}

// AttrKey returns the attribute key when an AttributeToken was returned from Next.
func (l *Lexer) AttrKey() []byte {
	return l.text
}

// AttrVal returns the attribute value when an AttributeToken was returned from Next.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}

// HasTemplate returns true if the token value contains a template.
func (l *Lexer) HasTemplate() bool {
	return l.hasTmpl
}
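
// exampleLexTemplate is a usage sketch of template-aware lexing: with the
// GoTemplate delimiters, {{...}} spans are skipped as opaque template content
// and HasTemplate reports whether the last token contained one. Illustration
// only; parse.NewInputString is assumed from github.com/tdewolff/parse/v2.
func exampleLexTemplate() (key, val []byte, hasTmpl bool) {
	l := NewTemplateLexer(parse.NewInputString(`<a href="{{ .URL }}">link</a>`), GoTemplate)
	for {
		tt, _ := l.Next()
		if tt == ErrorToken {
			return nil, nil, false
		}
		if tt == AttributeToken {
			// AttrKey and AttrVal are only meaningful right after an AttributeToken.
			return l.AttrKey(), l.AttrVal(), l.HasTemplate()
		}
	}
}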

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	l.hasTmpl = false

	var c byte
	if l.inTag {
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 && l.r.Err() != nil {
			return ErrorToken, nil
		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
			return AttributeToken, l.shiftAttribute()
		}
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagVoidToken, l.r.Shift()
		}
		l.r.Move(1)
		return StartTagCloseToken, l.r.Shift()
	}

	if l.rawTag != 0 {
		if rawText := l.shiftRawText(); 0 < len(rawText) {
			l.text = rawText
			l.rawTag = 0
			return TextToken, rawText
		}
		l.rawTag = 0
	}

	for {
		c = l.r.Peek(0)
		if c == '<' {
			c = l.r.Peek(1)
			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
			if !isEndTag && (c < 'a' || 'z' < c) && (c < 'A' || 'Z' < c) && c != '!' && c != '?' {
				// not a tag
				l.r.Move(1)
			} else if 0 < l.r.Pos() {
				// return the currently buffered text token so that we can return the tag in the next iteration
				l.text = l.r.Shift()
				return TextToken, l.text
			} else if isEndTag {
				l.r.Move(2)
				// only end tags that are not followed by > or EOF arrive here
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					return CommentToken, l.shiftBogusComment()
				}
				return EndTagToken, l.shiftEndTag()
			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				l.r.Move(1)
				l.inTag = true
				return l.shiftStartTag()
			} else if c == '!' {
				l.r.Move(2)
				return l.readMarkup()
			} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
				l.r.Move(len(l.tmplBegin))
				l.moveTemplate()
				l.hasTmpl = true
			} else if c == '?' {
				l.r.Move(1)
				return CommentToken, l.shiftBogusComment()
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else if c == 0 && l.r.Err() != nil {
			if 0 < l.r.Pos() {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			return ErrorToken, nil
		} else {
			l.r.Move(1)
		}
	}
}

////////////////////////////////////////////////////////////////

// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html

func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					mark := l.r.Pos()
					l.r.Move(2)
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						l.r.Rewind(mark)
						return l.r.Shift()
					}
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					l.r.Move(4)
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3)
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									inScript = true
								} else {
									if !inScript {
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
				l.r.Move(len(l.tmplBegin))
				l.moveTemplate()
				l.hasTmpl = true
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}

func (l *Lexer) readMarkup() (TokenType, []byte) {
	if l.at('-', '-') {
		l.r.Move(2)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[4:]
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(3)
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '!', '>') {
				l.text = l.r.Lexeme()[4:]
				l.r.Move(4)
				return CommentToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
		l.r.Move(7)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				l.text = l.r.Lexeme()[9:]
				return TextToken, l.r.Shift()
			} else if l.at(']', ']', '>') {
				l.text = l.r.Lexeme()[9:]
				l.r.Move(3)
				return TextToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else {
		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
			l.r.Move(7)
			if l.r.Peek(0) == ' ' {
				l.r.Move(1)
			}
			for {
				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
					l.text = l.r.Lexeme()[9:]
					if c == '>' {
						l.r.Move(1)
					}
					return DoctypeToken, l.r.Shift()
				}
				l.r.Move(1)
			}
		}
	}
	return CommentToken, l.shiftBogusComment()
}

func (l *Lexer) shiftBogusComment() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			return l.r.Shift()
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}

func (l *Lexer) shiftStartTag() (TokenType, []byte) {
	for {
		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	l.text = parse.ToLower(l.r.Lexeme()[1:])
	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
		if h == Svg || h == Math {
			data := l.shiftXML(h)
			if l.err != nil {
				return ErrorToken, nil
			}
			l.inTag = false
			if h == Svg {
				return SvgToken, data
			}
			return MathToken, data
		}
		l.rawTag = h
	}
	return StartTagToken, l.r.Shift()
}

func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			continue
		}
		break
	}
	nameHasTmpl := l.hasTmpl
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c := l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
					l.r.Move(len(l.tmplBegin))
					l.moveTemplate()
					l.hasTmpl = true
				} else if c == 0 && l.r.Err() != nil {
					break
				} else {
					l.r.Move(1)
				}
			}
		} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
			l.r.Move(len(l.tmplBegin))
			l.moveTemplate()
			l.hasTmpl = true
		} else { // attribute value unquoted state
			for {
				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
		l.r.Move(len(l.tmplBegin))
		l.moveTemplate()
		l.hasTmpl = true
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	if !nameHasTmpl {
		l.text = parse.ToLower(l.text)
	}
	return l.r.Shift()
}

func (l *Lexer) shiftEndTag() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			break
		} else if c == 0 && l.r.Err() != nil {
			l.text = l.r.Lexeme()[2:]
			break
		}
		l.r.Move(1)
	}

	end := len(l.text)
	for end > 0 {
		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			end--
			continue
		}
		break
	}
	l.text = l.text[:end]
	return parse.ToLower(l.r.Shift())
}

// shiftXML parses the content of an svg or math tag according to the XML 1.1 specifications, including the tag itself.
// So far we have already parsed `<svg` or `<math`.
func (l *Lexer) shiftXML(rawTag Hash) []byte {
	inQuote := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inQuote = !inQuote
			l.r.Move(1)
		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
			mark := l.r.Pos()
			l.r.Move(2)
			for {
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					break
				}
				l.r.Move(1)
			}
			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
				break
			}
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		} else {
			l.r.Move(1)
		}
	}

	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.r.Move(1)
			break
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return l.r.Shift()
		}
		l.r.Move(1)
	}
	return l.r.Shift()
}
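
// exampleLexSVG is a usage sketch showing that embedded SVG (and likewise
// MathML) is consumed as a whole by shiftXML, so the entire <svg>...</svg>
// element comes back as a single SvgToken. Illustration only;
// parse.NewInputString is assumed from github.com/tdewolff/parse/v2.
func exampleLexSVG() []byte {
	l := NewLexer(parse.NewInputString(`<p><svg viewBox="0 0 1 1"><rect/></svg></p>`))
	for {
		tt, data := l.Next()
		if tt == SvgToken || tt == ErrorToken {
			return data // for SvgToken, data spans the complete svg element
		}
	}
}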

func (l *Lexer) moveTemplate() {
	for {
		if c := l.r.Peek(0); c == 0 && l.r.Err() != nil {
			return
		} else if l.at(l.tmplEnd...) {
			l.r.Move(len(l.tmplEnd))
			return
		} else if c == '"' || c == '\'' {
			l.r.Move(1)
			escape := false
			for {
				if c2 := l.r.Peek(0); c2 == 0 && l.r.Err() != nil {
					return
				} else if !escape && c2 == c {
					l.r.Move(1)
					break
				} else if c2 == '\\' {
					escape = !escape
				} else {
					escape = false
				}
				l.r.Move(1)
			}
		} else {
			l.r.Move(1)
		}
	}
}

////////////////////////////////////////////////////////////////

// at returns true if the upcoming bytes equal b.
func (l *Lexer) at(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c {
			return false
		}
	}
	return true
}

// atCaseInsensitive returns true if the upcoming bytes match b case-insensitively, where b is given in lowercase.
func (l *Lexer) atCaseInsensitive(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
			return false
		}
	}
	return true
}