// Package xml is an XML 1.0 lexer following the specifications at http://www.w3.org/TR/xml/.
package xml

import (
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, e.g. a start tag or an attribute.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DOCTYPEToken
	CDATAToken
	StartTagToken
	StartTagPIToken
	StartTagCloseToken
	StartTagCloseVoidToken
	StartTagClosePIToken
	EndTagToken
	AttributeToken
	TextToken
)

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case CommentToken:
		return "Comment"
	case DOCTYPEToken:
		return "DOCTYPE"
	case CDATAToken:
		return "CDATA"
	case StartTagToken:
		return "StartTag"
	case StartTagPIToken:
		return "StartTagPI"
	case StartTagCloseToken:
		return "StartTagClose"
	case StartTagCloseVoidToken:
		return "StartTagCloseVoid"
	case StartTagClosePIToken:
		return "StartTagClosePI"
	case EndTagToken:
		return "EndTag"
	case AttributeToken:
		return "Attribute"
	case TextToken:
		return "Text"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r       *parse.Input
	err     error
	inTag   bool
	text    []byte
	attrVal []byte
}

// NewLexer returns a new Lexer for a given parse.Input.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}
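
// exampleTokenLoop is an illustrative sketch added for documentation and is
// not part of the original lexer. It shows the typical construction and token
// loop; parse.NewInputString (assumed here, from parse/v2) builds an Input
// from a string, but any *parse.Input works the same way.
func exampleTokenLoop() {
	l := NewLexer(parse.NewInputString(`<book id="1">Title</book>`))
	for {
		tt, data := l.Next()
		if tt == ErrorToken {
			// l.Err() reports io.EOF once the input has been fully consumed.
			break
		}
		_ = data // raw bytes of the token, delimiters included
	}
}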

// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
func (l *Lexer) Err() error {
	if l.err != nil {
		return l.err
	}
	return l.r.Err()
}

// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
func (l *Lexer) Text() []byte {
	return l.text
}

// AttrVal returns the attribute value when an AttributeToken was returned from Next.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}

// Next returns the next Token. It returns ErrorToken when an error was encountered; use Err() to retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	var c byte
	if l.inTag {
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return ErrorToken, nil
		} else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') {
			return AttributeToken, l.shiftAttribute()
		}
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagCloseVoidToken, l.r.Shift()
		} else if c == '?' {
			l.r.Move(2)
			return StartTagClosePIToken, l.r.Shift()
		} else {
			l.r.Move(1)
			return StartTagCloseToken, l.r.Shift()
		}
	}
	for {
		c = l.r.Peek(0)
		if c == '<' {
			if l.r.Pos() > 0 {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			c = l.r.Peek(1)
			if c == '/' {
				l.r.Move(2)
				return EndTagToken, l.shiftEndTag()
			} else if c == '!' {
				l.r.Move(2)
				if l.at('-', '-') {
					l.r.Move(2)
					return CommentToken, l.shiftCommentText()
				} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
					l.r.Move(7)
					return CDATAToken, l.shiftCDATAText()
				} else if l.at('D', 'O', 'C', 'T', 'Y', 'P', 'E') {
					l.r.Move(7)
					return DOCTYPEToken, l.shiftDOCTYPEText()
				}
				l.r.Move(-2)
			} else if c == '?' {
				l.r.Move(2)
				l.inTag = true
				return StartTagPIToken, l.shiftStartTag()
			}
			l.r.Move(1)
			l.inTag = true
			return StartTagToken, l.shiftStartTag()
		} else if c == 0 {
			if l.r.Pos() > 0 {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer(l.r, "unexpected NULL character")
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
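
// exampleAttributes is an illustrative sketch added for documentation and is
// not part of the original lexer. It collects the attribute name/value pairs
// of the first start tag, assuming parse.NewInputString from parse/v2. Note
// that AttrVal keeps the surrounding quotes, while Text holds the bare name.
func exampleAttributes() map[string]string {
	l := NewLexer(parse.NewInputString(`<item id="1" lang='en'/>`))
	attrs := map[string]string{}
	for {
		switch tt, _ := l.Next(); tt {
		case AttributeToken:
			attrs[string(l.Text())] = string(l.AttrVal()) // e.g. id -> "1" (quotes included)
		case StartTagCloseToken, StartTagCloseVoidToken, StartTagClosePIToken, ErrorToken:
			return attrs
		}
	}
}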

////////////////////////////////////////////////////////////////

// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html

func (l *Lexer) shiftDOCTYPEText() []byte {
	inString := false
	inBrackets := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inString = !inString
		} else if (c == '[' || c == ']') && !inString {
			inBrackets = (c == '[')
		} else if c == '>' && !inString && !inBrackets {
			l.text = l.r.Lexeme()[9:]
			l.r.Move(1)
			return l.r.Shift()
		} else if c == 0 {
			l.text = l.r.Lexeme()[9:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}

func (l *Lexer) shiftCDATAText() []byte {
	for {
		c := l.r.Peek(0)
		if c == ']' && l.r.Peek(1) == ']' && l.r.Peek(2) == '>' {
			l.text = l.r.Lexeme()[9:]
			l.r.Move(3)
			return l.r.Shift()
		} else if c == 0 {
			l.text = l.r.Lexeme()[9:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}

func (l *Lexer) shiftCommentText() []byte {
	for {
		c := l.r.Peek(0)
		if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
			l.text = l.r.Lexeme()[4:]
			l.r.Move(3)
			return l.r.Shift()
		} else if c == 0 {
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}

func (l *Lexer) shiftStartTag() []byte {
	nameStart := l.r.Pos()
	for {
		if c := l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
			break
		}
		l.r.Move(1)
	}
	l.text = l.r.Lexeme()[nameStart:]
	return l.r.Shift()
}

func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			l.r.Move(1)
			continue
		}
		break
	}
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c = l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if c == 0 {
					break
				}
				l.r.Move(1)
				if c == '\t' || c == '\n' || c == '\r' {
					l.r.Lexeme()[l.r.Pos()-1] = ' '
				}
			}
		} else { // attribute value unquoted state
			for {
				if c = l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	return l.r.Shift()
}

func (l *Lexer) shiftEndTag() []byte {
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.text = l.r.Lexeme()[2:]
			l.r.Move(1)
			break
		} else if c == 0 {
			l.text = l.r.Lexeme()[2:]
			break
		}
		l.r.Move(1)
	}
	end := len(l.text)
	for end > 0 {
		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			end--
			continue
		}
		break
	}
	l.text = l.text[:end]
	return l.r.Shift()
}

////////////////////////////////////////////////////////////////

func (l *Lexer) at(b ...byte) bool {
	for i, c := range b {
		if l.r.Peek(i) != c {
			return false
		}
	}
	return true
}