|
1 year ago | |
---|---|---|
.. | ||
README.md | 1 year ago | |
hash.go | 1 year ago | |
lex.go | 1 year ago | |
parse.go | 1 year ago | |
util.go | 1 year ago |
This package is an HTML5 lexer written in Go. It follows the "The HTML syntax" section of the HTML specification. The lexer takes an io.Reader and converts it into tokens until the EOF.
Run the following command
go get -u github.com/tdewolff/parse/v2/html
or add the following import to your project and run go get
import "github.com/tdewolff/parse/v2/html"
The following initializes a new Lexer with io.Reader r:
l := html.NewLexer(parse.NewInput(r))
To tokenize until EOF or an error occurs, use:
for {
tt, data := l.Next()
switch tt {
case html.ErrorToken:
// error or EOF set in l.Err()
return
case html.StartTagToken:
// ...
for {
ttAttr, dataAttr := l.Next()
if ttAttr != html.AttributeToken {
break
}
// ...
}
// ...
}
}
All tokens:
ErrorToken TokenType = iota // extra token when errors occur
CommentToken
DoctypeToken
StartTagToken
StartTagCloseToken
StartTagVoidToken
EndTagToken
AttributeToken
TextToken
package main
import (
"os"
"github.com/tdewolff/parse/v2/html"
)
// Tokenize HTML from stdin.
func main() {
l := html.NewLexer(parse.NewInput(os.Stdin))
for {
tt, data := l.Next()
switch tt {
case html.ErrorToken:
if l.Err() != io.EOF {
fmt.Println("Error on line", l.Line(), ":", l.Err())
}
return
case html.StartTagToken:
fmt.Println("Tag", string(data))
for {
ttAttr, dataAttr := l.Next()
if ttAttr != html.AttributeToken {
break
}
key := dataAttr
val := l.AttrVal()
fmt.Println("Attribute", string(key), "=", string(val))
}
// ...
}
}
}
Released under the MIT license.