parse.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. // Package json is a JSON parser following the specifications at http://json.org/.
  2. package json
  3. import (
  4. "strconv"
  5. "github.com/tdewolff/parse/v2"
  6. )
  7. // GrammarType determines the type of grammar
  8. type GrammarType uint32
  9. // GrammarType values.
  10. const (
  11. ErrorGrammar GrammarType = iota // extra grammar when errors occur
  12. WhitespaceGrammar
  13. LiteralGrammar
  14. NumberGrammar
  15. StringGrammar
  16. StartObjectGrammar // {
  17. EndObjectGrammar // }
  18. StartArrayGrammar // [
  19. EndArrayGrammar // ]
  20. )
  21. // String returns the string representation of a GrammarType.
  22. func (gt GrammarType) String() string {
  23. switch gt {
  24. case ErrorGrammar:
  25. return "Error"
  26. case WhitespaceGrammar:
  27. return "Whitespace"
  28. case LiteralGrammar:
  29. return "Literal"
  30. case NumberGrammar:
  31. return "Number"
  32. case StringGrammar:
  33. return "String"
  34. case StartObjectGrammar:
  35. return "StartObject"
  36. case EndObjectGrammar:
  37. return "EndObject"
  38. case StartArrayGrammar:
  39. return "StartArray"
  40. case EndArrayGrammar:
  41. return "EndArray"
  42. }
  43. return "Invalid(" + strconv.Itoa(int(gt)) + ")"
  44. }
  45. ////////////////////////////////////////////////////////////////
  46. // State determines the current state the parser is in.
  47. type State uint32
  48. // State values.
  49. const (
  50. ValueState State = iota // extra token when errors occur
  51. ObjectKeyState
  52. ObjectValueState
  53. ArrayState
  54. )
  55. // String returns the string representation of a State.
  56. func (state State) String() string {
  57. switch state {
  58. case ValueState:
  59. return "Value"
  60. case ObjectKeyState:
  61. return "ObjectKey"
  62. case ObjectValueState:
  63. return "ObjectValue"
  64. case ArrayState:
  65. return "Array"
  66. }
  67. return "Invalid(" + strconv.Itoa(int(state)) + ")"
  68. }
  69. ////////////////////////////////////////////////////////////////
  70. // Parser is the state for the lexer.
  71. type Parser struct {
  72. r *parse.Input
  73. state []State
  74. err error
  75. needComma bool
  76. }
  77. // NewParser returns a new Parser for a given io.Reader.
  78. func NewParser(r *parse.Input) *Parser {
  79. return &Parser{
  80. r: r,
  81. state: []State{ValueState},
  82. }
  83. }
  84. // Err returns the error encountered during tokenization, this is often io.EOF but also other errors can be returned.
  85. func (p *Parser) Err() error {
  86. if p.err != nil {
  87. return p.err
  88. }
  89. return p.r.Err()
  90. }
  91. // State returns the state the parser is currently in (ie. which token is expected).
  92. func (p *Parser) State() State {
  93. return p.state[len(p.state)-1]
  94. }
  95. // Next returns the next Grammar. It returns ErrorGrammar when an error was encountered. Using Err() one can retrieve the error message.
  96. func (p *Parser) Next() (GrammarType, []byte) {
  97. p.moveWhitespace()
  98. c := p.r.Peek(0)
  99. state := p.state[len(p.state)-1]
  100. if c == ',' {
  101. if state != ArrayState && state != ObjectKeyState {
  102. p.err = parse.NewErrorLexer(p.r, "unexpected comma character")
  103. return ErrorGrammar, nil
  104. }
  105. p.r.Move(1)
  106. p.moveWhitespace()
  107. p.needComma = false
  108. c = p.r.Peek(0)
  109. }
  110. p.r.Skip()
  111. if p.needComma && c != '}' && c != ']' && c != 0 {
  112. p.err = parse.NewErrorLexer(p.r, "expected comma character or an array or object ending")
  113. return ErrorGrammar, nil
  114. } else if c == '{' {
  115. p.state = append(p.state, ObjectKeyState)
  116. p.r.Move(1)
  117. return StartObjectGrammar, p.r.Shift()
  118. } else if c == '}' {
  119. if state != ObjectKeyState {
  120. p.err = parse.NewErrorLexer(p.r, "unexpected right brace character")
  121. return ErrorGrammar, nil
  122. }
  123. p.needComma = true
  124. p.state = p.state[:len(p.state)-1]
  125. if p.state[len(p.state)-1] == ObjectValueState {
  126. p.state[len(p.state)-1] = ObjectKeyState
  127. }
  128. p.r.Move(1)
  129. return EndObjectGrammar, p.r.Shift()
  130. } else if c == '[' {
  131. p.state = append(p.state, ArrayState)
  132. p.r.Move(1)
  133. return StartArrayGrammar, p.r.Shift()
  134. } else if c == ']' {
  135. p.needComma = true
  136. if state != ArrayState {
  137. p.err = parse.NewErrorLexer(p.r, "unexpected right bracket character")
  138. return ErrorGrammar, nil
  139. }
  140. p.state = p.state[:len(p.state)-1]
  141. if p.state[len(p.state)-1] == ObjectValueState {
  142. p.state[len(p.state)-1] = ObjectKeyState
  143. }
  144. p.r.Move(1)
  145. return EndArrayGrammar, p.r.Shift()
  146. } else if state == ObjectKeyState {
  147. if c != '"' || !p.consumeStringToken() {
  148. p.err = parse.NewErrorLexer(p.r, "expected object key to be a quoted string")
  149. return ErrorGrammar, nil
  150. }
  151. n := p.r.Pos()
  152. p.moveWhitespace()
  153. if c := p.r.Peek(0); c != ':' {
  154. p.err = parse.NewErrorLexer(p.r, "expected colon character after object key")
  155. return ErrorGrammar, nil
  156. }
  157. p.r.Move(1)
  158. p.state[len(p.state)-1] = ObjectValueState
  159. return StringGrammar, p.r.Shift()[:n]
  160. } else {
  161. p.needComma = true
  162. if state == ObjectValueState {
  163. p.state[len(p.state)-1] = ObjectKeyState
  164. }
  165. if c == '"' && p.consumeStringToken() {
  166. return StringGrammar, p.r.Shift()
  167. } else if p.consumeNumberToken() {
  168. return NumberGrammar, p.r.Shift()
  169. } else if p.consumeLiteralToken() {
  170. return LiteralGrammar, p.r.Shift()
  171. }
  172. c := p.r.Peek(0) // pick up movement from consumeStringToken to detect NULL or EOF
  173. if c == 0 && p.r.Err() == nil {
  174. p.err = parse.NewErrorLexer(p.r, "unexpected NULL character")
  175. return ErrorGrammar, nil
  176. } else if c == 0 { // EOF
  177. return ErrorGrammar, nil
  178. }
  179. }
  180. p.err = parse.NewErrorLexer(p.r, "unexpected character '%c'", c)
  181. return ErrorGrammar, nil
  182. }
  183. ////////////////////////////////////////////////////////////////
  184. /*
  185. The following functions follow the specifications at http://json.org/
  186. */
  187. func (p *Parser) moveWhitespace() {
  188. for {
  189. if c := p.r.Peek(0); c != ' ' && c != '\n' && c != '\r' && c != '\t' {
  190. break
  191. }
  192. p.r.Move(1)
  193. }
  194. }
  195. func (p *Parser) consumeLiteralToken() bool {
  196. c := p.r.Peek(0)
  197. if c == 't' && p.r.Peek(1) == 'r' && p.r.Peek(2) == 'u' && p.r.Peek(3) == 'e' {
  198. p.r.Move(4)
  199. return true
  200. } else if c == 'f' && p.r.Peek(1) == 'a' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 's' && p.r.Peek(4) == 'e' {
  201. p.r.Move(5)
  202. return true
  203. } else if c == 'n' && p.r.Peek(1) == 'u' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 'l' {
  204. p.r.Move(4)
  205. return true
  206. }
  207. return false
  208. }
  209. func (p *Parser) consumeNumberToken() bool {
  210. mark := p.r.Pos()
  211. if p.r.Peek(0) == '-' {
  212. p.r.Move(1)
  213. }
  214. c := p.r.Peek(0)
  215. if c >= '1' && c <= '9' {
  216. p.r.Move(1)
  217. for {
  218. if c := p.r.Peek(0); c < '0' || c > '9' {
  219. break
  220. }
  221. p.r.Move(1)
  222. }
  223. } else if c != '0' {
  224. p.r.Rewind(mark)
  225. return false
  226. } else {
  227. p.r.Move(1) // 0
  228. }
  229. if c := p.r.Peek(0); c == '.' {
  230. p.r.Move(1)
  231. if c := p.r.Peek(0); c < '0' || c > '9' {
  232. p.r.Move(-1)
  233. return true
  234. }
  235. for {
  236. if c := p.r.Peek(0); c < '0' || c > '9' {
  237. break
  238. }
  239. p.r.Move(1)
  240. }
  241. }
  242. mark = p.r.Pos()
  243. if c := p.r.Peek(0); c == 'e' || c == 'E' {
  244. p.r.Move(1)
  245. if c := p.r.Peek(0); c == '+' || c == '-' {
  246. p.r.Move(1)
  247. }
  248. if c := p.r.Peek(0); c < '0' || c > '9' {
  249. p.r.Rewind(mark)
  250. return true
  251. }
  252. for {
  253. if c := p.r.Peek(0); c < '0' || c > '9' {
  254. break
  255. }
  256. p.r.Move(1)
  257. }
  258. }
  259. return true
  260. }
  261. func (p *Parser) consumeStringToken() bool {
  262. // assume to be on "
  263. p.r.Move(1)
  264. for {
  265. c := p.r.Peek(0)
  266. if c == '"' {
  267. escaped := false
  268. for i := p.r.Pos() - 1; i >= 0; i-- {
  269. if p.r.Lexeme()[i] == '\\' {
  270. escaped = !escaped
  271. } else {
  272. break
  273. }
  274. }
  275. if !escaped {
  276. p.r.Move(1)
  277. break
  278. }
  279. } else if c == 0 {
  280. return false
  281. }
  282. p.r.Move(1)
  283. }
  284. return true
  285. }