lex.go 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224
  1. package toml
  2. import (
  3. "fmt"
  4. "reflect"
  5. "runtime"
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. )
// itemType identifies the kind of an item produced by the lexer.
type itemType int

// The item kinds. The values are assigned sequentially by iota, so the order
// of this list is significant.
const (
	itemError itemType = iota
	itemNIL // used in the parser to indicate no type
	itemEOF
	itemText
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)

// eof is the rune value that lexer.next returns once the input is exhausted.
const eof = 0

// stateFn is one state of the lexer's state machine: it is given the lexer and
// returns the state to run next.
type stateFn func(lx *lexer) stateFn
  38. func (p Position) String() string {
  39. return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
  40. }
// lexer holds the state of one lexing run over a single input string.
type lexer struct {
	input string  // the text being lexed
	start int     // byte offset in input where the pending item begins
	pos   int     // byte offset in input of the next rune to read
	line  int     // current line number; lex starts it at 1
	state stateFn // the state function nextItem runs next
	items chan item // buffered channel carrying emitted items to nextItem

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
// item is a single unit handed from the lexer to its consumer through
// lexer.items.
type item struct {
	typ itemType // kind of item
	val string   // the input text covered by the item (set by emit/emitTrim)
	err error    // set on the itemError items produced by the error helpers
	pos Position // where the item was found in the input
}
  67. func (lx *lexer) nextItem() item {
  68. for {
  69. select {
  70. case item := <-lx.items:
  71. return item
  72. default:
  73. lx.state = lx.state(lx)
  74. //fmt.Printf(" STATE %-24s current: %-10q stack: %s\n", lx.state, lx.current(), lx.stack)
  75. }
  76. }
  77. }
  78. func lex(input string) *lexer {
  79. lx := &lexer{
  80. input: input,
  81. state: lexTop,
  82. items: make(chan item, 10),
  83. stack: make([]stateFn, 0, 10),
  84. line: 1,
  85. }
  86. return lx
  87. }
  88. func (lx *lexer) push(state stateFn) {
  89. lx.stack = append(lx.stack, state)
  90. }
  91. func (lx *lexer) pop() stateFn {
  92. if len(lx.stack) == 0 {
  93. return lx.errorf("BUG in lexer: no states to pop")
  94. }
  95. last := lx.stack[len(lx.stack)-1]
  96. lx.stack = lx.stack[0 : len(lx.stack)-1]
  97. return last
  98. }
  99. func (lx *lexer) current() string {
  100. return lx.input[lx.start:lx.pos]
  101. }
  102. func (lx lexer) getPos() Position {
  103. p := Position{
  104. Line: lx.line,
  105. Start: lx.start,
  106. Len: lx.pos - lx.start,
  107. }
  108. if p.Len <= 0 {
  109. p.Len = 1
  110. }
  111. return p
  112. }
  113. func (lx *lexer) emit(typ itemType) {
  114. // Needed for multiline strings ending with an incomplete UTF-8 sequence.
  115. if lx.start > lx.pos {
  116. lx.error(errLexUTF8{lx.input[lx.pos]})
  117. return
  118. }
  119. lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
  120. lx.start = lx.pos
  121. }
  122. func (lx *lexer) emitTrim(typ itemType) {
  123. lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
  124. lx.start = lx.pos
  125. }
  126. func (lx *lexer) next() (r rune) {
  127. if lx.atEOF {
  128. panic("BUG in lexer: next called after EOF")
  129. }
  130. if lx.pos >= len(lx.input) {
  131. lx.atEOF = true
  132. return eof
  133. }
  134. if lx.input[lx.pos] == '\n' {
  135. lx.line++
  136. }
  137. lx.prevWidths[3] = lx.prevWidths[2]
  138. lx.prevWidths[2] = lx.prevWidths[1]
  139. lx.prevWidths[1] = lx.prevWidths[0]
  140. if lx.nprev < 4 {
  141. lx.nprev++
  142. }
  143. r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
  144. if r == utf8.RuneError {
  145. lx.error(errLexUTF8{lx.input[lx.pos]})
  146. return utf8.RuneError
  147. }
  148. // Note: don't use peek() here, as this calls next().
  149. if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
  150. lx.errorControlChar(r)
  151. return utf8.RuneError
  152. }
  153. lx.prevWidths[0] = w
  154. lx.pos += w
  155. return r
  156. }
  157. // ignore skips over the pending input before this point.
  158. func (lx *lexer) ignore() {
  159. lx.start = lx.pos
  160. }
  161. // backup steps back one rune. Can be called 4 times between calls to next.
  162. func (lx *lexer) backup() {
  163. if lx.atEOF {
  164. lx.atEOF = false
  165. return
  166. }
  167. if lx.nprev < 1 {
  168. panic("BUG in lexer: backed up too far")
  169. }
  170. w := lx.prevWidths[0]
  171. lx.prevWidths[0] = lx.prevWidths[1]
  172. lx.prevWidths[1] = lx.prevWidths[2]
  173. lx.prevWidths[2] = lx.prevWidths[3]
  174. lx.nprev--
  175. lx.pos -= w
  176. if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
  177. lx.line--
  178. }
  179. }
  180. // accept consumes the next rune if it's equal to `valid`.
  181. func (lx *lexer) accept(valid rune) bool {
  182. if lx.next() == valid {
  183. return true
  184. }
  185. lx.backup()
  186. return false
  187. }
  188. // peek returns but does not consume the next rune in the input.
  189. func (lx *lexer) peek() rune {
  190. r := lx.next()
  191. lx.backup()
  192. return r
  193. }
  194. // skip ignores all input that matches the given predicate.
  195. func (lx *lexer) skip(pred func(rune) bool) {
  196. for {
  197. r := lx.next()
  198. if pred(r) {
  199. continue
  200. }
  201. lx.backup()
  202. lx.ignore()
  203. return
  204. }
  205. }
  206. // error stops all lexing by emitting an error and returning `nil`.
  207. //
  208. // Note that any value that is a character is escaped if it's a special
  209. // character (newlines, tabs, etc.).
  210. func (lx *lexer) error(err error) stateFn {
  211. if lx.atEOF {
  212. return lx.errorPrevLine(err)
  213. }
  214. lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
  215. return nil
  216. }
  217. // errorfPrevline is like error(), but sets the position to the last column of
  218. // the previous line.
  219. //
  220. // This is so that unexpected EOF or NL errors don't show on a new blank line.
  221. func (lx *lexer) errorPrevLine(err error) stateFn {
  222. pos := lx.getPos()
  223. pos.Line--
  224. pos.Len = 1
  225. pos.Start = lx.pos - 1
  226. lx.items <- item{typ: itemError, pos: pos, err: err}
  227. return nil
  228. }
  229. // errorPos is like error(), but allows explicitly setting the position.
  230. func (lx *lexer) errorPos(start, length int, err error) stateFn {
  231. pos := lx.getPos()
  232. pos.Start = start
  233. pos.Len = length
  234. lx.items <- item{typ: itemError, pos: pos, err: err}
  235. return nil
  236. }
  237. // errorf is like error, and creates a new error.
  238. func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
  239. if lx.atEOF {
  240. pos := lx.getPos()
  241. pos.Line--
  242. pos.Len = 1
  243. pos.Start = lx.pos - 1
  244. lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
  245. return nil
  246. }
  247. lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
  248. return nil
  249. }
  250. func (lx *lexer) errorControlChar(cc rune) stateFn {
  251. return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
  252. }
  253. // lexTop consumes elements at the top level of TOML data.
  254. func lexTop(lx *lexer) stateFn {
  255. r := lx.next()
  256. if isWhitespace(r) || isNL(r) {
  257. return lexSkip(lx, lexTop)
  258. }
  259. switch r {
  260. case '#':
  261. lx.push(lexTop)
  262. return lexCommentStart
  263. case '[':
  264. return lexTableStart
  265. case eof:
  266. if lx.pos > lx.start {
  267. return lx.errorf("unexpected EOF")
  268. }
  269. lx.emit(itemEOF)
  270. return nil
  271. }
  272. // At this point, the only valid item can be a key, so we back up
  273. // and let the key lexer do the rest.
  274. lx.backup()
  275. lx.push(lexTopEnd)
  276. return lexKeyStart
  277. }
  278. // lexTopEnd is entered whenever a top-level item has been consumed. (A value
  279. // or a table.) It must see only whitespace, and will turn back to lexTop
  280. // upon a newline. If it sees EOF, it will quit the lexer successfully.
  281. func lexTopEnd(lx *lexer) stateFn {
  282. r := lx.next()
  283. switch {
  284. case r == '#':
  285. // a comment will read to a newline for us.
  286. lx.push(lexTop)
  287. return lexCommentStart
  288. case isWhitespace(r):
  289. return lexTopEnd
  290. case isNL(r):
  291. lx.ignore()
  292. return lexTop
  293. case r == eof:
  294. lx.emit(itemEOF)
  295. return nil
  296. }
  297. return lx.errorf(
  298. "expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
  299. r)
  300. }
  301. // lexTable lexes the beginning of a table. Namely, it makes sure that
  302. // it starts with a character other than '.' and ']'.
  303. // It assumes that '[' has already been consumed.
  304. // It also handles the case that this is an item in an array of tables.
  305. // e.g., '[[name]]'.
  306. func lexTableStart(lx *lexer) stateFn {
  307. if lx.peek() == '[' {
  308. lx.next()
  309. lx.emit(itemArrayTableStart)
  310. lx.push(lexArrayTableEnd)
  311. } else {
  312. lx.emit(itemTableStart)
  313. lx.push(lexTableEnd)
  314. }
  315. return lexTableNameStart
  316. }
  317. func lexTableEnd(lx *lexer) stateFn {
  318. lx.emit(itemTableEnd)
  319. return lexTopEnd
  320. }
  321. func lexArrayTableEnd(lx *lexer) stateFn {
  322. if r := lx.next(); r != ']' {
  323. return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
  324. }
  325. lx.emit(itemArrayTableEnd)
  326. return lexTopEnd
  327. }
  328. func lexTableNameStart(lx *lexer) stateFn {
  329. lx.skip(isWhitespace)
  330. switch r := lx.peek(); {
  331. case r == ']' || r == eof:
  332. return lx.errorf("unexpected end of table name (table names cannot be empty)")
  333. case r == '.':
  334. return lx.errorf("unexpected table separator (table names cannot be empty)")
  335. case r == '"' || r == '\'':
  336. lx.ignore()
  337. lx.push(lexTableNameEnd)
  338. return lexQuotedName
  339. default:
  340. lx.push(lexTableNameEnd)
  341. return lexBareName
  342. }
  343. }
  344. // lexTableNameEnd reads the end of a piece of a table name, optionally
  345. // consuming whitespace.
  346. func lexTableNameEnd(lx *lexer) stateFn {
  347. lx.skip(isWhitespace)
  348. switch r := lx.next(); {
  349. case isWhitespace(r):
  350. return lexTableNameEnd
  351. case r == '.':
  352. lx.ignore()
  353. return lexTableNameStart
  354. case r == ']':
  355. return lx.pop()
  356. default:
  357. return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
  358. }
  359. }
  360. // lexBareName lexes one part of a key or table.
  361. //
  362. // It assumes that at least one valid character for the table has already been
  363. // read.
  364. //
  365. // Lexes only one part, e.g. only 'a' inside 'a.b'.
  366. func lexBareName(lx *lexer) stateFn {
  367. r := lx.next()
  368. if isBareKeyChar(r) {
  369. return lexBareName
  370. }
  371. lx.backup()
  372. lx.emit(itemText)
  373. return lx.pop()
  374. }
  375. // lexBareName lexes one part of a key or table.
  376. //
  377. // It assumes that at least one valid character for the table has already been
  378. // read.
  379. //
  380. // Lexes only one part, e.g. only '"a"' inside '"a".b'.
  381. func lexQuotedName(lx *lexer) stateFn {
  382. r := lx.next()
  383. switch {
  384. case isWhitespace(r):
  385. return lexSkip(lx, lexValue)
  386. case r == '"':
  387. lx.ignore() // ignore the '"'
  388. return lexString
  389. case r == '\'':
  390. lx.ignore() // ignore the "'"
  391. return lexRawString
  392. case r == eof:
  393. return lx.errorf("unexpected EOF; expected value")
  394. default:
  395. return lx.errorf("expected value but found %q instead", r)
  396. }
  397. }
  398. // lexKeyStart consumes all key parts until a '='.
  399. func lexKeyStart(lx *lexer) stateFn {
  400. lx.skip(isWhitespace)
  401. switch r := lx.peek(); {
  402. case r == '=' || r == eof:
  403. return lx.errorf("unexpected '=': key name appears blank")
  404. case r == '.':
  405. return lx.errorf("unexpected '.': keys cannot start with a '.'")
  406. case r == '"' || r == '\'':
  407. lx.ignore()
  408. fallthrough
  409. default: // Bare key
  410. lx.emit(itemKeyStart)
  411. return lexKeyNameStart
  412. }
  413. }
  414. func lexKeyNameStart(lx *lexer) stateFn {
  415. lx.skip(isWhitespace)
  416. switch r := lx.peek(); {
  417. case r == '=' || r == eof:
  418. return lx.errorf("unexpected '='")
  419. case r == '.':
  420. return lx.errorf("unexpected '.'")
  421. case r == '"' || r == '\'':
  422. lx.ignore()
  423. lx.push(lexKeyEnd)
  424. return lexQuotedName
  425. default:
  426. lx.push(lexKeyEnd)
  427. return lexBareName
  428. }
  429. }
  430. // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
  431. // separator).
  432. func lexKeyEnd(lx *lexer) stateFn {
  433. lx.skip(isWhitespace)
  434. switch r := lx.next(); {
  435. case isWhitespace(r):
  436. return lexSkip(lx, lexKeyEnd)
  437. case r == eof:
  438. return lx.errorf("unexpected EOF; expected key separator '='")
  439. case r == '.':
  440. lx.ignore()
  441. return lexKeyNameStart
  442. case r == '=':
  443. lx.emit(itemKeyEnd)
  444. return lexSkip(lx, lexValue)
  445. default:
  446. return lx.errorf("expected '.' or '=', but got %q instead", r)
  447. }
  448. }
  449. // lexValue starts the consumption of a value anywhere a value is expected.
  450. // lexValue will ignore whitespace.
  451. // After a value is lexed, the last state on the next is popped and returned.
  452. func lexValue(lx *lexer) stateFn {
  453. // We allow whitespace to precede a value, but NOT newlines.
  454. // In array syntax, the array states are responsible for ignoring newlines.
  455. r := lx.next()
  456. switch {
  457. case isWhitespace(r):
  458. return lexSkip(lx, lexValue)
  459. case isDigit(r):
  460. lx.backup() // avoid an extra state and use the same as above
  461. return lexNumberOrDateStart
  462. }
  463. switch r {
  464. case '[':
  465. lx.ignore()
  466. lx.emit(itemArray)
  467. return lexArrayValue
  468. case '{':
  469. lx.ignore()
  470. lx.emit(itemInlineTableStart)
  471. return lexInlineTableValue
  472. case '"':
  473. if lx.accept('"') {
  474. if lx.accept('"') {
  475. lx.ignore() // Ignore """
  476. return lexMultilineString
  477. }
  478. lx.backup()
  479. }
  480. lx.ignore() // ignore the '"'
  481. return lexString
  482. case '\'':
  483. if lx.accept('\'') {
  484. if lx.accept('\'') {
  485. lx.ignore() // Ignore """
  486. return lexMultilineRawString
  487. }
  488. lx.backup()
  489. }
  490. lx.ignore() // ignore the "'"
  491. return lexRawString
  492. case '.': // special error case, be kind to users
  493. return lx.errorf("floats must start with a digit, not '.'")
  494. case 'i', 'n':
  495. if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
  496. lx.emit(itemFloat)
  497. return lx.pop()
  498. }
  499. case '-', '+':
  500. return lexDecimalNumberStart
  501. }
  502. if unicode.IsLetter(r) {
  503. // Be permissive here; lexBool will give a nice error if the
  504. // user wrote something like
  505. // x = foo
  506. // (i.e. not 'true' or 'false' but is something else word-like.)
  507. lx.backup()
  508. return lexBool
  509. }
  510. if r == eof {
  511. return lx.errorf("unexpected EOF; expected value")
  512. }
  513. return lx.errorf("expected value but found %q instead", r)
  514. }
  515. // lexArrayValue consumes one value in an array. It assumes that '[' or ','
  516. // have already been consumed. All whitespace and newlines are ignored.
  517. func lexArrayValue(lx *lexer) stateFn {
  518. r := lx.next()
  519. switch {
  520. case isWhitespace(r) || isNL(r):
  521. return lexSkip(lx, lexArrayValue)
  522. case r == '#':
  523. lx.push(lexArrayValue)
  524. return lexCommentStart
  525. case r == ',':
  526. return lx.errorf("unexpected comma")
  527. case r == ']':
  528. return lexArrayEnd
  529. }
  530. lx.backup()
  531. lx.push(lexArrayValueEnd)
  532. return lexValue
  533. }
  534. // lexArrayValueEnd consumes everything between the end of an array value and
  535. // the next value (or the end of the array): it ignores whitespace and newlines
  536. // and expects either a ',' or a ']'.
  537. func lexArrayValueEnd(lx *lexer) stateFn {
  538. switch r := lx.next(); {
  539. case isWhitespace(r) || isNL(r):
  540. return lexSkip(lx, lexArrayValueEnd)
  541. case r == '#':
  542. lx.push(lexArrayValueEnd)
  543. return lexCommentStart
  544. case r == ',':
  545. lx.ignore()
  546. return lexArrayValue // move on to the next value
  547. case r == ']':
  548. return lexArrayEnd
  549. default:
  550. return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
  551. }
  552. }
  553. // lexArrayEnd finishes the lexing of an array.
  554. // It assumes that a ']' has just been consumed.
  555. func lexArrayEnd(lx *lexer) stateFn {
  556. lx.ignore()
  557. lx.emit(itemArrayEnd)
  558. return lx.pop()
  559. }
  560. // lexInlineTableValue consumes one key/value pair in an inline table.
  561. // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
  562. func lexInlineTableValue(lx *lexer) stateFn {
  563. r := lx.next()
  564. switch {
  565. case isWhitespace(r):
  566. return lexSkip(lx, lexInlineTableValue)
  567. case isNL(r):
  568. return lx.errorPrevLine(errLexInlineTableNL{})
  569. case r == '#':
  570. lx.push(lexInlineTableValue)
  571. return lexCommentStart
  572. case r == ',':
  573. return lx.errorf("unexpected comma")
  574. case r == '}':
  575. return lexInlineTableEnd
  576. }
  577. lx.backup()
  578. lx.push(lexInlineTableValueEnd)
  579. return lexKeyStart
  580. }
  581. // lexInlineTableValueEnd consumes everything between the end of an inline table
  582. // key/value pair and the next pair (or the end of the table):
  583. // it ignores whitespace and expects either a ',' or a '}'.
  584. func lexInlineTableValueEnd(lx *lexer) stateFn {
  585. switch r := lx.next(); {
  586. case isWhitespace(r):
  587. return lexSkip(lx, lexInlineTableValueEnd)
  588. case isNL(r):
  589. return lx.errorPrevLine(errLexInlineTableNL{})
  590. case r == '#':
  591. lx.push(lexInlineTableValueEnd)
  592. return lexCommentStart
  593. case r == ',':
  594. lx.ignore()
  595. lx.skip(isWhitespace)
  596. if lx.peek() == '}' {
  597. return lx.errorf("trailing comma not allowed in inline tables")
  598. }
  599. return lexInlineTableValue
  600. case r == '}':
  601. return lexInlineTableEnd
  602. default:
  603. return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
  604. }
  605. }
  606. func runeOrEOF(r rune) string {
  607. if r == eof {
  608. return "end of file"
  609. }
  610. return "'" + string(r) + "'"
  611. }
  612. // lexInlineTableEnd finishes the lexing of an inline table.
  613. // It assumes that a '}' has just been consumed.
  614. func lexInlineTableEnd(lx *lexer) stateFn {
  615. lx.ignore()
  616. lx.emit(itemInlineTableEnd)
  617. return lx.pop()
  618. }
  619. // lexString consumes the inner contents of a string. It assumes that the
  620. // beginning '"' has already been consumed and ignored.
  621. func lexString(lx *lexer) stateFn {
  622. r := lx.next()
  623. switch {
  624. case r == eof:
  625. return lx.errorf(`unexpected EOF; expected '"'`)
  626. case isNL(r):
  627. return lx.errorPrevLine(errLexStringNL{})
  628. case r == '\\':
  629. lx.push(lexString)
  630. return lexStringEscape
  631. case r == '"':
  632. lx.backup()
  633. lx.emit(itemString)
  634. lx.next()
  635. lx.ignore()
  636. return lx.pop()
  637. }
  638. return lexString
  639. }
  640. // lexMultilineString consumes the inner contents of a string. It assumes that
  641. // the beginning '"""' has already been consumed and ignored.
  642. func lexMultilineString(lx *lexer) stateFn {
  643. r := lx.next()
  644. switch r {
  645. default:
  646. return lexMultilineString
  647. case eof:
  648. return lx.errorf(`unexpected EOF; expected '"""'`)
  649. case '\\':
  650. return lexMultilineStringEscape
  651. case '"':
  652. /// Found " → try to read two more "".
  653. if lx.accept('"') {
  654. if lx.accept('"') {
  655. /// Peek ahead: the string can contain " and "", including at the
  656. /// end: """str"""""
  657. /// 6 or more at the end, however, is an error.
  658. if lx.peek() == '"' {
  659. /// Check if we already lexed 5 's; if so we have 6 now, and
  660. /// that's just too many man!
  661. if strings.HasSuffix(lx.current(), `"""""`) {
  662. return lx.errorf(`unexpected '""""""'`)
  663. }
  664. lx.backup()
  665. lx.backup()
  666. return lexMultilineString
  667. }
  668. lx.backup() /// backup: don't include the """ in the item.
  669. lx.backup()
  670. lx.backup()
  671. lx.emit(itemMultilineString)
  672. lx.next() /// Read over ''' again and discard it.
  673. lx.next()
  674. lx.next()
  675. lx.ignore()
  676. return lx.pop()
  677. }
  678. lx.backup()
  679. }
  680. return lexMultilineString
  681. }
  682. }
  683. // lexRawString consumes a raw string. Nothing can be escaped in such a string.
  684. // It assumes that the beginning "'" has already been consumed and ignored.
  685. func lexRawString(lx *lexer) stateFn {
  686. r := lx.next()
  687. switch {
  688. default:
  689. return lexRawString
  690. case r == eof:
  691. return lx.errorf(`unexpected EOF; expected "'"`)
  692. case isNL(r):
  693. return lx.errorPrevLine(errLexStringNL{})
  694. case r == '\'':
  695. lx.backup()
  696. lx.emit(itemRawString)
  697. lx.next()
  698. lx.ignore()
  699. return lx.pop()
  700. }
  701. }
  702. // lexMultilineRawString consumes a raw string. Nothing can be escaped in such
  703. // a string. It assumes that the beginning "'''" has already been consumed and
  704. // ignored.
  705. func lexMultilineRawString(lx *lexer) stateFn {
  706. r := lx.next()
  707. switch r {
  708. default:
  709. return lexMultilineRawString
  710. case eof:
  711. return lx.errorf(`unexpected EOF; expected "'''"`)
  712. case '\'':
  713. /// Found ' → try to read two more ''.
  714. if lx.accept('\'') {
  715. if lx.accept('\'') {
  716. /// Peek ahead: the string can contain ' and '', including at the
  717. /// end: '''str'''''
  718. /// 6 or more at the end, however, is an error.
  719. if lx.peek() == '\'' {
  720. /// Check if we already lexed 5 's; if so we have 6 now, and
  721. /// that's just too many man!
  722. if strings.HasSuffix(lx.current(), "'''''") {
  723. return lx.errorf(`unexpected "''''''"`)
  724. }
  725. lx.backup()
  726. lx.backup()
  727. return lexMultilineRawString
  728. }
  729. lx.backup() /// backup: don't include the ''' in the item.
  730. lx.backup()
  731. lx.backup()
  732. lx.emit(itemRawMultilineString)
  733. lx.next() /// Read over ''' again and discard it.
  734. lx.next()
  735. lx.next()
  736. lx.ignore()
  737. return lx.pop()
  738. }
  739. lx.backup()
  740. }
  741. return lexMultilineRawString
  742. }
  743. }
  744. // lexMultilineStringEscape consumes an escaped character. It assumes that the
  745. // preceding '\\' has already been consumed.
  746. func lexMultilineStringEscape(lx *lexer) stateFn {
  747. // Handle the special case first:
  748. if isNL(lx.next()) {
  749. return lexMultilineString
  750. }
  751. lx.backup()
  752. lx.push(lexMultilineString)
  753. return lexStringEscape(lx)
  754. }
  755. func lexStringEscape(lx *lexer) stateFn {
  756. r := lx.next()
  757. switch r {
  758. case 'b':
  759. fallthrough
  760. case 't':
  761. fallthrough
  762. case 'n':
  763. fallthrough
  764. case 'f':
  765. fallthrough
  766. case 'r':
  767. fallthrough
  768. case '"':
  769. fallthrough
  770. case ' ', '\t':
  771. // Inside """ .. """ strings you can use \ to escape newlines, and any
  772. // amount of whitespace can be between the \ and \n.
  773. fallthrough
  774. case '\\':
  775. return lx.pop()
  776. case 'u':
  777. return lexShortUnicodeEscape
  778. case 'U':
  779. return lexLongUnicodeEscape
  780. }
  781. return lx.error(errLexEscape{r})
  782. }
  783. func lexShortUnicodeEscape(lx *lexer) stateFn {
  784. var r rune
  785. for i := 0; i < 4; i++ {
  786. r = lx.next()
  787. if !isHexadecimal(r) {
  788. return lx.errorf(
  789. `expected four hexadecimal digits after '\u', but got %q instead`,
  790. lx.current())
  791. }
  792. }
  793. return lx.pop()
  794. }
  795. func lexLongUnicodeEscape(lx *lexer) stateFn {
  796. var r rune
  797. for i := 0; i < 8; i++ {
  798. r = lx.next()
  799. if !isHexadecimal(r) {
  800. return lx.errorf(
  801. `expected eight hexadecimal digits after '\U', but got %q instead`,
  802. lx.current())
  803. }
  804. }
  805. return lx.pop()
  806. }
  807. // lexNumberOrDateStart processes the first character of a value which begins
  808. // with a digit. It exists to catch values starting with '0', so that
  809. // lexBaseNumberOrDate can differentiate base prefixed integers from other
  810. // types.
  811. func lexNumberOrDateStart(lx *lexer) stateFn {
  812. r := lx.next()
  813. switch r {
  814. case '0':
  815. return lexBaseNumberOrDate
  816. }
  817. if !isDigit(r) {
  818. // The only way to reach this state is if the value starts
  819. // with a digit, so specifically treat anything else as an
  820. // error.
  821. return lx.errorf("expected a digit but got %q", r)
  822. }
  823. return lexNumberOrDate
  824. }
  825. // lexNumberOrDate consumes either an integer, float or datetime.
  826. func lexNumberOrDate(lx *lexer) stateFn {
  827. r := lx.next()
  828. if isDigit(r) {
  829. return lexNumberOrDate
  830. }
  831. switch r {
  832. case '-', ':':
  833. return lexDatetime
  834. case '_':
  835. return lexDecimalNumber
  836. case '.', 'e', 'E':
  837. return lexFloat
  838. }
  839. lx.backup()
  840. lx.emit(itemInteger)
  841. return lx.pop()
  842. }
  843. // lexDatetime consumes a Datetime, to a first approximation.
  844. // The parser validates that it matches one of the accepted formats.
  845. func lexDatetime(lx *lexer) stateFn {
  846. r := lx.next()
  847. if isDigit(r) {
  848. return lexDatetime
  849. }
  850. switch r {
  851. case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
  852. return lexDatetime
  853. }
  854. lx.backup()
  855. lx.emitTrim(itemDatetime)
  856. return lx.pop()
  857. }
  858. // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
  859. func lexHexInteger(lx *lexer) stateFn {
  860. r := lx.next()
  861. if isHexadecimal(r) {
  862. return lexHexInteger
  863. }
  864. switch r {
  865. case '_':
  866. return lexHexInteger
  867. }
  868. lx.backup()
  869. lx.emit(itemInteger)
  870. return lx.pop()
  871. }
  872. // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
  873. func lexOctalInteger(lx *lexer) stateFn {
  874. r := lx.next()
  875. if isOctal(r) {
  876. return lexOctalInteger
  877. }
  878. switch r {
  879. case '_':
  880. return lexOctalInteger
  881. }
  882. lx.backup()
  883. lx.emit(itemInteger)
  884. return lx.pop()
  885. }
  886. // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
  887. func lexBinaryInteger(lx *lexer) stateFn {
  888. r := lx.next()
  889. if isBinary(r) {
  890. return lexBinaryInteger
  891. }
  892. switch r {
  893. case '_':
  894. return lexBinaryInteger
  895. }
  896. lx.backup()
  897. lx.emit(itemInteger)
  898. return lx.pop()
  899. }
  900. // lexDecimalNumber consumes a decimal float or integer.
  901. func lexDecimalNumber(lx *lexer) stateFn {
  902. r := lx.next()
  903. if isDigit(r) {
  904. return lexDecimalNumber
  905. }
  906. switch r {
  907. case '.', 'e', 'E':
  908. return lexFloat
  909. case '_':
  910. return lexDecimalNumber
  911. }
  912. lx.backup()
  913. lx.emit(itemInteger)
  914. return lx.pop()
  915. }
  916. // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  917. // It assumes the sign has already been consumed. Values which start with a sign
  918. // are only allowed to be decimal integers or floats.
  919. //
  920. // The special "nan" and "inf" values are also recognized.
  921. func lexDecimalNumberStart(lx *lexer) stateFn {
  922. r := lx.next()
  923. // Special error cases to give users better error messages
  924. switch r {
  925. case 'i':
  926. if !lx.accept('n') || !lx.accept('f') {
  927. return lx.errorf("invalid float: '%s'", lx.current())
  928. }
  929. lx.emit(itemFloat)
  930. return lx.pop()
  931. case 'n':
  932. if !lx.accept('a') || !lx.accept('n') {
  933. return lx.errorf("invalid float: '%s'", lx.current())
  934. }
  935. lx.emit(itemFloat)
  936. return lx.pop()
  937. case '0':
  938. p := lx.peek()
  939. switch p {
  940. case 'b', 'o', 'x':
  941. return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  942. }
  943. case '.':
  944. return lx.errorf("floats must start with a digit, not '.'")
  945. }
  946. if isDigit(r) {
  947. return lexDecimalNumber
  948. }
  949. return lx.errorf("expected a digit but got %q", r)
  950. }
  951. // lexBaseNumberOrDate differentiates between the possible values which
  952. // start with '0'. It assumes that before reaching this state, the initial '0'
  953. // has been consumed.
  954. func lexBaseNumberOrDate(lx *lexer) stateFn {
  955. r := lx.next()
  956. // Note: All datetimes start with at least two digits, so we don't
  957. // handle date characters (':', '-', etc.) here.
  958. if isDigit(r) {
  959. return lexNumberOrDate
  960. }
  961. switch r {
  962. case '_':
  963. // Can only be decimal, because there can't be an underscore
  964. // between the '0' and the base designator, and dates can't
  965. // contain underscores.
  966. return lexDecimalNumber
  967. case '.', 'e', 'E':
  968. return lexFloat
  969. case 'b':
  970. r = lx.peek()
  971. if !isBinary(r) {
  972. lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  973. }
  974. return lexBinaryInteger
  975. case 'o':
  976. r = lx.peek()
  977. if !isOctal(r) {
  978. lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  979. }
  980. return lexOctalInteger
  981. case 'x':
  982. r = lx.peek()
  983. if !isHexadecimal(r) {
  984. lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  985. }
  986. return lexHexInteger
  987. }
  988. lx.backup()
  989. lx.emit(itemInteger)
  990. return lx.pop()
  991. }
  992. // lexFloat consumes the elements of a float. It allows any sequence of
  993. // float-like characters, so floats emitted by the lexer are only a first
  994. // approximation and must be validated by the parser.
  995. func lexFloat(lx *lexer) stateFn {
  996. r := lx.next()
  997. if isDigit(r) {
  998. return lexFloat
  999. }
  1000. switch r {
  1001. case '_', '.', '-', '+', 'e', 'E':
  1002. return lexFloat
  1003. }
  1004. lx.backup()
  1005. lx.emit(itemFloat)
  1006. return lx.pop()
  1007. }
  1008. // lexBool consumes a bool string: 'true' or 'false.
  1009. func lexBool(lx *lexer) stateFn {
  1010. var rs []rune
  1011. for {
  1012. r := lx.next()
  1013. if !unicode.IsLetter(r) {
  1014. lx.backup()
  1015. break
  1016. }
  1017. rs = append(rs, r)
  1018. }
  1019. s := string(rs)
  1020. switch s {
  1021. case "true", "false":
  1022. lx.emit(itemBool)
  1023. return lx.pop()
  1024. }
  1025. return lx.errorf("expected value but found %q instead", s)
  1026. }
  1027. // lexCommentStart begins the lexing of a comment. It will emit
  1028. // itemCommentStart and consume no characters, passing control to lexComment.
  1029. func lexCommentStart(lx *lexer) stateFn {
  1030. lx.ignore()
  1031. lx.emit(itemCommentStart)
  1032. return lexComment
  1033. }
  1034. // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1035. // It will consume *up to* the first newline character, and pass control
  1036. // back to the last state on the stack.
  1037. func lexComment(lx *lexer) stateFn {
  1038. switch r := lx.next(); {
  1039. case isNL(r) || r == eof:
  1040. lx.backup()
  1041. lx.emit(itemText)
  1042. return lx.pop()
  1043. default:
  1044. return lexComment
  1045. }
  1046. }
  1047. // lexSkip ignores all slurped input and moves on to the next state.
  1048. func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1049. lx.ignore()
  1050. return nextState
  1051. }
  1052. func (s stateFn) String() string {
  1053. name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1054. if i := strings.LastIndexByte(name, '.'); i > -1 {
  1055. name = name[i+1:]
  1056. }
  1057. if s == nil {
  1058. name = "<nil>"
  1059. }
  1060. return name + "()"
  1061. }
  1062. func (itype itemType) String() string {
  1063. switch itype {
  1064. case itemError:
  1065. return "Error"
  1066. case itemNIL:
  1067. return "NIL"
  1068. case itemEOF:
  1069. return "EOF"
  1070. case itemText:
  1071. return "Text"
  1072. case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1073. return "String"
  1074. case itemBool:
  1075. return "Bool"
  1076. case itemInteger:
  1077. return "Integer"
  1078. case itemFloat:
  1079. return "Float"
  1080. case itemDatetime:
  1081. return "DateTime"
  1082. case itemTableStart:
  1083. return "TableStart"
  1084. case itemTableEnd:
  1085. return "TableEnd"
  1086. case itemKeyStart:
  1087. return "KeyStart"
  1088. case itemKeyEnd:
  1089. return "KeyEnd"
  1090. case itemArray:
  1091. return "Array"
  1092. case itemArrayEnd:
  1093. return "ArrayEnd"
  1094. case itemCommentStart:
  1095. return "CommentStart"
  1096. case itemInlineTableStart:
  1097. return "InlineTableStart"
  1098. case itemInlineTableEnd:
  1099. return "InlineTableEnd"
  1100. }
  1101. panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1102. }
  1103. func (item item) String() string {
  1104. return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1105. }
  1106. func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
  1107. func isNL(r rune) bool { return r == '\n' || r == '\r' }
  1108. func isControl(r rune) bool { // Control characters except \t, \r, \n
  1109. switch r {
  1110. case '\t', '\r', '\n':
  1111. return false
  1112. default:
  1113. return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1114. }
  1115. }
  1116. func isDigit(r rune) bool { return r >= '0' && r <= '9' }
  1117. func isBinary(r rune) bool { return r == '0' || r == '1' }
  1118. func isOctal(r rune) bool { return r >= '0' && r <= '7' }
  1119. func isHexadecimal(r rune) bool {
  1120. return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
  1121. }
  1122. func isBareKeyChar(r rune) bool {
  1123. return (r >= 'A' && r <= 'Z') ||
  1124. (r >= 'a' && r <= 'z') ||
  1125. (r >= '0' && r <= '9') ||
  1126. r == '_' || r == '-'
  1127. }