lex.go 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283
  1. package toml
  2. import (
  3. "fmt"
  4. "reflect"
  5. "runtime"
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. )
// itemType identifies the kind of an item emitted by the lexer.
type itemType int

// The item kinds. Values are assigned with iota, so itemError is the zero
// value of itemType.
const (
	// itemError is the kind used by the lexer's error helpers; such items
	// carry an err instead of a val.
	itemError itemType = iota
	// itemNIL is used in the parser to indicate no type.
	itemNIL
	// itemEOF is emitted once the end of the input is reached at top level.
	itemEOF
	// itemText is emitted for one bare part of a key or table name.
	itemText
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	// itemArray marks the start of an array.
	itemArray
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
// eof is the rune returned by lexer.next once the input is exhausted.
const eof = 0

// stateFn is one state of the lexer's state machine. It consumes input from
// lx and returns the state to run next. The error helpers, and the EOF
// branches of lexTop and lexTopEnd, return nil after emitting their final
// item.
type stateFn func(lx *lexer) stateFn
  38. func (p Position) String() string {
  39. return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
  40. }
// lexer holds the scanning state for one input string.
type lexer struct {
	// input is the complete text being lexed.
	input string
	// start is the byte offset in input where the pending item begins.
	start int
	// pos is the byte offset in input of the next rune to read.
	pos int
	// line is the current line number; lex initialises it to 1.
	line int
	// state is the state function nextItem runs next.
	state stateFn
	// items is the buffered queue of emitted items, drained by nextItem.
	items chan item
	// tomlNext switches on the alternative behaviour guarded by this flag in
	// the state functions: the \e and \x escapes, newlines and trailing
	// commas in inline tables, and the bare-key character set.
	tomlNext bool

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	// nprev is how many of prevWidths are in use.
	nprev int
	// atEOF is set once next has returned eof. If we emit an eof, we can
	// still back up, but it is not OK to call next again.
	atEOF bool

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
// item is a single token produced by the lexer.
type item struct {
	// typ is the kind of item.
	typ itemType
	// val is the lexed text; emit and emitTrim fill it from lexer.current().
	// The error helpers leave it empty.
	val string
	// err is set by the error helpers, which emit items of type itemError.
	err error
	// pos is where the item was found in the input.
	pos Position
}
  68. func (lx *lexer) nextItem() item {
  69. for {
  70. select {
  71. case item := <-lx.items:
  72. return item
  73. default:
  74. lx.state = lx.state(lx)
  75. //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack)
  76. }
  77. }
  78. }
  79. func lex(input string, tomlNext bool) *lexer {
  80. lx := &lexer{
  81. input: input,
  82. state: lexTop,
  83. items: make(chan item, 10),
  84. stack: make([]stateFn, 0, 10),
  85. line: 1,
  86. tomlNext: tomlNext,
  87. }
  88. return lx
  89. }
// push places state on top of the state stack, to be resumed by a later pop.
func (lx *lexer) push(state stateFn) {
	lx.stack = append(lx.stack, state)
}
  93. func (lx *lexer) pop() stateFn {
  94. if len(lx.stack) == 0 {
  95. return lx.errorf("BUG in lexer: no states to pop")
  96. }
  97. last := lx.stack[len(lx.stack)-1]
  98. lx.stack = lx.stack[0 : len(lx.stack)-1]
  99. return last
  100. }
  101. func (lx *lexer) current() string {
  102. return lx.input[lx.start:lx.pos]
  103. }
  104. func (lx lexer) getPos() Position {
  105. p := Position{
  106. Line: lx.line,
  107. Start: lx.start,
  108. Len: lx.pos - lx.start,
  109. }
  110. if p.Len <= 0 {
  111. p.Len = 1
  112. }
  113. return p
  114. }
  115. func (lx *lexer) emit(typ itemType) {
  116. // Needed for multiline strings ending with an incomplete UTF-8 sequence.
  117. if lx.start > lx.pos {
  118. lx.error(errLexUTF8{lx.input[lx.pos]})
  119. return
  120. }
  121. lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
  122. lx.start = lx.pos
  123. }
  124. func (lx *lexer) emitTrim(typ itemType) {
  125. lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
  126. lx.start = lx.pos
  127. }
  128. func (lx *lexer) next() (r rune) {
  129. if lx.atEOF {
  130. panic("BUG in lexer: next called after EOF")
  131. }
  132. if lx.pos >= len(lx.input) {
  133. lx.atEOF = true
  134. return eof
  135. }
  136. if lx.input[lx.pos] == '\n' {
  137. lx.line++
  138. }
  139. lx.prevWidths[3] = lx.prevWidths[2]
  140. lx.prevWidths[2] = lx.prevWidths[1]
  141. lx.prevWidths[1] = lx.prevWidths[0]
  142. if lx.nprev < 4 {
  143. lx.nprev++
  144. }
  145. r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
  146. if r == utf8.RuneError {
  147. lx.error(errLexUTF8{lx.input[lx.pos]})
  148. return utf8.RuneError
  149. }
  150. // Note: don't use peek() here, as this calls next().
  151. if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
  152. lx.errorControlChar(r)
  153. return utf8.RuneError
  154. }
  155. lx.prevWidths[0] = w
  156. lx.pos += w
  157. return r
  158. }
// ignore skips over the pending input before this point: it moves start up
// to pos, so the text read so far is left out of the next emitted item.
func (lx *lexer) ignore() {
	lx.start = lx.pos
}
  163. // backup steps back one rune. Can be called 4 times between calls to next.
  164. func (lx *lexer) backup() {
  165. if lx.atEOF {
  166. lx.atEOF = false
  167. return
  168. }
  169. if lx.nprev < 1 {
  170. panic("BUG in lexer: backed up too far")
  171. }
  172. w := lx.prevWidths[0]
  173. lx.prevWidths[0] = lx.prevWidths[1]
  174. lx.prevWidths[1] = lx.prevWidths[2]
  175. lx.prevWidths[2] = lx.prevWidths[3]
  176. lx.nprev--
  177. lx.pos -= w
  178. if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
  179. lx.line--
  180. }
  181. }
  182. // accept consumes the next rune if it's equal to `valid`.
  183. func (lx *lexer) accept(valid rune) bool {
  184. if lx.next() == valid {
  185. return true
  186. }
  187. lx.backup()
  188. return false
  189. }
  190. // peek returns but does not consume the next rune in the input.
  191. func (lx *lexer) peek() rune {
  192. r := lx.next()
  193. lx.backup()
  194. return r
  195. }
  196. // skip ignores all input that matches the given predicate.
  197. func (lx *lexer) skip(pred func(rune) bool) {
  198. for {
  199. r := lx.next()
  200. if pred(r) {
  201. continue
  202. }
  203. lx.backup()
  204. lx.ignore()
  205. return
  206. }
  207. }
  208. // error stops all lexing by emitting an error and returning `nil`.
  209. //
  210. // Note that any value that is a character is escaped if it's a special
  211. // character (newlines, tabs, etc.).
  212. func (lx *lexer) error(err error) stateFn {
  213. if lx.atEOF {
  214. return lx.errorPrevLine(err)
  215. }
  216. lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
  217. return nil
  218. }
  219. // errorfPrevline is like error(), but sets the position to the last column of
  220. // the previous line.
  221. //
  222. // This is so that unexpected EOF or NL errors don't show on a new blank line.
  223. func (lx *lexer) errorPrevLine(err error) stateFn {
  224. pos := lx.getPos()
  225. pos.Line--
  226. pos.Len = 1
  227. pos.Start = lx.pos - 1
  228. lx.items <- item{typ: itemError, pos: pos, err: err}
  229. return nil
  230. }
  231. // errorPos is like error(), but allows explicitly setting the position.
  232. func (lx *lexer) errorPos(start, length int, err error) stateFn {
  233. pos := lx.getPos()
  234. pos.Start = start
  235. pos.Len = length
  236. lx.items <- item{typ: itemError, pos: pos, err: err}
  237. return nil
  238. }
  239. // errorf is like error, and creates a new error.
  240. func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
  241. if lx.atEOF {
  242. pos := lx.getPos()
  243. pos.Line--
  244. pos.Len = 1
  245. pos.Start = lx.pos - 1
  246. lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
  247. return nil
  248. }
  249. lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
  250. return nil
  251. }
  252. func (lx *lexer) errorControlChar(cc rune) stateFn {
  253. return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
  254. }
  255. // lexTop consumes elements at the top level of TOML data.
  256. func lexTop(lx *lexer) stateFn {
  257. r := lx.next()
  258. if isWhitespace(r) || isNL(r) {
  259. return lexSkip(lx, lexTop)
  260. }
  261. switch r {
  262. case '#':
  263. lx.push(lexTop)
  264. return lexCommentStart
  265. case '[':
  266. return lexTableStart
  267. case eof:
  268. if lx.pos > lx.start {
  269. return lx.errorf("unexpected EOF")
  270. }
  271. lx.emit(itemEOF)
  272. return nil
  273. }
  274. // At this point, the only valid item can be a key, so we back up
  275. // and let the key lexer do the rest.
  276. lx.backup()
  277. lx.push(lexTopEnd)
  278. return lexKeyStart
  279. }
  280. // lexTopEnd is entered whenever a top-level item has been consumed. (A value
  281. // or a table.) It must see only whitespace, and will turn back to lexTop
  282. // upon a newline. If it sees EOF, it will quit the lexer successfully.
  283. func lexTopEnd(lx *lexer) stateFn {
  284. r := lx.next()
  285. switch {
  286. case r == '#':
  287. // a comment will read to a newline for us.
  288. lx.push(lexTop)
  289. return lexCommentStart
  290. case isWhitespace(r):
  291. return lexTopEnd
  292. case isNL(r):
  293. lx.ignore()
  294. return lexTop
  295. case r == eof:
  296. lx.emit(itemEOF)
  297. return nil
  298. }
  299. return lx.errorf(
  300. "expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
  301. r)
  302. }
  303. // lexTable lexes the beginning of a table. Namely, it makes sure that
  304. // it starts with a character other than '.' and ']'.
  305. // It assumes that '[' has already been consumed.
  306. // It also handles the case that this is an item in an array of tables.
  307. // e.g., '[[name]]'.
  308. func lexTableStart(lx *lexer) stateFn {
  309. if lx.peek() == '[' {
  310. lx.next()
  311. lx.emit(itemArrayTableStart)
  312. lx.push(lexArrayTableEnd)
  313. } else {
  314. lx.emit(itemTableStart)
  315. lx.push(lexTableEnd)
  316. }
  317. return lexTableNameStart
  318. }
// lexTableEnd emits itemTableEnd and continues with lexTopEnd. lexTableStart
// pushes it for a '[name]' header; lexTableNameEnd pops the stack once it has
// consumed the closing ']'.
func lexTableEnd(lx *lexer) stateFn {
	lx.emit(itemTableEnd)
	return lexTopEnd
}
  323. func lexArrayTableEnd(lx *lexer) stateFn {
  324. if r := lx.next(); r != ']' {
  325. return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
  326. }
  327. lx.emit(itemArrayTableEnd)
  328. return lexTopEnd
  329. }
  330. func lexTableNameStart(lx *lexer) stateFn {
  331. lx.skip(isWhitespace)
  332. switch r := lx.peek(); {
  333. case r == ']' || r == eof:
  334. return lx.errorf("unexpected end of table name (table names cannot be empty)")
  335. case r == '.':
  336. return lx.errorf("unexpected table separator (table names cannot be empty)")
  337. case r == '"' || r == '\'':
  338. lx.ignore()
  339. lx.push(lexTableNameEnd)
  340. return lexQuotedName
  341. default:
  342. lx.push(lexTableNameEnd)
  343. return lexBareName
  344. }
  345. }
  346. // lexTableNameEnd reads the end of a piece of a table name, optionally
  347. // consuming whitespace.
  348. func lexTableNameEnd(lx *lexer) stateFn {
  349. lx.skip(isWhitespace)
  350. switch r := lx.next(); {
  351. case isWhitespace(r):
  352. return lexTableNameEnd
  353. case r == '.':
  354. lx.ignore()
  355. return lexTableNameStart
  356. case r == ']':
  357. return lx.pop()
  358. default:
  359. return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
  360. }
  361. }
  362. // lexBareName lexes one part of a key or table.
  363. //
  364. // It assumes that at least one valid character for the table has already been
  365. // read.
  366. //
  367. // Lexes only one part, e.g. only 'a' inside 'a.b'.
  368. func lexBareName(lx *lexer) stateFn {
  369. r := lx.next()
  370. if isBareKeyChar(r, lx.tomlNext) {
  371. return lexBareName
  372. }
  373. lx.backup()
  374. lx.emit(itemText)
  375. return lx.pop()
  376. }
  377. // lexBareName lexes one part of a key or table.
  378. //
  379. // It assumes that at least one valid character for the table has already been
  380. // read.
  381. //
  382. // Lexes only one part, e.g. only '"a"' inside '"a".b'.
  383. func lexQuotedName(lx *lexer) stateFn {
  384. r := lx.next()
  385. switch {
  386. case isWhitespace(r):
  387. return lexSkip(lx, lexValue)
  388. case r == '"':
  389. lx.ignore() // ignore the '"'
  390. return lexString
  391. case r == '\'':
  392. lx.ignore() // ignore the "'"
  393. return lexRawString
  394. case r == eof:
  395. return lx.errorf("unexpected EOF; expected value")
  396. default:
  397. return lx.errorf("expected value but found %q instead", r)
  398. }
  399. }
  400. // lexKeyStart consumes all key parts until a '='.
  401. func lexKeyStart(lx *lexer) stateFn {
  402. lx.skip(isWhitespace)
  403. switch r := lx.peek(); {
  404. case r == '=' || r == eof:
  405. return lx.errorf("unexpected '=': key name appears blank")
  406. case r == '.':
  407. return lx.errorf("unexpected '.': keys cannot start with a '.'")
  408. case r == '"' || r == '\'':
  409. lx.ignore()
  410. fallthrough
  411. default: // Bare key
  412. lx.emit(itemKeyStart)
  413. return lexKeyNameStart
  414. }
  415. }
  416. func lexKeyNameStart(lx *lexer) stateFn {
  417. lx.skip(isWhitespace)
  418. switch r := lx.peek(); {
  419. case r == '=' || r == eof:
  420. return lx.errorf("unexpected '='")
  421. case r == '.':
  422. return lx.errorf("unexpected '.'")
  423. case r == '"' || r == '\'':
  424. lx.ignore()
  425. lx.push(lexKeyEnd)
  426. return lexQuotedName
  427. default:
  428. lx.push(lexKeyEnd)
  429. return lexBareName
  430. }
  431. }
  432. // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
  433. // separator).
  434. func lexKeyEnd(lx *lexer) stateFn {
  435. lx.skip(isWhitespace)
  436. switch r := lx.next(); {
  437. case isWhitespace(r):
  438. return lexSkip(lx, lexKeyEnd)
  439. case r == eof:
  440. return lx.errorf("unexpected EOF; expected key separator '='")
  441. case r == '.':
  442. lx.ignore()
  443. return lexKeyNameStart
  444. case r == '=':
  445. lx.emit(itemKeyEnd)
  446. return lexSkip(lx, lexValue)
  447. default:
  448. return lx.errorf("expected '.' or '=', but got %q instead", r)
  449. }
  450. }
  451. // lexValue starts the consumption of a value anywhere a value is expected.
  452. // lexValue will ignore whitespace.
  453. // After a value is lexed, the last state on the next is popped and returned.
  454. func lexValue(lx *lexer) stateFn {
  455. // We allow whitespace to precede a value, but NOT newlines.
  456. // In array syntax, the array states are responsible for ignoring newlines.
  457. r := lx.next()
  458. switch {
  459. case isWhitespace(r):
  460. return lexSkip(lx, lexValue)
  461. case isDigit(r):
  462. lx.backup() // avoid an extra state and use the same as above
  463. return lexNumberOrDateStart
  464. }
  465. switch r {
  466. case '[':
  467. lx.ignore()
  468. lx.emit(itemArray)
  469. return lexArrayValue
  470. case '{':
  471. lx.ignore()
  472. lx.emit(itemInlineTableStart)
  473. return lexInlineTableValue
  474. case '"':
  475. if lx.accept('"') {
  476. if lx.accept('"') {
  477. lx.ignore() // Ignore """
  478. return lexMultilineString
  479. }
  480. lx.backup()
  481. }
  482. lx.ignore() // ignore the '"'
  483. return lexString
  484. case '\'':
  485. if lx.accept('\'') {
  486. if lx.accept('\'') {
  487. lx.ignore() // Ignore """
  488. return lexMultilineRawString
  489. }
  490. lx.backup()
  491. }
  492. lx.ignore() // ignore the "'"
  493. return lexRawString
  494. case '.': // special error case, be kind to users
  495. return lx.errorf("floats must start with a digit, not '.'")
  496. case 'i', 'n':
  497. if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
  498. lx.emit(itemFloat)
  499. return lx.pop()
  500. }
  501. case '-', '+':
  502. return lexDecimalNumberStart
  503. }
  504. if unicode.IsLetter(r) {
  505. // Be permissive here; lexBool will give a nice error if the
  506. // user wrote something like
  507. // x = foo
  508. // (i.e. not 'true' or 'false' but is something else word-like.)
  509. lx.backup()
  510. return lexBool
  511. }
  512. if r == eof {
  513. return lx.errorf("unexpected EOF; expected value")
  514. }
  515. return lx.errorf("expected value but found %q instead", r)
  516. }
  517. // lexArrayValue consumes one value in an array. It assumes that '[' or ','
  518. // have already been consumed. All whitespace and newlines are ignored.
  519. func lexArrayValue(lx *lexer) stateFn {
  520. r := lx.next()
  521. switch {
  522. case isWhitespace(r) || isNL(r):
  523. return lexSkip(lx, lexArrayValue)
  524. case r == '#':
  525. lx.push(lexArrayValue)
  526. return lexCommentStart
  527. case r == ',':
  528. return lx.errorf("unexpected comma")
  529. case r == ']':
  530. return lexArrayEnd
  531. }
  532. lx.backup()
  533. lx.push(lexArrayValueEnd)
  534. return lexValue
  535. }
  536. // lexArrayValueEnd consumes everything between the end of an array value and
  537. // the next value (or the end of the array): it ignores whitespace and newlines
  538. // and expects either a ',' or a ']'.
  539. func lexArrayValueEnd(lx *lexer) stateFn {
  540. switch r := lx.next(); {
  541. case isWhitespace(r) || isNL(r):
  542. return lexSkip(lx, lexArrayValueEnd)
  543. case r == '#':
  544. lx.push(lexArrayValueEnd)
  545. return lexCommentStart
  546. case r == ',':
  547. lx.ignore()
  548. return lexArrayValue // move on to the next value
  549. case r == ']':
  550. return lexArrayEnd
  551. default:
  552. return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
  553. }
  554. }
// lexArrayEnd finishes the lexing of an array.
// It assumes that a ']' has just been consumed.
//
// The ']' is dropped from the pending text, itemArrayEnd is emitted, and the
// state on top of the stack is resumed.
func lexArrayEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemArrayEnd)
	return lx.pop()
}
  562. // lexInlineTableValue consumes one key/value pair in an inline table.
  563. // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
  564. func lexInlineTableValue(lx *lexer) stateFn {
  565. r := lx.next()
  566. switch {
  567. case isWhitespace(r):
  568. return lexSkip(lx, lexInlineTableValue)
  569. case isNL(r):
  570. if lx.tomlNext {
  571. return lexSkip(lx, lexInlineTableValue)
  572. }
  573. return lx.errorPrevLine(errLexInlineTableNL{})
  574. case r == '#':
  575. lx.push(lexInlineTableValue)
  576. return lexCommentStart
  577. case r == ',':
  578. return lx.errorf("unexpected comma")
  579. case r == '}':
  580. return lexInlineTableEnd
  581. }
  582. lx.backup()
  583. lx.push(lexInlineTableValueEnd)
  584. return lexKeyStart
  585. }
  586. // lexInlineTableValueEnd consumes everything between the end of an inline table
  587. // key/value pair and the next pair (or the end of the table):
  588. // it ignores whitespace and expects either a ',' or a '}'.
  589. func lexInlineTableValueEnd(lx *lexer) stateFn {
  590. switch r := lx.next(); {
  591. case isWhitespace(r):
  592. return lexSkip(lx, lexInlineTableValueEnd)
  593. case isNL(r):
  594. if lx.tomlNext {
  595. return lexSkip(lx, lexInlineTableValueEnd)
  596. }
  597. return lx.errorPrevLine(errLexInlineTableNL{})
  598. case r == '#':
  599. lx.push(lexInlineTableValueEnd)
  600. return lexCommentStart
  601. case r == ',':
  602. lx.ignore()
  603. lx.skip(isWhitespace)
  604. if lx.peek() == '}' {
  605. if lx.tomlNext {
  606. return lexInlineTableValueEnd
  607. }
  608. return lx.errorf("trailing comma not allowed in inline tables")
  609. }
  610. return lexInlineTableValue
  611. case r == '}':
  612. return lexInlineTableEnd
  613. default:
  614. return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
  615. }
  616. }
  617. func runeOrEOF(r rune) string {
  618. if r == eof {
  619. return "end of file"
  620. }
  621. return "'" + string(r) + "'"
  622. }
// lexInlineTableEnd finishes the lexing of an inline table.
// It assumes that a '}' has just been consumed.
//
// The '}' is dropped from the pending text, itemInlineTableEnd is emitted,
// and the state on top of the stack is resumed.
func lexInlineTableEnd(lx *lexer) stateFn {
	lx.ignore()
	lx.emit(itemInlineTableEnd)
	return lx.pop()
}
  630. // lexString consumes the inner contents of a string. It assumes that the
  631. // beginning '"' has already been consumed and ignored.
  632. func lexString(lx *lexer) stateFn {
  633. r := lx.next()
  634. switch {
  635. case r == eof:
  636. return lx.errorf(`unexpected EOF; expected '"'`)
  637. case isNL(r):
  638. return lx.errorPrevLine(errLexStringNL{})
  639. case r == '\\':
  640. lx.push(lexString)
  641. return lexStringEscape
  642. case r == '"':
  643. lx.backup()
  644. lx.emit(itemString)
  645. lx.next()
  646. lx.ignore()
  647. return lx.pop()
  648. }
  649. return lexString
  650. }
  651. // lexMultilineString consumes the inner contents of a string. It assumes that
  652. // the beginning '"""' has already been consumed and ignored.
  653. func lexMultilineString(lx *lexer) stateFn {
  654. r := lx.next()
  655. switch r {
  656. default:
  657. return lexMultilineString
  658. case eof:
  659. return lx.errorf(`unexpected EOF; expected '"""'`)
  660. case '\\':
  661. return lexMultilineStringEscape
  662. case '"':
  663. /// Found " → try to read two more "".
  664. if lx.accept('"') {
  665. if lx.accept('"') {
  666. /// Peek ahead: the string can contain " and "", including at the
  667. /// end: """str"""""
  668. /// 6 or more at the end, however, is an error.
  669. if lx.peek() == '"' {
  670. /// Check if we already lexed 5 's; if so we have 6 now, and
  671. /// that's just too many man!
  672. ///
  673. /// Second check is for the edge case:
  674. ///
  675. /// two quotes allowed.
  676. /// vv
  677. /// """lol \""""""
  678. /// ^^ ^^^---- closing three
  679. /// escaped
  680. ///
  681. /// But ugly, but it works
  682. if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
  683. return lx.errorf(`unexpected '""""""'`)
  684. }
  685. lx.backup()
  686. lx.backup()
  687. return lexMultilineString
  688. }
  689. lx.backup() /// backup: don't include the """ in the item.
  690. lx.backup()
  691. lx.backup()
  692. lx.emit(itemMultilineString)
  693. lx.next() /// Read over ''' again and discard it.
  694. lx.next()
  695. lx.next()
  696. lx.ignore()
  697. return lx.pop()
  698. }
  699. lx.backup()
  700. }
  701. return lexMultilineString
  702. }
  703. }
  704. // lexRawString consumes a raw string. Nothing can be escaped in such a string.
  705. // It assumes that the beginning "'" has already been consumed and ignored.
  706. func lexRawString(lx *lexer) stateFn {
  707. r := lx.next()
  708. switch {
  709. default:
  710. return lexRawString
  711. case r == eof:
  712. return lx.errorf(`unexpected EOF; expected "'"`)
  713. case isNL(r):
  714. return lx.errorPrevLine(errLexStringNL{})
  715. case r == '\'':
  716. lx.backup()
  717. lx.emit(itemRawString)
  718. lx.next()
  719. lx.ignore()
  720. return lx.pop()
  721. }
  722. }
  723. // lexMultilineRawString consumes a raw string. Nothing can be escaped in such a
  724. // string. It assumes that the beginning triple-' has already been consumed and
  725. // ignored.
  726. func lexMultilineRawString(lx *lexer) stateFn {
  727. r := lx.next()
  728. switch r {
  729. default:
  730. return lexMultilineRawString
  731. case eof:
  732. return lx.errorf(`unexpected EOF; expected "'''"`)
  733. case '\'':
  734. /// Found ' → try to read two more ''.
  735. if lx.accept('\'') {
  736. if lx.accept('\'') {
  737. /// Peek ahead: the string can contain ' and '', including at the
  738. /// end: '''str'''''
  739. /// 6 or more at the end, however, is an error.
  740. if lx.peek() == '\'' {
  741. /// Check if we already lexed 5 's; if so we have 6 now, and
  742. /// that's just too many man!
  743. if strings.HasSuffix(lx.current(), "'''''") {
  744. return lx.errorf(`unexpected "''''''"`)
  745. }
  746. lx.backup()
  747. lx.backup()
  748. return lexMultilineRawString
  749. }
  750. lx.backup() /// backup: don't include the ''' in the item.
  751. lx.backup()
  752. lx.backup()
  753. lx.emit(itemRawMultilineString)
  754. lx.next() /// Read over ''' again and discard it.
  755. lx.next()
  756. lx.next()
  757. lx.ignore()
  758. return lx.pop()
  759. }
  760. lx.backup()
  761. }
  762. return lexMultilineRawString
  763. }
  764. }
  765. // lexMultilineStringEscape consumes an escaped character. It assumes that the
  766. // preceding '\\' has already been consumed.
  767. func lexMultilineStringEscape(lx *lexer) stateFn {
  768. if isNL(lx.next()) { /// \ escaping newline.
  769. return lexMultilineString
  770. }
  771. lx.backup()
  772. lx.push(lexMultilineString)
  773. return lexStringEscape(lx)
  774. }
  775. func lexStringEscape(lx *lexer) stateFn {
  776. r := lx.next()
  777. switch r {
  778. case 'e':
  779. if !lx.tomlNext {
  780. return lx.error(errLexEscape{r})
  781. }
  782. fallthrough
  783. case 'b':
  784. fallthrough
  785. case 't':
  786. fallthrough
  787. case 'n':
  788. fallthrough
  789. case 'f':
  790. fallthrough
  791. case 'r':
  792. fallthrough
  793. case '"':
  794. fallthrough
  795. case ' ', '\t':
  796. // Inside """ .. """ strings you can use \ to escape newlines, and any
  797. // amount of whitespace can be between the \ and \n.
  798. fallthrough
  799. case '\\':
  800. return lx.pop()
  801. case 'x':
  802. if !lx.tomlNext {
  803. return lx.error(errLexEscape{r})
  804. }
  805. return lexHexEscape
  806. case 'u':
  807. return lexShortUnicodeEscape
  808. case 'U':
  809. return lexLongUnicodeEscape
  810. }
  811. return lx.error(errLexEscape{r})
  812. }
  813. func lexHexEscape(lx *lexer) stateFn {
  814. var r rune
  815. for i := 0; i < 2; i++ {
  816. r = lx.next()
  817. if !isHexadecimal(r) {
  818. return lx.errorf(
  819. `expected two hexadecimal digits after '\x', but got %q instead`,
  820. lx.current())
  821. }
  822. }
  823. return lx.pop()
  824. }
  825. func lexShortUnicodeEscape(lx *lexer) stateFn {
  826. var r rune
  827. for i := 0; i < 4; i++ {
  828. r = lx.next()
  829. if !isHexadecimal(r) {
  830. return lx.errorf(
  831. `expected four hexadecimal digits after '\u', but got %q instead`,
  832. lx.current())
  833. }
  834. }
  835. return lx.pop()
  836. }
  837. func lexLongUnicodeEscape(lx *lexer) stateFn {
  838. var r rune
  839. for i := 0; i < 8; i++ {
  840. r = lx.next()
  841. if !isHexadecimal(r) {
  842. return lx.errorf(
  843. `expected eight hexadecimal digits after '\U', but got %q instead`,
  844. lx.current())
  845. }
  846. }
  847. return lx.pop()
  848. }
  849. // lexNumberOrDateStart processes the first character of a value which begins
  850. // with a digit. It exists to catch values starting with '0', so that
  851. // lexBaseNumberOrDate can differentiate base prefixed integers from other
  852. // types.
  853. func lexNumberOrDateStart(lx *lexer) stateFn {
  854. r := lx.next()
  855. switch r {
  856. case '0':
  857. return lexBaseNumberOrDate
  858. }
  859. if !isDigit(r) {
  860. // The only way to reach this state is if the value starts
  861. // with a digit, so specifically treat anything else as an
  862. // error.
  863. return lx.errorf("expected a digit but got %q", r)
  864. }
  865. return lexNumberOrDate
  866. }
  867. // lexNumberOrDate consumes either an integer, float or datetime.
  868. func lexNumberOrDate(lx *lexer) stateFn {
  869. r := lx.next()
  870. if isDigit(r) {
  871. return lexNumberOrDate
  872. }
  873. switch r {
  874. case '-', ':':
  875. return lexDatetime
  876. case '_':
  877. return lexDecimalNumber
  878. case '.', 'e', 'E':
  879. return lexFloat
  880. }
  881. lx.backup()
  882. lx.emit(itemInteger)
  883. return lx.pop()
  884. }
  885. // lexDatetime consumes a Datetime, to a first approximation.
  886. // The parser validates that it matches one of the accepted formats.
  887. func lexDatetime(lx *lexer) stateFn {
  888. r := lx.next()
  889. if isDigit(r) {
  890. return lexDatetime
  891. }
  892. switch r {
  893. case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
  894. return lexDatetime
  895. }
  896. lx.backup()
  897. lx.emitTrim(itemDatetime)
  898. return lx.pop()
  899. }
  900. // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
  901. func lexHexInteger(lx *lexer) stateFn {
  902. r := lx.next()
  903. if isHexadecimal(r) {
  904. return lexHexInteger
  905. }
  906. switch r {
  907. case '_':
  908. return lexHexInteger
  909. }
  910. lx.backup()
  911. lx.emit(itemInteger)
  912. return lx.pop()
  913. }
  914. // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
  915. func lexOctalInteger(lx *lexer) stateFn {
  916. r := lx.next()
  917. if isOctal(r) {
  918. return lexOctalInteger
  919. }
  920. switch r {
  921. case '_':
  922. return lexOctalInteger
  923. }
  924. lx.backup()
  925. lx.emit(itemInteger)
  926. return lx.pop()
  927. }
  928. // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
  929. func lexBinaryInteger(lx *lexer) stateFn {
  930. r := lx.next()
  931. if isBinary(r) {
  932. return lexBinaryInteger
  933. }
  934. switch r {
  935. case '_':
  936. return lexBinaryInteger
  937. }
  938. lx.backup()
  939. lx.emit(itemInteger)
  940. return lx.pop()
  941. }
  942. // lexDecimalNumber consumes a decimal float or integer.
  943. func lexDecimalNumber(lx *lexer) stateFn {
  944. r := lx.next()
  945. if isDigit(r) {
  946. return lexDecimalNumber
  947. }
  948. switch r {
  949. case '.', 'e', 'E':
  950. return lexFloat
  951. case '_':
  952. return lexDecimalNumber
  953. }
  954. lx.backup()
  955. lx.emit(itemInteger)
  956. return lx.pop()
  957. }
  958. // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  959. // It assumes the sign has already been consumed. Values which start with a sign
  960. // are only allowed to be decimal integers or floats.
  961. //
  962. // The special "nan" and "inf" values are also recognized.
  963. func lexDecimalNumberStart(lx *lexer) stateFn {
  964. r := lx.next()
  965. // Special error cases to give users better error messages
  966. switch r {
  967. case 'i':
  968. if !lx.accept('n') || !lx.accept('f') {
  969. return lx.errorf("invalid float: '%s'", lx.current())
  970. }
  971. lx.emit(itemFloat)
  972. return lx.pop()
  973. case 'n':
  974. if !lx.accept('a') || !lx.accept('n') {
  975. return lx.errorf("invalid float: '%s'", lx.current())
  976. }
  977. lx.emit(itemFloat)
  978. return lx.pop()
  979. case '0':
  980. p := lx.peek()
  981. switch p {
  982. case 'b', 'o', 'x':
  983. return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  984. }
  985. case '.':
  986. return lx.errorf("floats must start with a digit, not '.'")
  987. }
  988. if isDigit(r) {
  989. return lexDecimalNumber
  990. }
  991. return lx.errorf("expected a digit but got %q", r)
  992. }
  993. // lexBaseNumberOrDate differentiates between the possible values which
  994. // start with '0'. It assumes that before reaching this state, the initial '0'
  995. // has been consumed.
  996. func lexBaseNumberOrDate(lx *lexer) stateFn {
  997. r := lx.next()
  998. // Note: All datetimes start with at least two digits, so we don't
  999. // handle date characters (':', '-', etc.) here.
  1000. if isDigit(r) {
  1001. return lexNumberOrDate
  1002. }
  1003. switch r {
  1004. case '_':
  1005. // Can only be decimal, because there can't be an underscore
  1006. // between the '0' and the base designator, and dates can't
  1007. // contain underscores.
  1008. return lexDecimalNumber
  1009. case '.', 'e', 'E':
  1010. return lexFloat
  1011. case 'b':
  1012. r = lx.peek()
  1013. if !isBinary(r) {
  1014. lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  1015. }
  1016. return lexBinaryInteger
  1017. case 'o':
  1018. r = lx.peek()
  1019. if !isOctal(r) {
  1020. lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  1021. }
  1022. return lexOctalInteger
  1023. case 'x':
  1024. r = lx.peek()
  1025. if !isHexadecimal(r) {
  1026. lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  1027. }
  1028. return lexHexInteger
  1029. }
  1030. lx.backup()
  1031. lx.emit(itemInteger)
  1032. return lx.pop()
  1033. }
  1034. // lexFloat consumes the elements of a float. It allows any sequence of
  1035. // float-like characters, so floats emitted by the lexer are only a first
  1036. // approximation and must be validated by the parser.
  1037. func lexFloat(lx *lexer) stateFn {
  1038. r := lx.next()
  1039. if isDigit(r) {
  1040. return lexFloat
  1041. }
  1042. switch r {
  1043. case '_', '.', '-', '+', 'e', 'E':
  1044. return lexFloat
  1045. }
  1046. lx.backup()
  1047. lx.emit(itemFloat)
  1048. return lx.pop()
  1049. }
  1050. // lexBool consumes a bool string: 'true' or 'false.
  1051. func lexBool(lx *lexer) stateFn {
  1052. var rs []rune
  1053. for {
  1054. r := lx.next()
  1055. if !unicode.IsLetter(r) {
  1056. lx.backup()
  1057. break
  1058. }
  1059. rs = append(rs, r)
  1060. }
  1061. s := string(rs)
  1062. switch s {
  1063. case "true", "false":
  1064. lx.emit(itemBool)
  1065. return lx.pop()
  1066. }
  1067. return lx.errorf("expected value but found %q instead", s)
  1068. }
  1069. // lexCommentStart begins the lexing of a comment. It will emit
  1070. // itemCommentStart and consume no characters, passing control to lexComment.
  1071. func lexCommentStart(lx *lexer) stateFn {
  1072. lx.ignore()
  1073. lx.emit(itemCommentStart)
  1074. return lexComment
  1075. }
  1076. // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1077. // It will consume *up to* the first newline character, and pass control
  1078. // back to the last state on the stack.
  1079. func lexComment(lx *lexer) stateFn {
  1080. switch r := lx.next(); {
  1081. case isNL(r) || r == eof:
  1082. lx.backup()
  1083. lx.emit(itemText)
  1084. return lx.pop()
  1085. default:
  1086. return lexComment
  1087. }
  1088. }
  1089. // lexSkip ignores all slurped input and moves on to the next state.
  1090. func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1091. lx.ignore()
  1092. return nextState
  1093. }
  1094. func (s stateFn) String() string {
  1095. name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1096. if i := strings.LastIndexByte(name, '.'); i > -1 {
  1097. name = name[i+1:]
  1098. }
  1099. if s == nil {
  1100. name = "<nil>"
  1101. }
  1102. return name + "()"
  1103. }
  1104. func (itype itemType) String() string {
  1105. switch itype {
  1106. case itemError:
  1107. return "Error"
  1108. case itemNIL:
  1109. return "NIL"
  1110. case itemEOF:
  1111. return "EOF"
  1112. case itemText:
  1113. return "Text"
  1114. case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1115. return "String"
  1116. case itemBool:
  1117. return "Bool"
  1118. case itemInteger:
  1119. return "Integer"
  1120. case itemFloat:
  1121. return "Float"
  1122. case itemDatetime:
  1123. return "DateTime"
  1124. case itemTableStart:
  1125. return "TableStart"
  1126. case itemTableEnd:
  1127. return "TableEnd"
  1128. case itemKeyStart:
  1129. return "KeyStart"
  1130. case itemKeyEnd:
  1131. return "KeyEnd"
  1132. case itemArray:
  1133. return "Array"
  1134. case itemArrayEnd:
  1135. return "ArrayEnd"
  1136. case itemCommentStart:
  1137. return "CommentStart"
  1138. case itemInlineTableStart:
  1139. return "InlineTableStart"
  1140. case itemInlineTableEnd:
  1141. return "InlineTableEnd"
  1142. }
  1143. panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1144. }
  1145. func (item item) String() string {
  1146. return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1147. }
  1148. func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
  1149. func isNL(r rune) bool { return r == '\n' || r == '\r' }
  1150. func isControl(r rune) bool { // Control characters except \t, \r, \n
  1151. switch r {
  1152. case '\t', '\r', '\n':
  1153. return false
  1154. default:
  1155. return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1156. }
  1157. }
  1158. func isDigit(r rune) bool { return r >= '0' && r <= '9' }
  1159. func isBinary(r rune) bool { return r == '0' || r == '1' }
  1160. func isOctal(r rune) bool { return r >= '0' && r <= '7' }
  1161. func isHexadecimal(r rune) bool {
  1162. return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
  1163. }
  1164. func isBareKeyChar(r rune, tomlNext bool) bool {
  1165. if tomlNext {
  1166. return (r >= 'A' && r <= 'Z') ||
  1167. (r >= 'a' && r <= 'z') ||
  1168. (r >= '0' && r <= '9') ||
  1169. r == '_' || r == '-' ||
  1170. r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) ||
  1171. (r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) ||
  1172. (r >= 0x037f && r <= 0x1fff) ||
  1173. (r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) ||
  1174. (r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) ||
  1175. (r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) ||
  1176. (r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) ||
  1177. (r >= 0x10000 && r <= 0xeffff)
  1178. }
  1179. return (r >= 'A' && r <= 'Z') ||
  1180. (r >= 'a' && r <= 'z') ||
  1181. (r >= '0' && r <= '9') ||
  1182. r == '_' || r == '-'
  1183. }