lex.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684
  1. // Copyright 2016 José Santos <henrique_1609@me.com>
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package jet
  15. import (
  16. "fmt"
  17. "strings"
  18. "unicode"
  19. "unicode/utf8"
  20. )
  21. // item represents a token or text string returned from the scanner.
  22. type item struct {
  23. typ itemType // The type of this item.
  24. pos Pos // The starting position, in bytes, of this item in the input string.
  25. val string // The value of this item.
  26. }
  27. func (i item) String() string {
  28. switch {
  29. case i.typ == itemEOF:
  30. return "EOF"
  31. case i.typ == itemError:
  32. return i.val
  33. case i.typ > itemKeyword:
  34. return fmt.Sprintf("<%s>", i.val)
  35. case len(i.val) > 10:
  36. return fmt.Sprintf("%.10q...", i.val)
  37. }
  38. return fmt.Sprintf("%q", i.val)
  39. }
  40. // itemType identifies the type of lex items.
  41. type itemType int
  42. const (
  43. itemError itemType = iota // error occurred; value is text of error
  44. itemBool // boolean constant
  45. itemChar // printable ASCII character; grab bag for comma etc.
  46. itemCharConstant // character constant
  47. itemComplex // complex constant (1+2i); imaginary is just a number
  48. itemEOF
  49. itemField // alphanumeric identifier starting with '.'
  50. itemIdentifier // alphanumeric identifier not starting with '.'
  51. itemLeftDelim // left action delimiter
  52. itemLeftParen // '(' inside action
  53. itemNumber // simple number, including imaginary
  54. itemPipe // pipe symbol
  55. itemRawString // raw quoted string (includes quotes)
  56. itemRightDelim // right action delimiter
  57. itemRightParen // ')' inside action
  58. itemSpace // run of spaces separating arguments
  59. itemString // quoted string (includes quotes)
  60. itemText // plain text
  61. itemAssign
  62. itemEquals
  63. itemNotEquals
  64. itemGreat
  65. itemGreatEquals
  66. itemLess
  67. itemLessEquals
  68. itemComma
  69. itemSemicolon
  70. itemAdd
  71. itemMinus
  72. itemMul
  73. itemDiv
  74. itemMod
  75. itemColon
  76. itemTernary
  77. itemLeftBrackets
  78. itemRightBrackets
  79. // Keywords appear after all the rest.
  80. itemKeyword // used only to delimit the keywords
  81. itemExtends
  82. itemImport
  83. itemInclude
  84. itemBlock
  85. itemEnd
  86. itemYield
  87. itemContent
  88. itemIf
  89. itemElse
  90. itemRange
  91. itemTry
  92. itemCatch
  93. itemReturn
  94. itemAnd
  95. itemOr
  96. itemNot
  97. itemNil
  98. itemMSG
  99. itemTrans
  100. )
  101. var key = map[string]itemType{
  102. "extends": itemExtends,
  103. "import": itemImport,
  104. "include": itemInclude,
  105. "block": itemBlock,
  106. "end": itemEnd,
  107. "yield": itemYield,
  108. "content": itemContent,
  109. "if": itemIf,
  110. "else": itemElse,
  111. "range": itemRange,
  112. "try": itemTry,
  113. "catch": itemCatch,
  114. "return": itemReturn,
  115. "and": itemAnd,
  116. "or": itemOr,
  117. "not": itemNot,
  118. "nil": itemNil,
  119. "msg": itemMSG,
  120. "trans": itemTrans,
  121. }
  122. const eof = -1
  123. const (
  124. defaultLeftDelim = "{{"
  125. defaultRightDelim = "}}"
  126. leftComment = "{*"
  127. rightComment = "*}"
  128. )
  129. // stateFn represents the state of the scanner as a function that returns the next state.
  130. type stateFn func(*lexer) stateFn
  131. // lexer holds the state of the scanner.
  132. type lexer struct {
  133. name string // the name of the input; used only for error reports
  134. input string // the string being scanned
  135. state stateFn // the next lexing function to enter
  136. pos Pos // current position in the input
  137. start Pos // start position of this item
  138. width Pos // width of last rune read from input
  139. lastPos Pos // position of most recent item returned by nextItem
  140. items chan item // channel of scanned items
  141. parenDepth int // nesting depth of ( ) exprs
  142. lastType itemType
  143. leftDelim string
  144. rightDelim string
  145. }
  146. func (l *lexer) setDelimiters(leftDelim, rightDelim string) {
  147. if leftDelim != "" {
  148. l.leftDelim = leftDelim
  149. }
  150. if rightDelim != "" {
  151. l.rightDelim = rightDelim
  152. }
  153. }
  154. // next returns the next rune in the input.
  155. func (l *lexer) next() rune {
  156. if int(l.pos) >= len(l.input) {
  157. l.width = 0
  158. return eof
  159. }
  160. r, w := utf8.DecodeRuneInString(l.input[l.pos:])
  161. l.width = Pos(w)
  162. l.pos += l.width
  163. return r
  164. }
  165. // peek returns but does not consume the next rune in the input.
  166. func (l *lexer) peek() rune {
  167. r := l.next()
  168. l.backup()
  169. return r
  170. }
  171. // backup steps back one rune. Can only be called once per call of next.
  172. func (l *lexer) backup() {
  173. l.pos -= l.width
  174. }
  175. // emit passes an item back to the client.
  176. func (l *lexer) emit(t itemType) {
  177. l.lastType = t
  178. l.items <- item{t, l.start, l.input[l.start:l.pos]}
  179. l.start = l.pos
  180. }
  181. // ignore skips over the pending input before this point.
  182. func (l *lexer) ignore() {
  183. l.start = l.pos
  184. }
  185. // accept consumes the next rune if it's from the valid set.
  186. func (l *lexer) accept(valid string) bool {
  187. if strings.IndexRune(valid, l.next()) >= 0 {
  188. return true
  189. }
  190. l.backup()
  191. return false
  192. }
  193. // acceptRun consumes a run of runes from the valid set.
  194. func (l *lexer) acceptRun(valid string) {
  195. for strings.IndexRune(valid, l.next()) >= 0 {
  196. }
  197. l.backup()
  198. }
  199. // lineNumber reports which line we're on, based on the position of
  200. // the previous item returned by nextItem. Doing it this way
  201. // means we don't have to worry about peek double counting.
  202. func (l *lexer) lineNumber() int {
  203. return 1 + strings.Count(l.input[:l.lastPos], "\n")
  204. }
  205. // errorf returns an error token and terminates the scan by passing
  206. // back a nil pointer that will be the next state, terminating l.nextItem.
  207. func (l *lexer) errorf(format string, args ...interface{}) stateFn {
  208. l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
  209. return nil
  210. }
  211. // nextItem returns the next item from the input.
  212. // Called by the parser, not in the lexing goroutine.
  213. func (l *lexer) nextItem() item {
  214. item := <-l.items
  215. l.lastPos = item.pos
  216. return item
  217. }
  218. // drain drains the output so the lexing goroutine will exit.
  219. // Called by the parser, not in the lexing goroutine.
  220. func (l *lexer) drain() {
  221. for range l.items {
  222. }
  223. }
  224. // lex creates a new scanner for the input string.
  225. func lex(name, input string, run bool) *lexer {
  226. l := &lexer{
  227. name: name,
  228. input: input,
  229. items: make(chan item),
  230. leftDelim: defaultLeftDelim,
  231. rightDelim: defaultRightDelim,
  232. }
  233. if run {
  234. l.run()
  235. }
  236. return l
  237. }
  238. // run runs the state machine for the lexer.
  239. func (l *lexer) run() {
  240. go func() {
  241. for l.state = lexText; l.state != nil; {
  242. l.state = l.state(l)
  243. }
  244. close(l.items)
  245. }()
  246. }
  247. // state functions
  248. func lexText(l *lexer) stateFn {
  249. for {
  250. if i := strings.IndexByte(l.input[l.pos:], l.leftDelim[0]); i == -1 {
  251. l.pos = Pos(len(l.input))
  252. break
  253. } else {
  254. l.pos += Pos(i)
  255. if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
  256. if l.pos > l.start {
  257. l.emit(itemText)
  258. }
  259. return lexLeftDelim
  260. }
  261. if strings.HasPrefix(l.input[l.pos:], leftComment) {
  262. if l.pos > l.start {
  263. l.emit(itemText)
  264. }
  265. return lexComment
  266. }
  267. }
  268. if l.next() == eof {
  269. break
  270. }
  271. }
  272. // Correctly reached EOF.
  273. if l.pos > l.start {
  274. l.emit(itemText)
  275. }
  276. l.emit(itemEOF)
  277. return nil
  278. }
  279. func lexLeftDelim(l *lexer) stateFn {
  280. l.pos += Pos(len(l.leftDelim))
  281. l.emit(itemLeftDelim)
  282. l.parenDepth = 0
  283. return lexInsideAction
  284. }
  285. // lexComment scans a comment. The left comment marker is known to be present.
  286. func lexComment(l *lexer) stateFn {
  287. l.pos += Pos(len(leftComment))
  288. i := strings.Index(l.input[l.pos:], rightComment)
  289. if i < 0 {
  290. return l.errorf("unclosed comment")
  291. }
  292. l.pos += Pos(i + len(rightComment))
  293. l.ignore()
  294. return lexText
  295. }
  296. // lexRightDelim scans the right delimiter, which is known to be present.
  297. func lexRightDelim(l *lexer) stateFn {
  298. l.pos += Pos(len(l.rightDelim))
  299. l.emit(itemRightDelim)
  300. return lexText
  301. }
  302. // lexInsideAction scans the elements inside action delimiters.
  303. func lexInsideAction(l *lexer) stateFn {
  304. // Either number, quoted string, or identifier.
  305. // Spaces separate arguments; runs of spaces turn into itemSpace.
  306. // Pipe symbols separate and are emitted.
  307. if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
  308. if l.parenDepth == 0 {
  309. return lexRightDelim
  310. }
  311. return l.errorf("unclosed left paren")
  312. }
  313. switch r := l.next(); {
  314. case r == eof:
  315. return l.errorf("unclosed action")
  316. case isSpace(r):
  317. return lexSpace
  318. case r == ',':
  319. l.emit(itemComma)
  320. case r == ';':
  321. l.emit(itemSemicolon)
  322. case r == '*':
  323. l.emit(itemMul)
  324. case r == '/':
  325. l.emit(itemDiv)
  326. case r == '%':
  327. l.emit(itemMod)
  328. case r == '-':
  329. if r := l.peek(); '0' <= r && r <= '9' &&
  330. itemAdd != l.lastType &&
  331. itemMinus != l.lastType &&
  332. itemNumber != l.lastType &&
  333. itemIdentifier != l.lastType &&
  334. itemString != l.lastType &&
  335. itemRawString != l.lastType &&
  336. itemCharConstant != l.lastType &&
  337. itemBool != l.lastType &&
  338. itemField != l.lastType &&
  339. itemChar != l.lastType &&
  340. itemTrans != l.lastType {
  341. l.backup()
  342. return lexNumber
  343. }
  344. l.emit(itemMinus)
  345. case r == '+':
  346. if r := l.peek(); '0' <= r && r <= '9' &&
  347. itemAdd != l.lastType &&
  348. itemMinus != l.lastType &&
  349. itemNumber != l.lastType &&
  350. itemIdentifier != l.lastType &&
  351. itemString != l.lastType &&
  352. itemRawString != l.lastType &&
  353. itemCharConstant != l.lastType &&
  354. itemBool != l.lastType &&
  355. itemField != l.lastType &&
  356. itemChar != l.lastType &&
  357. itemTrans != l.lastType {
  358. l.backup()
  359. return lexNumber
  360. }
  361. l.emit(itemAdd)
  362. case r == '?':
  363. l.emit(itemTernary)
  364. case r == '&':
  365. if l.next() == '&' {
  366. l.emit(itemAnd)
  367. } else {
  368. l.backup()
  369. }
  370. case r == '<':
  371. if l.next() == '=' {
  372. l.emit(itemLessEquals)
  373. } else {
  374. l.backup()
  375. l.emit(itemLess)
  376. }
  377. case r == '>':
  378. if l.next() == '=' {
  379. l.emit(itemGreatEquals)
  380. } else {
  381. l.backup()
  382. l.emit(itemGreat)
  383. }
  384. case r == '!':
  385. if l.next() == '=' {
  386. l.emit(itemNotEquals)
  387. } else {
  388. l.backup()
  389. l.emit(itemNot)
  390. }
  391. case r == '=':
  392. if l.next() == '=' {
  393. l.emit(itemEquals)
  394. } else {
  395. l.backup()
  396. l.emit(itemAssign)
  397. }
  398. case r == ':':
  399. if l.next() == '=' {
  400. l.emit(itemAssign)
  401. } else {
  402. l.backup()
  403. l.emit(itemColon)
  404. }
  405. case r == '|':
  406. if l.next() == '|' {
  407. l.emit(itemOr)
  408. } else {
  409. l.backup()
  410. l.emit(itemPipe)
  411. }
  412. case r == '"':
  413. return lexQuote
  414. case r == '`':
  415. return lexRawQuote
  416. case r == '\'':
  417. return lexChar
  418. case r == '.':
  419. // special look-ahead for ".field" so we don't break l.backup().
  420. if l.pos < Pos(len(l.input)) {
  421. r := l.input[l.pos]
  422. if r < '0' || '9' < r {
  423. return lexField
  424. }
  425. }
  426. fallthrough // '.' can start a number.
  427. case '0' <= r && r <= '9':
  428. l.backup()
  429. return lexNumber
  430. case isAlphaNumeric(r):
  431. l.backup()
  432. return lexIdentifier
  433. case r == '[':
  434. l.emit(itemLeftBrackets)
  435. case r == ']':
  436. l.emit(itemRightBrackets)
  437. case r == '(':
  438. l.emit(itemLeftParen)
  439. l.parenDepth++
  440. case r == ')':
  441. l.emit(itemRightParen)
  442. l.parenDepth--
  443. if l.parenDepth < 0 {
  444. return l.errorf("unexpected right paren %#U", r)
  445. }
  446. case r <= unicode.MaxASCII && unicode.IsPrint(r):
  447. l.emit(itemChar)
  448. return lexInsideAction
  449. default:
  450. return l.errorf("unrecognized character in action: %#U", r)
  451. }
  452. return lexInsideAction
  453. }
  454. // lexSpace scans a run of space characters.
  455. // One space has already been seen.
  456. func lexSpace(l *lexer) stateFn {
  457. for isSpace(l.peek()) {
  458. l.next()
  459. }
  460. l.emit(itemSpace)
  461. return lexInsideAction
  462. }
  463. // lexIdentifier scans an alphanumeric.
  464. func lexIdentifier(l *lexer) stateFn {
  465. Loop:
  466. for {
  467. switch r := l.next(); {
  468. case isAlphaNumeric(r):
  469. // absorb.
  470. default:
  471. l.backup()
  472. word := l.input[l.start:l.pos]
  473. if !l.atTerminator() {
  474. return l.errorf("bad character %#U", r)
  475. }
  476. switch {
  477. case key[word] > itemKeyword:
  478. l.emit(key[word])
  479. case word[0] == '.':
  480. l.emit(itemField)
  481. case word == "true", word == "false":
  482. l.emit(itemBool)
  483. default:
  484. l.emit(itemIdentifier)
  485. }
  486. break Loop
  487. }
  488. }
  489. return lexInsideAction
  490. }
  491. // lexField scans a field: .Alphanumeric.
  492. // The . has been scanned.
  493. func lexField(l *lexer) stateFn {
  494. if l.atTerminator() {
  495. // Nothing interesting follows -> "." or "$".
  496. l.emit(itemIdentifier)
  497. return lexInsideAction
  498. }
  499. var r rune
  500. for {
  501. r = l.next()
  502. if !isAlphaNumeric(r) {
  503. l.backup()
  504. break
  505. }
  506. }
  507. if !l.atTerminator() {
  508. return l.errorf("bad character %#U", r)
  509. }
  510. l.emit(itemField)
  511. return lexInsideAction
  512. }
  513. // atTerminator reports whether the input is at valid termination character to
  514. // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
  515. // like "$x+2" not being acceptable without a space, in case we decide one
  516. // day to implement arithmetic.
  517. func (l *lexer) atTerminator() bool {
  518. r := l.peek()
  519. if isSpace(r) {
  520. return true
  521. }
  522. switch r {
  523. case eof, '.', ',', '|', ':', ')', '=', '(', ';', '?', '[', ']', '+', '-', '/', '%', '*', '&', '!', '<', '>':
  524. return true
  525. }
  526. // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
  527. // succeed but should fail) but only in extremely rare cases caused by willfully
  528. // bad choice of delimiter.
  529. if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
  530. return true
  531. }
  532. return false
  533. }
  534. // lexChar scans a character constant. The initial quote is already
  535. // scanned. Syntax checking is done by the parser.
  536. func lexChar(l *lexer) stateFn {
  537. Loop:
  538. for {
  539. switch l.next() {
  540. case '\\':
  541. if r := l.next(); r != eof && r != '\n' {
  542. break
  543. }
  544. fallthrough
  545. case eof, '\n':
  546. return l.errorf("unterminated character constant")
  547. case '\'':
  548. break Loop
  549. }
  550. }
  551. l.emit(itemCharConstant)
  552. return lexInsideAction
  553. }
  554. // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
  555. // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
  556. // and "089" - but when it's wrong the input is invalid and the parser (via
  557. // strconv) will notice.
  558. func lexNumber(l *lexer) stateFn {
  559. if !l.scanNumber() {
  560. return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
  561. }
  562. l.emit(itemNumber)
  563. return lexInsideAction
  564. }
  565. func (l *lexer) scanNumber() bool {
  566. // Optional leading sign.
  567. l.accept("+-")
  568. // Is it hex?
  569. digits := "0123456789"
  570. if l.accept("0") && l.accept("xX") {
  571. digits = "0123456789abcdefABCDEF"
  572. }
  573. l.acceptRun(digits)
  574. if l.accept(".") {
  575. l.acceptRun(digits)
  576. }
  577. if l.accept("eE") {
  578. l.accept("+-")
  579. l.acceptRun("0123456789")
  580. }
  581. //Is it imaginary?
  582. l.accept("i")
  583. //Next thing mustn't be alphanumeric.
  584. if isAlphaNumeric(l.peek()) {
  585. l.next()
  586. return false
  587. }
  588. return true
  589. }
  590. // lexQuote scans a quoted string.
  591. func lexQuote(l *lexer) stateFn {
  592. Loop:
  593. for {
  594. switch l.next() {
  595. case '\\':
  596. if r := l.next(); r != eof && r != '\n' {
  597. break
  598. }
  599. fallthrough
  600. case eof, '\n':
  601. return l.errorf("unterminated quoted string")
  602. case '"':
  603. break Loop
  604. }
  605. }
  606. l.emit(itemString)
  607. return lexInsideAction
  608. }
  609. // lexRawQuote scans a raw quoted string.
  610. func lexRawQuote(l *lexer) stateFn {
  611. Loop:
  612. for {
  613. switch l.next() {
  614. case eof:
  615. return l.errorf("unterminated raw quoted string")
  616. case '`':
  617. break Loop
  618. }
  619. }
  620. l.emit(itemRawString)
  621. return lexInsideAction
  622. }
  623. // isSpace reports whether r is a space character.
  624. func isSpace(r rune) bool {
  625. return r == ' ' || r == '\t' || r == '\r' || r == '\n'
  626. }
  627. // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
  628. func isAlphaNumeric(r rune) bool {
  629. return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
  630. }