lexer.go

// Package lexer provides a handlebars tokenizer.
package lexer

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// References:
//   - https://github.com/wycats/handlebars.js/blob/master/src/handlebars.l
//   - https://github.com/golang/go/blob/master/src/text/template/parse/lex.go

const (
	// mustache detection
	escapedEscapedOpenMustache  = "\\\\{{"
	escapedOpenMustache         = "\\{{"
	openMustache                = "{{"
	closeMustache               = "}}"
	closeStripMustache          = "~}}"
	closeUnescapedStripMustache = "}~}}"
)

const eof = -1

// lexFunc represents a function that returns the next lexer function.
type lexFunc func(*Lexer) lexFunc

// Lexer is a lexical analyzer.
type Lexer struct {
	input    string     // input to scan
	name     string     // lexer name, used for testing purposes
	tokens   chan Token // channel of scanned tokens
	nextFunc lexFunc    // the next function to execute
	pos      int        // current byte position in input string
	line     int        // current line position in input string
	width    int        // size of last rune scanned from input string
	start    int        // start position of the token we are scanning

	// the shameful contextual properties needed because `nextFunc` is not enough
	closeComment *regexp.Regexp // regexp to scan close of current comment
	rawBlock     bool           // are we parsing raw block content?
}

var (
	lookheadChars        = `[\s` + regexp.QuoteMeta("=~}/)|") + `]`
	literalLookheadChars = `[\s` + regexp.QuoteMeta("~})") + `]`

	// characters not allowed in an identifier
	unallowedIDChars = " \n\t!\"#%&'()*+,./;<=>@[\\]^`{|}~"

	// regular expressions
	rID    = regexp.MustCompile(`^[^` + regexp.QuoteMeta(unallowedIDChars) + `]+`)
	rDotID = regexp.MustCompile(`^\.` + lookheadChars)
	rTrue  = regexp.MustCompile(`^true` + literalLookheadChars)
	rFalse = regexp.MustCompile(`^false` + literalLookheadChars)

	rOpenRaw             = regexp.MustCompile(`^\{\{\{\{`)
	rCloseRaw            = regexp.MustCompile(`^\}\}\}\}`)
	rOpenEndRaw          = regexp.MustCompile(`^\{\{\{\{/`)
	rOpenEndRawLookAhead = regexp.MustCompile(`\{\{\{\{/`)

	rOpenUnescaped  = regexp.MustCompile(`^\{\{~?\{`)
	rCloseUnescaped = regexp.MustCompile(`^\}~?\}\}`)

	rOpenBlock    = regexp.MustCompile(`^\{\{~?#`)
	rOpenEndBlock = regexp.MustCompile(`^\{\{~?/`)
	rOpenPartial  = regexp.MustCompile(`^\{\{~?>`)

	// {{^}} or {{else}}
	rInverse          = regexp.MustCompile(`^(\{\{~?\^\s*~?\}\}|\{\{~?\s*else\s*~?\}\})`)
	rOpenInverse      = regexp.MustCompile(`^\{\{~?\^`)
	rOpenInverseChain = regexp.MustCompile(`^\{\{~?\s*else`)

	// {{ or {{&
	rOpen  = regexp.MustCompile(`^\{\{~?&?`)
	rClose = regexp.MustCompile(`^~?\}\}`)

	rOpenBlockParams = regexp.MustCompile(`^as\s+\|`)

	// {{!-- ... --}}
	rOpenCommentDash  = regexp.MustCompile(`^\{\{~?!--\s*`)
	rCloseCommentDash = regexp.MustCompile(`^\s*--~?\}\}`)

	// {{! ... }}
	rOpenComment  = regexp.MustCompile(`^\{\{~?!\s*`)
	rCloseComment = regexp.MustCompile(`^\s*~?\}\}`)
)
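
// For instance, rOpenUnescaped matches the opening of "{{{body}}}" and of its
// whitespace-stripping form "{{~{body}~}}", while rInverse matches both "{{^}}"
// and "{{else}}", with optional "~" strip markers.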

// Scan scans given input.
//
// Tokens can then be fetched sequentially using the NextToken() function on the returned lexer.
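//
// A minimal usage sketch (Token, TokenEOF and TokenError are defined elsewhere in this package):
//
//	l := Scan("{{title}} content")
//	for {
//		tok := l.NextToken()
//		if tok.Kind == TokenEOF || tok.Kind == TokenError {
//			break
//		}
//		fmt.Println(tok)
//	}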
func Scan(input string) *Lexer {
	return scanWithName(input, "")
}

// scanWithName scans given input, with a name used for testing.
//
// Tokens can then be fetched sequentially using the NextToken() function on the returned lexer.
func scanWithName(input string, name string) *Lexer {
	result := &Lexer{
		input:  input,
		name:   name,
		tokens: make(chan Token),
		line:   1,
	}

	go result.run()

	return result
}

// Collect scans and collects all tokens.
//
// This should be used for debugging purposes only. Use the Scan() and NextToken() functions instead.
func Collect(input string) []Token {
	var result []Token

	l := Scan(input)

	for {
		token := l.NextToken()
		result = append(result, token)

		if token.Kind == TokenEOF || token.Kind == TokenError {
			break
		}
	}

	return result
}

// NextToken returns the next scanned token.
func (l *Lexer) NextToken() Token {
	return <-l.tokens
}

// run starts lexical analysis.
func (l *Lexer) run() {
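	// state-machine loop: each lexFunc scans some input and returns the next
	// state; a nil return ends the analysis (same pattern as the text/template
	// lexer referenced above)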
	for l.nextFunc = lexContent; l.nextFunc != nil; {
		l.nextFunc = l.nextFunc(l)
	}
}

// next returns the next character from input, or eof if there is nothing left to scan.
func (l *Lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}

	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width

	return r
}
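
// produce sends a token of given kind and value, then moves the token start
// to the current position and updates the line counter.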
func (l *Lexer) produce(kind TokenKind, val string) {
	l.tokens <- Token{kind, val, l.start, l.line}

	// scanning a new token
	l.start = l.pos

	// update line number
	l.line += strings.Count(val, "\n")
}

// emit emits a new scanned token.
func (l *Lexer) emit(kind TokenKind) {
	l.produce(kind, l.input[l.start:l.pos])
}

// emitContent emits scanned content.
func (l *Lexer) emitContent() {
	if l.pos > l.start {
		l.emit(TokenContent)
	}
}

// emitString emits a scanned string.
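//
// Escaped delimiters are unescaped first, e.g. with delimiter '"' the scanned
// text `say \"hi\"` is emitted as `say "hi"`.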
func (l *Lexer) emitString(delimiter rune) {
	str := l.input[l.start:l.pos]

	// replace escaped delimiters
	str = strings.Replace(str, "\\"+string(delimiter), string(delimiter), -1)

	l.produce(TokenString, str)
}

// peek returns but does not consume the next character in the input.
func (l *Lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// backup steps back one character.
//
// WARNING: can only be called once per call of next.
func (l *Lexer) backup() {
	l.pos -= l.width
}

// ignore skips all characters that have been scanned up to current position.
func (l *Lexer) ignore() {
	l.start = l.pos
}

// accept scans the next character if it is included in given string.
func (l *Lexer) accept(valid string) bool {
	if strings.IndexRune(valid, l.next()) >= 0 {
		return true
	}

	l.backup()

	return false
}

// acceptRun scans all following characters that are part of given string.
func (l *Lexer) acceptRun(valid string) {
	for strings.IndexRune(valid, l.next()) >= 0 {
	}

	l.backup()
}

// errorf emits an error token and stops lexical analysis by returning a nil state function.
func (l *Lexer) errorf(format string, args ...interface{}) lexFunc {
	l.tokens <- Token{TokenError, fmt.Sprintf(format, args...), l.start, l.line}
	return nil
}

// isString returns true if content at current scanning position starts with given string.
func (l *Lexer) isString(str string) bool {
	return strings.HasPrefix(l.input[l.pos:], str)
}

// findRegexp returns the first string, from current scanning position, that matches given regular expression.
func (l *Lexer) findRegexp(r *regexp.Regexp) string {
	return r.FindString(l.input[l.pos:])
}

// indexRegexp returns the index of the first match of given regular expression, starting at current scanning position.
//
// It returns -1 if there is no match.
func (l *Lexer) indexRegexp(r *regexp.Regexp) int {
	loc := r.FindStringIndex(l.input[l.pos:])
	if loc == nil {
		return -1
	}
	return loc[0]
}

// lexContent scans content (i.e. anything not between mustaches).
func lexContent(l *Lexer) lexFunc {
	var next lexFunc

	if l.rawBlock {
		if i := l.indexRegexp(rOpenEndRawLookAhead); i != -1 {
			// {{{{/
			l.rawBlock = false
			l.pos += i
			next = lexOpenMustache
		} else {
			return l.errorf("Unclosed raw block")
		}
	} else if l.isString(escapedEscapedOpenMustache) {
		// \\{{
		// emit content with only one escaped escape
		l.next()
		l.emitContent()

		// ignore second escaped escape
		l.next()
		l.ignore()

		next = lexContent
	} else if l.isString(escapedOpenMustache) {
		// \{{
		next = lexEscapedOpenMustache
	} else if str := l.findRegexp(rOpenCommentDash); str != "" {
		// {{!--
		l.closeComment = rCloseCommentDash
		next = lexComment
	} else if str := l.findRegexp(rOpenComment); str != "" {
		// {{!
		l.closeComment = rCloseComment
		next = lexComment
	} else if l.isString(openMustache) {
		// {{
		next = lexOpenMustache
	}

	if next != nil {
		// emit scanned content
		l.emitContent()

		// scan next token
		return next
	}

	// scan next rune
	if l.next() == eof {
		// emit scanned content
		l.emitContent()

		// this is over
		l.emit(TokenEOF)
		return nil
	}

	// continue content scanning
	return lexContent
}

// lexEscapedOpenMustache scans \{{
func lexEscapedOpenMustache(l *Lexer) lexFunc {
	// ignore escape character
	l.next()
	l.ignore()

	// scan mustaches
	for l.peek() == '{' {
		l.next()
	}

	return lexContent
}

// lexOpenMustache scans {{
func lexOpenMustache(l *Lexer) lexFunc {
	var str string
	var tok TokenKind

	nextFunc := lexExpression

	if str = l.findRegexp(rOpenEndRaw); str != "" {
		tok = TokenOpenEndRawBlock
	} else if str = l.findRegexp(rOpenRaw); str != "" {
		tok = TokenOpenRawBlock
		l.rawBlock = true
	} else if str = l.findRegexp(rOpenUnescaped); str != "" {
		tok = TokenOpenUnescaped
	} else if str = l.findRegexp(rOpenBlock); str != "" {
		tok = TokenOpenBlock
	} else if str = l.findRegexp(rOpenEndBlock); str != "" {
		tok = TokenOpenEndBlock
	} else if str = l.findRegexp(rOpenPartial); str != "" {
		tok = TokenOpenPartial
	} else if str = l.findRegexp(rInverse); str != "" {
		tok = TokenInverse
		nextFunc = lexContent
	} else if str = l.findRegexp(rOpenInverse); str != "" {
		tok = TokenOpenInverse
	} else if str = l.findRegexp(rOpenInverseChain); str != "" {
		tok = TokenOpenInverseChain
	} else if str = l.findRegexp(rOpen); str != "" {
		tok = TokenOpen
	} else {
		// this is rotten
		panic("Current pos MUST be an opening mustache")
	}

	l.pos += len(str)
	l.emit(tok)

	return nextFunc
}

// lexCloseMustache scans }} or ~}}
func lexCloseMustache(l *Lexer) lexFunc {
	var str string
	var tok TokenKind

	if str = l.findRegexp(rCloseRaw); str != "" {
		// }}}}
		tok = TokenCloseRawBlock
	} else if str = l.findRegexp(rCloseUnescaped); str != "" {
		// }}}
		tok = TokenCloseUnescaped
	} else if str = l.findRegexp(rClose); str != "" {
		// }}
		tok = TokenClose
	} else {
		// this is rotten
		panic("Current pos MUST be a closing mustache")
	}

	l.pos += len(str)
	l.emit(tok)

	return lexContent
}

// lexExpression scans inside mustaches.
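//
// For example, scanning the inside of `{{#each items as |item i|}}` emits, in
// order: an ID ("each"), an ID ("items"), TokenOpenBlockParams ("as |"), two
// more IDs, TokenCloseBlockParams ("|"), and then hands off to lexCloseMustache
// for the final "}}".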
func lexExpression(l *Lexer) lexFunc {
	// search close mustache delimiter
	if l.isString(closeMustache) || l.isString(closeStripMustache) || l.isString(closeUnescapedStripMustache) {
		return lexCloseMustache
	}

	// search some patterns before advancing scanning position

	// "as |"
	if str := l.findRegexp(rOpenBlockParams); str != "" {
		l.pos += len(str)
		l.emit(TokenOpenBlockParams)
		return lexExpression
	}

	// ..
	if l.isString("..") {
		l.pos += len("..")
		l.emit(TokenID)
		return lexExpression
	}

	// .
	if str := l.findRegexp(rDotID); str != "" {
		l.pos += len(".")
		l.emit(TokenID)
		return lexExpression
	}

	// true
	if str := l.findRegexp(rTrue); str != "" {
		l.pos += len("true")
		l.emit(TokenBoolean)
		return lexExpression
	}

	// false
	if str := l.findRegexp(rFalse); str != "" {
		l.pos += len("false")
		l.emit(TokenBoolean)
		return lexExpression
	}

	// let's scan next character
	switch r := l.next(); {
	case r == eof:
		return l.errorf("Unclosed expression")
	case isIgnorable(r):
		return lexIgnorable
	case r == '(':
		l.emit(TokenOpenSexpr)
	case r == ')':
		l.emit(TokenCloseSexpr)
	case r == '=':
		l.emit(TokenEquals)
	case r == '@':
		l.emit(TokenData)
	case r == '"' || r == '\'':
		l.backup()
		return lexString
	case r == '/' || r == '.':
		l.emit(TokenSep)
	case r == '|':
		l.emit(TokenCloseBlockParams)
	case r == '+' || r == '-' || (r >= '0' && r <= '9'):
		l.backup()
		return lexNumber
	case r == '[':
		return lexPathLiteral
	case strings.IndexRune(unallowedIDChars, r) < 0:
		l.backup()
		return lexIdentifier
	default:
		return l.errorf("Unexpected character in expression: '%c'", r)
	}

	return lexExpression
}

// lexComment scans a comment opened with {{!-- or {{!
func lexComment(l *Lexer) lexFunc {
	if str := l.findRegexp(l.closeComment); str != "" {
		l.pos += len(str)
		l.emit(TokenComment)
		return lexContent
	}

	if r := l.next(); r == eof {
		return l.errorf("Unclosed comment")
	}

	return lexComment
}

// lexIgnorable scans all following ignorable characters.
func lexIgnorable(l *Lexer) lexFunc {
	for isIgnorable(l.peek()) {
		l.next()
	}
	l.ignore()

	return lexExpression
}

// lexString scans a string.
func lexString(l *Lexer) lexFunc {
	// get string delimiter
	delim := l.next()
	var prev rune

	// ignore delimiter
	l.ignore()

	for {
		r := l.next()
		if r == eof || r == '\n' {
			return l.errorf("Unterminated string")
		}

		if (r == delim) && (prev != '\\') {
			break
		}

		prev = r
	}

	// remove end delimiter
	l.backup()

	// emit string
	l.emitString(delim)

	// skip end delimiter
	l.next()
	l.ignore()

	return lexExpression
}

// lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
// isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
// and "089" - but when it's wrong the input is invalid and the parser (via
// strconv) will notice.
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func lexNumber(l *Lexer) lexFunc {
	if !l.scanNumber() {
		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
	}

	if sign := l.peek(); sign == '+' || sign == '-' {
		// Complex: 1+2i. No spaces, must end in 'i'.
		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
		}
		l.emit(TokenNumber)
	} else {
		l.emit(TokenNumber)
	}

	return lexExpression
}

// scanNumber scans a number.
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
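//
// Accepted forms include, for instance: 42, -12, 0x1F, 3.14, 1e-3 and 2i.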
func (l *Lexer) scanNumber() bool {
	// Optional leading sign.
	l.accept("+-")

	// Is it hex?
	digits := "0123456789"

	if l.accept("0") && l.accept("xX") {
		digits = "0123456789abcdefABCDEF"
	}

	l.acceptRun(digits)

	if l.accept(".") {
		l.acceptRun(digits)
	}

	if l.accept("eE") {
		l.accept("+-")
		l.acceptRun("0123456789")
	}

	// Is it imaginary?
	l.accept("i")

	// Next thing mustn't be alphanumeric.
	if isAlphaNumeric(l.peek()) {
		l.next()
		return false
	}

	return true
}

// lexIdentifier scans an ID.
func lexIdentifier(l *Lexer) lexFunc {
	str := l.findRegexp(rID)
	if len(str) == 0 {
		// this is rotten
		panic("Identifier expected")
	}

	l.pos += len(str)
	l.emit(TokenID)

	return lexExpression
}

// lexPathLiteral scans an [ID].
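//
// The emitted token includes the brackets, e.g. scanning `[foo bar]` produces
// the ID token `[foo bar]`.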
func lexPathLiteral(l *Lexer) lexFunc {
	for {
		r := l.next()
		if r == eof || r == '\n' {
			return l.errorf("Unterminated path literal")
		}

		if r == ']' {
			break
		}
	}

	l.emit(TokenID)

	return lexExpression
}

// isIgnorable returns true if given character is ignorable (i.e. whitespace or line feed).
func isIgnorable(r rune) bool {
	return r == ' ' || r == '\t' || r == '\n'
}

// isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
//
// NOTE: borrowed from https://github.com/golang/go/tree/master/src/text/template/parse/lex.go
func isAlphaNumeric(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}