lexer.go

package pongo2

import (
	"errors"
	"fmt"
	"strings"
	"unicode/utf8"
)
const (
	TokenError = iota
	EOF
	TokenHTML
	TokenKeyword
	TokenIdentifier
	TokenString
	TokenNumber
	TokenSymbol
)
var (
	tokenSpaceChars                = " \n\r\t"
	tokenIdentifierChars           = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
	tokenIdentifierCharsWithDigits = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	tokenDigits                    = "0123456789"

	// Available symbols in pongo2 (within filters/tags).
	// Longer symbols must be listed before shorter ones that are their
	// prefixes (e.g. "==" before "=") so that the prefix matching in
	// stateCode always picks the longest possible match.
	TokenSymbols = []string{
		// 3-Char symbols
		"{{-", "-}}", "{%-", "-%}",
		// 2-Char symbols
		"==", ">=", "<=", "&&", "||", "{{", "}}", "{%", "%}", "!=", "<>",
		// 1-Char symbols
		"(", ")", "+", "-", "*", "<", ">", "/", "^", ",", ".", "!", "|", ":", "=", "%",
	}

	// Available keywords in pongo2
	TokenKeywords = []string{"in", "and", "or", "not", "true", "false", "as", "export"}
)
type TokenType int

type Token struct {
	Filename        string
	Typ             TokenType
	Val             string
	Line            int
	Col             int
	TrimWhitespaces bool
}

type lexerStateFn func() lexerStateFn

type lexer struct {
	name      string
	input     string
	start     int // start pos of the item
	pos       int // current pos
	width     int // width of last rune
	tokens    []*Token
	errored   bool
	startline int
	startcol  int
	line      int
	col       int

	inVerbatim   bool
	verbatimName string
}
func (t *Token) String() string {
	val := t.Val
	if len(val) > 1000 {
		val = fmt.Sprintf("%s...%s", val[:10], val[len(val)-5:])
	}

	typ := ""
	switch t.Typ {
	case TokenHTML:
		typ = "HTML"
	case TokenError:
		typ = "Error"
	case TokenIdentifier:
		typ = "Identifier"
	case TokenKeyword:
		typ = "Keyword"
	case TokenNumber:
		typ = "Number"
	case TokenString:
		typ = "String"
	case TokenSymbol:
		typ = "Symbol"
	default:
		typ = "Unknown"
	}

	return fmt.Sprintf("<Token Typ=%s (%d) Val='%s' Line=%d Col=%d, WT=%t>",
		typ, t.Typ, val, t.Line, t.Col, t.TrimWhitespaces)
}
func lex(name string, input string) ([]*Token, *Error) {
	l := &lexer{
		name:      name,
		input:     input,
		tokens:    make([]*Token, 0, 100),
		line:      1,
		col:       1,
		startline: 1,
		startcol:  1,
	}
	l.run()
	if l.errored {
		errtoken := l.tokens[len(l.tokens)-1]
		return nil, &Error{
			Filename:  name,
			Line:      errtoken.Line,
			Column:    errtoken.Col,
			Sender:    "lexer",
			OrigError: errors.New(errtoken.Val),
		}
	}
	return l.tokens, nil
}
func (l *lexer) value() string {
	return l.input[l.start:l.pos]
}

func (l *lexer) length() int {
	return l.pos - l.start
}
func (l *lexer) emit(t TokenType) {
	tok := &Token{
		Filename: l.name,
		Typ:      t,
		Val:      l.value(),
		Line:     l.startline,
		Col:      l.startcol,
	}

	if t == TokenString {
		// Resolve the escape sequences \" and \\ within strings
		tok.Val = strings.Replace(tok.Val, `\"`, `"`, -1)
		tok.Val = strings.Replace(tok.Val, `\\`, `\`, -1)
	}

	if t == TokenSymbol && len(tok.Val) == 3 && (strings.HasSuffix(tok.Val, "-") || strings.HasPrefix(tok.Val, "-")) {
		tok.TrimWhitespaces = true
		tok.Val = strings.Replace(tok.Val, "-", "", -1)
	}

	l.tokens = append(l.tokens, tok)
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}
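
// exampleTrimTokens is a hedged illustration of emit's whitespace-trim
// handling, not part of the original file: lexing "{{- name -}}" yields
// "{{" and "}}" symbol tokens with TrimWhitespaces set, because emit
// strips the "-" from matched 3-char symbols.
func exampleTrimTokens() {
	tokens, _ := lex("example", "{{- name -}}")
	for _, t := range tokens {
		fmt.Printf("%q trim=%t\n", t.Val, t.TrimWhitespaces)
	}
	// Expected: "{{" trim=true, "name" trim=false, "}}" trim=true
}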
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width
	l.col += l.width
	return r
}

// backup undoes the most recent call to next; it can only be used once
// per call to next, since only the last rune's width is remembered.
func (l *lexer) backup() {
	l.pos -= l.width
	l.col -= l.width
}

func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// ignore discards the input consumed so far without emitting a token.
func (l *lexer) ignore() {
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}

// accept consumes the next rune if it is contained in the given set.
func (l *lexer) accept(what string) bool {
	if strings.IndexRune(what, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes runes as long as they are contained in the given set.
func (l *lexer) acceptRun(what string) {
	for strings.IndexRune(what, l.next()) >= 0 {
	}
	l.backup()
}
func (l *lexer) errorf(format string, args ...interface{}) lexerStateFn {
	t := &Token{
		Filename: l.name,
		Typ:      TokenError,
		Val:      fmt.Sprintf(format, args...),
		Line:     l.startline,
		Col:      l.startcol,
	}
	l.tokens = append(l.tokens, t)
	l.errored = true
	l.startline = l.line
	l.startcol = l.col
	return nil
}

func (l *lexer) eof() bool {
	return l.start >= len(l.input)-1
}
func (l *lexer) run() {
	for {
		// TODO: Support verbatim tag names
		// https://docs.djangoproject.com/en/dev/ref/templates/builtins/#verbatim
		if l.inVerbatim {
			name := l.verbatimName
			if name != "" {
				name += " "
			}
			if strings.HasPrefix(l.input[l.pos:], fmt.Sprintf("{%% endverbatim %s%%}", name)) { // end verbatim
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				w := len("{% endverbatim %}")
				l.pos += w
				l.col += w
				l.ignore()
				l.inVerbatim = false
			}
		} else if strings.HasPrefix(l.input[l.pos:], "{% verbatim %}") { // tag
			if l.pos > l.start {
				l.emit(TokenHTML)
			}
			l.inVerbatim = true
			w := len("{% verbatim %}")
			l.pos += w
			l.col += w
			l.ignore()
		}

		if !l.inVerbatim {
			// Ignore single-line comments {# ... #}
			if strings.HasPrefix(l.input[l.pos:], "{#") {
				if l.pos > l.start {
					l.emit(TokenHTML)
				}

				l.pos += 2 // pass '{#'
				l.col += 2

				for {
					switch l.peek() {
					case EOF:
						l.errorf("Single-line comment not closed.")
						return
					case '\n':
						l.errorf("Newline not permitted in a single-line comment.")
						return
					}

					if strings.HasPrefix(l.input[l.pos:], "#}") {
						l.pos += 2 // pass '#}'
						l.col += 2
						break
					}

					l.next()
				}
				l.ignore() // ignore whole comment

				// Comment skipped
				continue // next token
			}

			if strings.HasPrefix(l.input[l.pos:], "{{") || // variable
				strings.HasPrefix(l.input[l.pos:], "{%") { // tag
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				l.tokenize()
				if l.errored {
					return
				}
				continue
			}
		}

		switch l.peek() {
		case '\n':
			l.line++
			l.col = 0
		}
		if l.next() == EOF {
			break
		}
	}

	if l.pos > l.start {
		l.emit(TokenHTML)
	}

	if l.inVerbatim {
		l.errorf("verbatim-tag not closed, got EOF.")
	}
}
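
// exampleRunSpecialCases is a hedged sketch, not part of the original file,
// of run's two special cases: {# ... #} comments produce no tokens at all,
// while everything between {% verbatim %} and {% endverbatim %} passes
// through untokenized as raw HTML.
func exampleRunSpecialCases() {
	tokens, _ := lex("example", "a{# hidden #}b")
	fmt.Println(len(tokens)) // 2: TokenHTML "a" and TokenHTML "b"

	tokens, _ = lex("example", "{% verbatim %}{{ x }}{% endverbatim %}")
	fmt.Println(tokens[0].Val) // "{{ x }}", emitted as a single TokenHTML
}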
func (l *lexer) tokenize() {
	for state := l.stateCode; state != nil; {
		state = state()
	}
}
func (l *lexer) stateCode() lexerStateFn {
outer_loop:
	for {
		switch {
		case l.accept(tokenSpaceChars):
			if l.value() == "\n" {
				return l.errorf("Newline not allowed within tag/variable.")
			}
			l.ignore()
			continue
		case l.accept(tokenIdentifierChars):
			return l.stateIdentifier
		case l.accept(tokenDigits):
			return l.stateNumber
		case l.accept(`"'`):
			return l.stateString
		}

		// Check for symbol
		for _, sym := range TokenSymbols {
			if strings.HasPrefix(l.input[l.start:], sym) {
				l.pos += len(sym)
				l.col += l.length()
				l.emit(TokenSymbol)

				if sym == "%}" || sym == "-%}" || sym == "}}" || sym == "-}}" {
					// Tag/variable end, return after emit
					return nil
				}

				continue outer_loop
			}
		}

		break
	}

	// Normal shutdown
	return nil
}
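
// exampleLongestMatch is a hedged note on symbol matching, not part of the
// original file: because TokenSymbols lists longer symbols first, "==" wins
// over "=" and "{{-" wins over "{{" whenever both would match.
func exampleLongestMatch() {
	tokens, _ := lex("example", "{{ a == 1 }}")
	for _, t := range tokens {
		fmt.Println(t.Val) // {{, a, ==, 1, }}
	}
}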
func (l *lexer) stateIdentifier() lexerStateFn {
	l.acceptRun(tokenIdentifierChars)
	l.acceptRun(tokenIdentifierCharsWithDigits)
	for _, kw := range TokenKeywords {
		if kw == l.value() {
			l.emit(TokenKeyword)
			return l.stateCode
		}
	}
	l.emit(TokenIdentifier)
	return l.stateCode
}
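
// exampleKeywords is a hedged example, not part of the original file:
// identifiers that exactly match an entry in TokenKeywords are emitted
// as TokenKeyword instead of TokenIdentifier.
func exampleKeywords() {
	tokens, _ := lex("example", "{{ not done }}")
	fmt.Println(tokens[1].Typ == TokenKeyword)    // true ("not")
	fmt.Println(tokens[2].Typ == TokenIdentifier) // true ("done")
}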
func (l *lexer) stateNumber() lexerStateFn {
	l.acceptRun(tokenDigits)
	if l.accept(tokenIdentifierCharsWithDigits) {
		// This seems to be an identifier starting with a number.
		// See https://github.com/flosch/pongo2/issues/151
		return l.stateIdentifier()
	}
	/*
		Maybe context-sensitive number lexing?
		* comments.0.Text // first comment
		* usercomments.1.0 // second user, first comment
		* if (score >= 8.5) // 8.5 as a number

		if l.peek() == '.' {
			l.accept(".")
			if !l.accept(tokenDigits) {
				return l.errorf("Malformed number.")
			}
			l.acceptRun(tokenDigits)
		}
	*/
	l.emit(TokenNumber)
	return l.stateCode
}
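
// exampleNumberFallback is a hedged illustration of the issue-151 fallback
// above, not part of the original file: a digit run immediately followed by
// identifier characters is re-lexed as one identifier rather than a number.
func exampleNumberFallback() {
	tokens, _ := lex("example", "{{ 2x }}")
	fmt.Println(tokens[1].Typ == TokenIdentifier, tokens[1].Val) // true 2x
}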
func (l *lexer) stateString() lexerStateFn {
	quotationMark := l.value()
	l.ignore()
	l.startcol-- // we're starting the position at the first "

	for !l.accept(quotationMark) {
		switch l.next() {
		case '\\':
			// escape sequence
			switch l.peek() {
			case '"', '\\':
				l.next()
			default:
				return l.errorf("Unknown escape sequence: \\%c", l.peek())
			}
		case EOF:
			return l.errorf("Unexpected EOF, string not closed.")
		case '\n':
			return l.errorf("Newline in string is not allowed.")
		}
	}
	l.backup()
	l.emit(TokenString)

	l.next() // skip the closing quotation mark
	l.ignore()
	return l.stateCode
}
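
// exampleStrings is a hedged example of string lexing, not part of the
// original file: the surrounding quotes are dropped and the \" and \\
// escape sequences are resolved when the token is emitted.
func exampleStrings() {
	tokens, _ := lex("example", `{{ "a\"b" }}`)
	fmt.Println(tokens[1].Val) // a"b
}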