lexer.go

package pongo2

import (
	"fmt"
	"strings"
	"unicode/utf8"

	"github.com/juju/errors"
)

const (
	TokenError = iota
	EOF
	TokenHTML
	TokenKeyword
	TokenIdentifier
	TokenString
	TokenNumber
	TokenSymbol
)

var (
	tokenSpaceChars                = " \n\r\t"
	tokenIdentifierChars           = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
	tokenIdentifierCharsWithDigits = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	tokenDigits                    = "0123456789"

	// Available symbols in pongo2 (within filters/tag)
	TokenSymbols = []string{
		// 3-Char symbols

		// 2-Char symbols
		"==", ">=", "<=", "&&", "||", "{{", "}}", "{%", "%}", "!=", "<>",

		// 1-Char symbol
		"(", ")", "+", "-", "*", "<", ">", "/", "^", ",", ".", "!", "|", ":", "=", "%",
	}

	// Available keywords in pongo2
	TokenKeywords = []string{"in", "and", "or", "not", "true", "false", "as", "export"}
)
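
// As a quick illustration (hypothetical input), lexing the template
// source
//
//	{{ user.name|upper }}
//
// yields roughly: Symbol "{{", Identifier "user", Symbol ".",
// Identifier "name", Symbol "|", Identifier "upper", Symbol "}}".
// Whitespace between tokens is skipped.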

type TokenType int

type Token struct {
	Filename string
	Typ      TokenType
	Val      string
	Line     int
	Col      int
}
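
// lexerStateFn is a state function in the style of Go's text/template
// lexer: each state does some work and returns the next state, or nil
// to stop the machine.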
type lexerStateFn func() lexerStateFn

type lexer struct {
	name   string
	input  string
	start  int // start pos of the item
	pos    int // current pos
	width  int // width of last rune
	tokens []*Token

	errored   bool // set once an error token has been emitted
	startline int  // line of the pending token's start
	startcol  int  // column of the pending token's start
	line      int  // current line
	col       int  // current column

	inVerbatim   bool
	verbatimName string
}
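
// String returns a human-readable representation of the token; values
// longer than 1000 bytes are abbreviated.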
func (t *Token) String() string {
	val := t.Val
	if len(val) > 1000 {
		val = fmt.Sprintf("%s...%s", val[:10], val[len(val)-5:])
	}

	typ := ""
	switch t.Typ {
	case TokenHTML:
		typ = "HTML"
	case TokenError:
		typ = "Error"
	case TokenIdentifier:
		typ = "Identifier"
	case TokenKeyword:
		typ = "Keyword"
	case TokenNumber:
		typ = "Number"
	case TokenString:
		typ = "String"
	case TokenSymbol:
		typ = "Symbol"
	default:
		typ = "Unknown"
	}

	return fmt.Sprintf("<Token Typ=%s (%d) Val='%s' Line=%d Col=%d>",
		typ, t.Typ, val, t.Line, t.Col)
}
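
// lex tokenizes input (the raw template source) and returns the token
// stream, or an *Error locating the first lexer failure. A minimal
// usage sketch (hypothetical template string):
//
//	tokens, err := lex("example.html", "Hello {{ name }}!")
//	if err != nil {
//		return err // err carries filename, line and column
//	}
//	for _, t := range tokens {
//		fmt.Println(t)
//	}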
func lex(name string, input string) ([]*Token, *Error) {
	l := &lexer{
		name:      name,
		input:     input,
		tokens:    make([]*Token, 0, 100),
		line:      1,
		col:       1,
		startline: 1,
		startcol:  1,
	}
	l.run()
	if l.errored {
		errtoken := l.tokens[len(l.tokens)-1]
		return nil, &Error{
			Filename:  name,
			Line:      errtoken.Line,
			Column:    errtoken.Col,
			Sender:    "lexer",
			OrigError: errors.New(errtoken.Val),
		}
	}
	return l.tokens, nil
}

// value returns the input consumed so far for the pending token.
func (l *lexer) value() string {
	return l.input[l.start:l.pos]
}

// length returns the byte length of the pending token.
func (l *lexer) length() int {
	return l.pos - l.start
}
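
// emit turns the pending input l.input[l.start:l.pos] into a token and
// resets the token start markers. String tokens get their escape
// sequences unescaped here.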
func (l *lexer) emit(t TokenType) {
	tok := &Token{
		Filename: l.name,
		Typ:      t,
		Val:      l.value(),
		Line:     l.startline,
		Col:      l.startcol,
	}

	if t == TokenString {
		// Unescape the sequences \" and \\ within strings
		tok.Val = strings.Replace(tok.Val, `\"`, `"`, -1)
		tok.Val = strings.Replace(tok.Val, `\\`, `\`, -1)
	}

	l.tokens = append(l.tokens, tok)

	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}
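
// next consumes and returns the next rune, or EOF at the end of the
// input. Note that l.col advances by the rune's byte width, so columns
// are byte-based.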
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width
	l.col += l.width
	return r
}

// backup steps back one rune; it may only be called once per call of next.
func (l *lexer) backup() {
	l.pos -= l.width
	l.col -= l.width
}

// peek returns the next rune without consuming it.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// ignore discards the pending input and restarts the token at the
// current position.
func (l *lexer) ignore() {
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}

// accept consumes the next rune if it is contained in what.
func (l *lexer) accept(what string) bool {
	if strings.IndexRune(what, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes runes as long as they are contained in what.
func (l *lexer) acceptRun(what string) {
	for strings.IndexRune(what, l.next()) >= 0 {
	}
	l.backup()
}

// errorf records a TokenError carrying the formatted message, marks the
// lexer as errored and returns nil to stop the state machine.
func (l *lexer) errorf(format string, args ...interface{}) lexerStateFn {
	t := &Token{
		Filename: l.name,
		Typ:      TokenError,
		Val:      fmt.Sprintf(format, args...),
		Line:     l.startline,
		Col:      l.startcol,
	}
	l.tokens = append(l.tokens, t)
	l.errored = true
	l.startline = l.line
	l.startcol = l.col
	return nil
}

func (l *lexer) eof() bool {
	return l.start >= len(l.input)-1
}
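
// run is the outer loop: it copies plain template text through as
// TokenHTML, handles {% verbatim %} blocks and {# ... #} comments, and
// hands control to tokenize() whenever a {{ variable or {% tag starts.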
func (l *lexer) run() {
	for {
		// TODO: Support verbatim tag names
		// https://docs.djangoproject.com/en/dev/ref/templates/builtins/#verbatim
		if l.inVerbatim {
			name := l.verbatimName
			if name != "" {
				name += " "
			}
			if strings.HasPrefix(l.input[l.pos:], fmt.Sprintf("{%% endverbatim %s%%}", name)) { // end verbatim
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				w := len("{% endverbatim %}")
				l.pos += w
				l.col += w
				l.ignore()
				l.inVerbatim = false
			}
		} else if strings.HasPrefix(l.input[l.pos:], "{% verbatim %}") { // tag
			if l.pos > l.start {
				l.emit(TokenHTML)
			}
			l.inVerbatim = true
			w := len("{% verbatim %}")
			l.pos += w
			l.col += w
			l.ignore()
		}

		if !l.inVerbatim {
			// Ignore single-line comments {# ... #}
			if strings.HasPrefix(l.input[l.pos:], "{#") {
				if l.pos > l.start {
					l.emit(TokenHTML)
				}

				l.pos += 2 // pass '{#'
				l.col += 2

				for {
					switch l.peek() {
					case EOF:
						l.errorf("Single-line comment not closed.")
						return
					case '\n':
						l.errorf("Newline not permitted in a single-line comment.")
						return
					}

					if strings.HasPrefix(l.input[l.pos:], "#}") {
						l.pos += 2 // pass '#}'
						l.col += 2
						break
					}

					l.next()
				}
				l.ignore() // ignore whole comment

				// Comment skipped
				continue // next token
			}

			if strings.HasPrefix(l.input[l.pos:], "{{") || // variable
				strings.HasPrefix(l.input[l.pos:], "{%") { // tag
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				l.tokenize()
				if l.errored {
					return
				}
				continue
			}
		}

		switch l.peek() {
		case '\n':
			l.line++
			l.col = 0
		}
		if l.next() == EOF {
			break
		}
	}

	if l.pos > l.start {
		l.emit(TokenHTML)
	}

	if l.inVerbatim {
		l.errorf("verbatim-tag not closed, got EOF.")
	}
}
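
// tokenize drives the state machine until a state function returns nil.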
func (l *lexer) tokenize() {
	for state := l.stateCode; state != nil; {
		state = state()
	}
}
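
// stateCode lexes the inside of a tag or variable: it skips whitespace,
// dispatches to the identifier/number/string states, and emits symbol
// tokens, returning nil once "%}" or "}}" closes the block.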
func (l *lexer) stateCode() lexerStateFn {
outer_loop:
	for {
		switch {
		case l.accept(tokenSpaceChars):
			if l.value() == "\n" {
				return l.errorf("Newline not allowed within tag/variable.")
			}
			l.ignore()
			continue
		case l.accept(tokenIdentifierChars):
			return l.stateIdentifier
		case l.accept(tokenDigits):
			return l.stateNumber
		case l.accept(`"`):
			return l.stateString
		}

		// Check for symbol
		for _, sym := range TokenSymbols {
			if strings.HasPrefix(l.input[l.start:], sym) {
				l.pos += len(sym)
				l.col += l.length()
				l.emit(TokenSymbol)

				if sym == "%}" || sym == "}}" {
					// Tag/variable end, return after emit
					return nil
				}

				continue outer_loop
			}
		}

		break
	}

	// Normal shut down
	return nil
}
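
// stateIdentifier reads an identifier and emits it as TokenKeyword if
// it matches one of TokenKeywords, otherwise as TokenIdentifier.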
func (l *lexer) stateIdentifier() lexerStateFn {
	l.acceptRun(tokenIdentifierChars)
	l.acceptRun(tokenIdentifierCharsWithDigits)
	for _, kw := range TokenKeywords {
		if kw == l.value() {
			l.emit(TokenKeyword)
			return l.stateCode
		}
	}
	l.emit(TokenIdentifier)
	return l.stateCode
}
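
// stateNumber reads a run of digits and emits a TokenNumber. Floats are
// not lexed as a single token: with the dot handling below disabled, a
// (hypothetical) input like 8.5 becomes Number "8", Symbol ".",
// Number "5".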
func (l *lexer) stateNumber() lexerStateFn {
	l.acceptRun(tokenDigits)
	if l.accept(tokenIdentifierCharsWithDigits) {
		// This seems to be an identifier starting with a number.
		// See https://github.com/flosch/pongo2/issues/151
		return l.stateIdentifier()
	}
	/*
		Maybe context-sensitive number lexing?
		* comments.0.Text // first comment
		* usercomments.1.0 // second user, first comment
		* if (score >= 8.5) // 8.5 as a number

		if l.peek() == '.' {
			l.accept(".")
			if !l.accept(tokenDigits) {
				return l.errorf("Malformed number.")
			}
			l.acceptRun(tokenDigits)
		}
	*/
	l.emit(TokenNumber)
	return l.stateCode
}
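
// stateString reads a double-quoted string. The surrounding quotes are
// not part of the token value; emit() later unescapes \" and \\, so a
// (hypothetical) input like "a \"b\"" yields the value a "b".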
func (l *lexer) stateString() lexerStateFn {
	l.ignore()
	l.startcol-- // we're starting the position at the first "
	for !l.accept(`"`) {
		switch l.next() {
		case '\\':
			// escape sequence
			switch l.peek() {
			case '"', '\\':
				l.next()
			default:
				return l.errorf("Unknown escape sequence: \\%c", l.peek())
			}
		case EOF:
			return l.errorf("Unexpected EOF, string not closed.")
		case '\n':
			return l.errorf("Newline in string is not allowed.")
		}
	}
	l.backup()
	l.emit(TokenString)

	l.next() // skip the closing "
	l.ignore()

	return l.stateCode
}