parser.go 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930
/*
Package parser implements a parser for markdown text that generates an AST (abstract syntax tree).
*/
  4. package parser
  5. import (
  6. "bytes"
  7. "fmt"
  8. "strconv"
  9. "strings"
  10. "github.com/gomarkdown/markdown/ast"
  11. )
// Extensions is a bitmask of enabled parser extensions.
// Combine the individual extension flags with | and pass the result to
// NewWithExtensions.
type Extensions int
// Bit flags representing markdown parsing extensions.
// Use | (or) to specify multiple extensions.
//
// NoIntraEmphasis sits on the second line of this block, where iota is
// already 1, so the flag values start at 1 << 1 and bit 0 is never used.
// The declaration order determines every flag's value; do not reorder.
const (
	NoExtensions           Extensions = 0
	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
	Tables                                        // Parse tables
	FencedCode                                    // Parse fenced code blocks
	Autolink                                      // Detect embedded URLs that are not explicitly marked
	Strikethrough                                 // Strikethrough text using ~~test~~
	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
	SpaceHeadings                                 // Be strict about prefix heading rules
	HardLineBreak                                 // Translate newlines into line breaks
	NonBlockingSpace                              // Translate backslash-escaped spaces into non-breaking spaces. NOTE(review): wording inferred from the flag name; confirm
	TabSizeEight                                  // Expand tabs to eight spaces instead of four
	Footnotes                                     // Pandoc-style footnotes
	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	HeadingIDs                                    // specify heading IDs with {#id}
	Titleblock                                    // Titleblock ala pandoc
	AutoHeadingIDs                                // Create the heading ID from the text
	BackslashLineBreak                            // Translate trailing backslashes into line breaks
	DefinitionLists                               // Parse definition lists
	MathJax                                       // Parse MathJax
	OrderedListStart                              // Keep track of the first number used when starting an ordered list.
	Attributes                                    // Block Attributes
	SuperSubscript                                // Super- and subscript support: 2^10^, H~2~O.
	EmptyLinesBreakList                           // 2 empty lines break out of list
	Includes                                      // Support including other files.
	Mmark                                         // Support Mmark syntax, see https://mmark.miek.nl/post/syntax/

	// CommonExtensions is the extension set that New enables by default.
	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
		Autolink | Strikethrough | SpaceHeadings | HeadingIDs |
		BackslashLineBreak | DefinitionLists | MathJax
)
// The size of a tab stop, in spaces.
const (
	tabSizeDefault = 4 // default tab width
	tabSizeDouble  = 8 // presumably selected when TabSizeEight is enabled; the use site is outside this section
)
// inlineParser is the signature of the callbacks that are registered per
// trigger byte (see Parser.inlineCallback) and invoked for each character
// that triggers a response when parsing inline data.
//
// NOTE(review): the int result is presumably the number of bytes consumed and
// the ast.Node the node produced; the call site is not part of this section,
// confirm there.
type inlineParser func(p *Parser, data []byte, offset int) (int, ast.Node)
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not found.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
// Parser is a type that holds extensions and the runtime state used by
// Parse, and the renderer. You can not use it directly, construct it with New
// or NewWithExtensions.
type Parser struct {

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved. It can be set before starting parsing.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//  * [link text][refid]
	//  * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	ReferenceOverride ReferenceOverrideFunc

	// IsSafeURLOverride allows overriding the default URL matcher. URL is
	// safe if the overriding function returns true. Can be used to extend
	// the default list of safe URLs.
	IsSafeURLOverride func(url []byte) bool

	// Opts holds additional parser options; Parse reads Opts.Flags.
	Opts Options

	// after parsing, this is AST root of parsed markdown text
	Doc ast.Node

	// extensions is the set of enabled extension flags.
	extensions Extensions

	// refs maps a lower-cased reference id to its definition.
	refs map[string]*reference
	// refsRecord is consulted by isFootnote, keyed by the reference's link.
	refsRecord map[string]struct{}
	// inlineCallback is indexed by the byte that triggers the callback.
	inlineCallback [256]inlineParser
	nesting        int
	maxNesting     int // set to 16 by NewWithExtensions
	insideLink     bool
	indexCnt       int // incremented after every index

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference

	tip                  ast.Node // = doc
	oldTip               ast.Node
	lastMatchedContainer ast.Node // = doc
	allClosed            bool

	// Attributes are attached to block level elements.
	attr *ast.Attribute

	includeStack *incStack

	// collect headings where we auto-generated id so that we can
	// ensure they are unique at the end
	allHeadingsWithAutoID []*ast.Heading
}
  106. // New creates a markdown parser with CommonExtensions.
  107. //
  108. // You can then call `doc := p.Parse(markdown)` to parse markdown document
  109. // and `markdown.Render(doc, renderer)` to convert it to another format with
  110. // a renderer.
  111. func New() *Parser {
  112. return NewWithExtensions(CommonExtensions)
  113. }
  114. // NewWithExtensions creates a markdown parser with given extensions.
  115. func NewWithExtensions(extension Extensions) *Parser {
  116. p := Parser{
  117. refs: make(map[string]*reference),
  118. refsRecord: make(map[string]struct{}),
  119. maxNesting: 16,
  120. insideLink: false,
  121. Doc: &ast.Document{},
  122. extensions: extension,
  123. allClosed: true,
  124. includeStack: newIncStack(),
  125. }
  126. p.tip = p.Doc
  127. p.oldTip = p.Doc
  128. p.lastMatchedContainer = p.Doc
  129. p.inlineCallback[' '] = maybeLineBreak
  130. p.inlineCallback['*'] = emphasis
  131. p.inlineCallback['_'] = emphasis
  132. if p.extensions&Strikethrough != 0 {
  133. p.inlineCallback['~'] = emphasis
  134. }
  135. p.inlineCallback['`'] = codeSpan
  136. p.inlineCallback['\n'] = lineBreak
  137. p.inlineCallback['['] = link
  138. p.inlineCallback['<'] = leftAngle
  139. p.inlineCallback['\\'] = escape
  140. p.inlineCallback['&'] = entity
  141. p.inlineCallback['!'] = maybeImage
  142. if p.extensions&Mmark != 0 {
  143. p.inlineCallback['('] = maybeShortRefOrIndex
  144. }
  145. p.inlineCallback['^'] = maybeInlineFootnoteOrSuper
  146. if p.extensions&Autolink != 0 {
  147. p.inlineCallback['h'] = maybeAutoLink
  148. p.inlineCallback['m'] = maybeAutoLink
  149. p.inlineCallback['f'] = maybeAutoLink
  150. p.inlineCallback['H'] = maybeAutoLink
  151. p.inlineCallback['M'] = maybeAutoLink
  152. p.inlineCallback['F'] = maybeAutoLink
  153. }
  154. if p.extensions&MathJax != 0 {
  155. p.inlineCallback['$'] = math
  156. }
  157. return &p
  158. }
  159. func (p *Parser) RegisterInline(n byte, fn inlineParser) inlineParser {
  160. prev := p.inlineCallback[n]
  161. p.inlineCallback[n] = fn
  162. return prev
  163. }
  164. func (p *Parser) getRef(refid string) (ref *reference, found bool) {
  165. if p.ReferenceOverride != nil {
  166. r, overridden := p.ReferenceOverride(refid)
  167. if overridden {
  168. if r == nil {
  169. return nil, false
  170. }
  171. return &reference{
  172. link: []byte(r.Link),
  173. title: []byte(r.Title),
  174. noteID: 0,
  175. hasBlock: false,
  176. text: []byte(r.Text)}, true
  177. }
  178. }
  179. // refs are case insensitive
  180. ref, found = p.refs[strings.ToLower(refid)]
  181. return ref, found
  182. }
  183. func (p *Parser) isFootnote(ref *reference) bool {
  184. _, ok := p.refsRecord[string(ref.link)]
  185. return ok
  186. }
  187. func (p *Parser) Finalize(block ast.Node) {
  188. p.tip = block.GetParent()
  189. }
  190. func (p *Parser) addChild(node ast.Node) ast.Node {
  191. for !canNodeContain(p.tip, node) {
  192. p.Finalize(p.tip)
  193. }
  194. ast.AppendChild(p.tip, node)
  195. p.tip = node
  196. return node
  197. }
  198. func canNodeContain(n ast.Node, v ast.Node) bool {
  199. switch n.(type) {
  200. case *ast.List:
  201. return isListItem(v)
  202. case *ast.Document, *ast.BlockQuote, *ast.Aside, *ast.ListItem, *ast.CaptionFigure:
  203. return !isListItem(v)
  204. case *ast.Table:
  205. switch v.(type) {
  206. case *ast.TableHeader, *ast.TableBody, *ast.TableFooter:
  207. return true
  208. default:
  209. return false
  210. }
  211. case *ast.TableHeader, *ast.TableBody, *ast.TableFooter:
  212. _, ok := v.(*ast.TableRow)
  213. return ok
  214. case *ast.TableRow:
  215. _, ok := v.(*ast.TableCell)
  216. return ok
  217. }
  218. // for nodes implemented outside of ast package, allow them
  219. // to implement this logic via CanContain interface
  220. if o, ok := n.(ast.CanContain); ok {
  221. return o.CanContain(v)
  222. }
  223. // for container nodes outside of ast package default to true
  224. // because false is a bad default
  225. typ := fmt.Sprintf("%T", n)
  226. customNode := !strings.HasPrefix(typ, "*ast.")
  227. if customNode {
  228. return n.AsLeaf() == nil
  229. }
  230. return false
  231. }
  232. func (p *Parser) closeUnmatchedBlocks() {
  233. if p.allClosed {
  234. return
  235. }
  236. for p.oldTip != p.lastMatchedContainer {
  237. parent := p.oldTip.GetParent()
  238. p.Finalize(p.oldTip)
  239. p.oldTip = parent
  240. }
  241. p.allClosed = true
  242. }
// Reference represents the details of a link.
// It is the value a ReferenceOverrideFunc returns to supply a reference.
// See the documentation in Options for more details on use-case.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
  254. // Parse generates AST (abstract syntax tree) representing markdown document.
  255. //
  256. // The result is a root of the tree whose underlying type is *ast.Document
  257. //
  258. // You can then convert AST to html using html.Renderer, to some other format
  259. // using a custom renderer or transform the tree.
  260. func (p *Parser) Parse(input []byte) ast.Node {
  261. // the code only works with Unix CR newlines so to make life easy for
  262. // callers normalize newlines
  263. input = NormalizeNewlines(input)
  264. p.Block(input)
  265. // Walk the tree and finish up some of unfinished blocks
  266. for p.tip != nil {
  267. p.Finalize(p.tip)
  268. }
  269. // Walk the tree again and process inline markdown in each block
  270. ast.WalkFunc(p.Doc, func(node ast.Node, entering bool) ast.WalkStatus {
  271. switch node.(type) {
  272. case *ast.Paragraph, *ast.Heading, *ast.TableCell:
  273. p.Inline(node, node.AsContainer().Content)
  274. node.AsContainer().Content = nil
  275. }
  276. return ast.GoToNext
  277. })
  278. if p.Opts.Flags&SkipFootnoteList == 0 {
  279. p.parseRefsToAST()
  280. }
  281. // ensure HeadingIDs generated with AutoHeadingIDs are unique
  282. // this is delayed here (as opposed to done when we create the id)
  283. // so that we can preserve more original ids when there are conflicts
  284. taken := map[string]bool{}
  285. for _, h := range p.allHeadingsWithAutoID {
  286. id := h.HeadingID
  287. if id == "" {
  288. continue
  289. }
  290. n := 0
  291. for taken[id] {
  292. n++
  293. id = h.HeadingID + "-" + strconv.Itoa(n)
  294. }
  295. h.HeadingID = id
  296. taken[id] = true
  297. }
  298. return p.Doc
  299. }
// parseRefsToAST appends the collected footnotes to the document as an
// ordered footnotes list. It does nothing unless the Footnotes extension is
// enabled and at least one note has been recorded in p.notes.
func (p *Parser) parseRefsToAST() {
	if p.extensions&Footnotes == 0 || len(p.notes) == 0 {
		return
	}
	// start adding blocks at the document root again
	p.tip = p.Doc
	list := &ast.List{
		IsFootnotesList: true,
		ListFlags:       ast.ListTypeOrdered,
	}
	p.AddBlock(&ast.Footnotes{})
	block := p.AddBlock(list)
	flags := ast.ListItemBeginningOfList
	// Note: this loop is intentionally explicit, not range-form. This is
	// because the body of the loop will append nested footnotes to p.notes and
	// we need to process those late additions. Range form would only walk over
	// the fixed initial set.
	for i := 0; i < len(p.notes); i++ {
		ref := p.notes[i]
		p.addChild(ref.footnote)
		// this block shadows the outer block (the list) for the rest of the
		// loop body; the type assertion panics if footnote is not a list item
		block := ref.footnote
		listItem := block.(*ast.ListItem)
		listItem.ListFlags = flags | ast.ListTypeOrdered
		listItem.RefLink = ref.link
		if ref.hasBlock {
			// NOTE(review): ListItemContainsBlock is set after ListFlags was
			// assigned above and is cleared again below, so it never reaches
			// listItem.ListFlags — confirm this is intended.
			flags |= ast.ListItemContainsBlock
			p.Block(ref.title)
		} else {
			p.Inline(block, ref.title)
		}
		// only the first item keeps the beginning-of-list flag
		flags &^= ast.ListItemBeginningOfList | ast.ListItemContainsBlock
	}
	above := list.Parent
	finalizeList(list)
	p.tip = above

	// process the inline markdown of the blocks created for the footnotes;
	// block here is the outer variable, i.e. the result of AddBlock(list)
	ast.WalkFunc(block, func(node ast.Node, entering bool) ast.WalkStatus {
		switch node.(type) {
		case *ast.Paragraph, *ast.Heading:
			p.Inline(node, node.AsContainer().Content)
			node.AsContainer().Content = nil
		}
		return ast.GoToNext
	})
}
  343. //
  344. // Link references
  345. //
  346. // This section implements support for references that (usually) appear
  347. // as footnotes in a document, and can be referenced anywhere in the document.
  348. // The basic format is:
  349. //
  350. // [1]: http://www.google.com/ "Google"
  351. // [2]: http://www.github.com/ "Github"
  352. //
  353. // Anywhere in the document, the reference can be linked by referring to its
  354. // label, i.e., 1 and 2 in this example, as in:
  355. //
  356. // This library is hosted on [Github][2], a git hosting site.
  357. //
  358. // Actual footnotes as specified in Pandoc and supported by some other Markdown
  359. // libraries such as php-markdown are also taken care of. They look like this:
  360. //
  361. // This sentence needs a bit of further explanation.[^note]
  362. //
  363. // [^note]: This is the explanation.
  364. //
  365. // Footnotes should be placed at the end of the document in an ordered list.
  366. // Inline footnotes such as:
  367. //
  368. // Inline footnotes^[Not supported.] also exist.
  369. //
  370. // are not yet supported.
// reference holds all information necessary for reference-style links or
// footnotes.
//
// Consider this markdown with reference-style links:
//
//	[link][ref]
//
//	[ref]: /url/ "tooltip title"
//
// It will be ultimately converted to this HTML:
//
//	<p><a href="/url/" title="tooltip title">link</a></p>
//
// And a reference structure will be populated as follows:
//
//	p.refs["ref"] = &reference{
//	    link: "/url/",
//	    title: "tooltip title",
//	}
//
// Alternatively, reference can contain information about a footnote. Consider
// this markdown:
//
//	Text needing a footnote.[^a]
//
//	[^a]: This is the note
//
// A reference structure will be populated as follows:
//
//	p.refs["a"] = &reference{
//	    link: "a",
//	    title: "This is the note",
//	    noteID: <some positive int>,
//	}
//
// TODO: As you can see, it begs for splitting into two dedicated structures
// for refs and for footnotes.
type reference struct {
	link     []byte   // URL for a link reference; the footnote id for a footnote
	title    []byte   // link title for a link reference; the note's text for a footnote
	noteID   int      // 0 if not a footnote ref
	hasBlock bool     // set from scanFootnote's hasBlock result for footnotes
	footnote ast.Node // a link to the Item node within a list of footnotes

	text []byte // only gets populated by refOverride feature with Reference.Text
}
  416. func (r *reference) String() string {
  417. return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
  418. r.link, r.title, r.text, r.noteID, r.hasBlock)
  419. }
// isReference checks whether or not data starts with a reference definition
// (a link reference, or a footnote when the Footnotes extension is enabled).
// If so, it is parsed and stored in p.refs under its lower-cased id.
// tabSize is only used for footnotes, where it is passed to scanFootnote as
// the indent size.
// Returns the number of bytes to skip to move past it,
// or zero if the first line is not a reference.
func isReference(p *Parser, data []byte, tabSize int) int {
	// up to 3 optional leading spaces
	// (the length check also guarantees that data[i] below is in range)
	if len(data) < 4 {
		return 0
	}
	i := 0
	for i < 3 && data[i] == ' ' {
		i++
	}

	noteID := 0

	// id part: anything but a newline between brackets
	if data[i] != '[' {
		return 0
	}
	i++
	if p.extensions&Footnotes != 0 {
		if i < len(data) && data[i] == '^' {
			// we can set it to anything here because the proper noteIds will
			// be assigned later during the second pass. It just has to be != 0
			noteID = 1
			i++
		}
	}
	idOffset := i
	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
		i++
	}
	if i >= len(data) || data[i] != ']' {
		return 0
	}
	idEnd := i
	// footnotes can have empty ID, like this: [^], but a reference can not be
	// empty like this: []. Break early if it's not a footnote and there's no ID
	if noteID == 0 && idOffset == idEnd {
		return 0
	}

	// spacer: colon (space | tab)* newline? (space | tab)*
	i++
	if i >= len(data) || data[i] != ':' {
		return 0
	}
	i++
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
		i++
		// treat CRLF as a single newline
		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
			i++
		}
	}
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	// nothing follows the spacer, so there is no link or note text
	if i >= len(data) {
		return 0
	}

	var (
		linkOffset, linkEnd   int
		titleOffset, titleEnd int
		lineEnd               int
		raw                   []byte
		hasBlock              bool
	)

	if p.extensions&Footnotes != 0 && noteID != 0 {
		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
		lineEnd = linkEnd
	} else {
		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
	}
	// the scanners report failure through a zero end offset
	if lineEnd == 0 {
		return 0
	}

	// a valid ref has been found

	ref := &reference{
		noteID:   noteID,
		hasBlock: hasBlock,
	}

	if noteID > 0 {
		// reusing the link field for the id since footnotes don't have links
		ref.link = data[idOffset:idEnd]
		// if footnote, it's not really a title, it's the contained text
		ref.title = raw
	} else {
		ref.link = data[linkOffset:linkEnd]
		ref.title = data[titleOffset:titleEnd]
	}

	// id matches are case-insensitive
	id := string(bytes.ToLower(data[idOffset:idEnd]))

	// a later definition with the same id replaces an earlier one
	p.refs[id] = ref

	return lineEnd
}
  517. func scanLinkRef(p *Parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
  518. // link: whitespace-free sequence, optionally between angle brackets
  519. if data[i] == '<' {
  520. i++
  521. }
  522. linkOffset = i
  523. for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
  524. i++
  525. }
  526. linkEnd = i
  527. if linkEnd < len(data) && data[linkOffset] == '<' && data[linkEnd-1] == '>' {
  528. linkOffset++
  529. linkEnd--
  530. }
  531. // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
  532. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  533. i++
  534. }
  535. if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
  536. return
  537. }
  538. // compute end-of-line
  539. if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
  540. lineEnd = i
  541. }
  542. if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
  543. lineEnd++
  544. }
  545. // optional (space|tab)* spacer after a newline
  546. if lineEnd > 0 {
  547. i = lineEnd + 1
  548. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  549. i++
  550. }
  551. }
  552. // optional title: any non-newline sequence enclosed in '"() alone on its line
  553. if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
  554. i++
  555. titleOffset = i
  556. // look for EOL
  557. for i < len(data) && data[i] != '\n' && data[i] != '\r' {
  558. i++
  559. }
  560. if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
  561. titleEnd = i + 1
  562. } else {
  563. titleEnd = i
  564. }
  565. // step back
  566. i--
  567. for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
  568. i--
  569. }
  570. if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
  571. lineEnd = titleEnd
  572. titleEnd = i
  573. }
  574. }
  575. return
  576. }
// The first bit of this logic is the same as Parser.listItem, but the rest
// is much simpler. This function simply finds the entire block and shifts it
// over by one tab if it is indeed a block (just returns the line if it's not).
// blockEnd is the end of the section in the input buffer, and contents is the
// extracted text that was shifted over one tab. It will need to be rendered at
// the end of the document.
//
// i is the offset in data at which the footnote text starts and indentSize is
// the indent width passed to isIndented. All results are zero when i is 0 or
// data is empty. hasBlock reports whether at least one indented continuation
// line was added to contents. The parser p is not used.
func scanFootnote(p *Parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
	// i must be > 0 because the loops below look at data[i-1]
	if i == 0 || len(data) == 0 {
		return
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	blockStart = i

	// find the end of the line
	// (i stops just past the newline, or at the end of data)
	blockEnd = i
	for i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[blockEnd:i])
	blockEnd = i

	// process the following lines
	containsBlankLine := false

gatherLines:
	for blockEnd < len(data) {
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if IsEmpty(data[blockEnd:i]) > 0 {
			containsBlankLine = true
			blockEnd = i
			continue
		}

		n := 0
		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
			// this is the end of the block.
			// we don't want to include this last line in the index.
			break gatherLines
		}

		// if there were blank lines before this one, insert a new one now
		// (a run of blank lines is collapsed into a single one)
		if containsBlankLine {
			raw.WriteByte('\n')
			containsBlankLine = false
		}

		// get rid of that first tab, write to buffer
		raw.Write(data[blockEnd+n : i])
		hasBlock = true

		blockEnd = i
	}

	// make sure the extracted text ends with a newline when the input did not
	if data[blockEnd-1] != '\n' {
		raw.WriteByte('\n')
	}

	contents = raw.Bytes()

	return
}
  640. // IsPunctuation returns true if c is a punctuation symbol.
  641. func IsPunctuation(c byte) bool {
  642. for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
  643. if c == r {
  644. return true
  645. }
  646. }
  647. return false
  648. }
  649. // IsSpace returns true if c is a white-space charactr
  650. func IsSpace(c byte) bool {
  651. return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
  652. }
  653. // IsLetter returns true if c is ascii letter
  654. func IsLetter(c byte) bool {
  655. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
  656. }
  657. // IsAlnum returns true if c is a digit or letter
  658. // TODO: check when this is looking for ASCII alnum and when it should use unicode
  659. func IsAlnum(c byte) bool {
  660. return (c >= '0' && c <= '9') || IsLetter(c)
  661. }
// URIs lists the URL prefixes (scheme plus separator) that IsSafeURL accepts.
// IsSafeURL lower-cases the start of the candidate URL before comparing it
// with these entries, so entries must be lower-case to ever match.
var URIs = [][]byte{
	[]byte("http://"),
	[]byte("https://"),
	[]byte("ftp://"),
	[]byte("mailto:"),
}
// Paths lists the relative-path prefixes that IsSafeURL accepts. Unlike URIs,
// these are compared with the start of the URL byte-for-byte.
var Paths = [][]byte{
	[]byte("/"),
	[]byte("./"),
	[]byte("../"),
}
  673. // IsSafeURL returns true if url starts with one of the valid schemes or is a relative path.
  674. func IsSafeURL(url []byte) bool {
  675. nLink := len(url)
  676. for _, path := range Paths {
  677. nPath := len(path)
  678. linkPrefix := url[:nPath]
  679. if nLink >= nPath && bytes.Equal(linkPrefix, path) {
  680. if nLink == nPath {
  681. return true
  682. } else if IsAlnum(url[nPath]) {
  683. return true
  684. }
  685. }
  686. }
  687. for _, prefix := range URIs {
  688. // TODO: handle unicode here
  689. // case-insensitive prefix test
  690. nPrefix := len(prefix)
  691. if nLink > nPrefix {
  692. linkPrefix := bytes.ToLower(url[:nPrefix])
  693. if bytes.Equal(linkPrefix, prefix) && IsAlnum(url[nPrefix]) {
  694. return true
  695. }
  696. }
  697. }
  698. return false
  699. }
  700. // TODO: this is not used
  701. // Replace tab characters with spaces, aligning to the next TAB_SIZE column.
  702. // always ends output with a newline
  703. /*
  704. func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
  705. // first, check for common cases: no tabs, or only tabs at beginning of line
  706. i, prefix := 0, 0
  707. slowcase := false
  708. for i = 0; i < len(line); i++ {
  709. if line[i] == '\t' {
  710. if prefix == i {
  711. prefix++
  712. } else {
  713. slowcase = true
  714. break
  715. }
  716. }
  717. }
  718. // no need to decode runes if all tabs are at the beginning of the line
  719. if !slowcase {
  720. for i = 0; i < prefix*tabSize; i++ {
  721. out.WriteByte(' ')
  722. }
  723. out.Write(line[prefix:])
  724. return
  725. }
  726. // the slow case: we need to count runes to figure out how
  727. // many spaces to insert for each tab
  728. column := 0
  729. i = 0
  730. for i < len(line) {
  731. start := i
  732. for i < len(line) && line[i] != '\t' {
  733. _, size := utf8.DecodeRune(line[i:])
  734. i += size
  735. column++
  736. }
  737. if i > start {
  738. out.Write(line[start:i])
  739. }
  740. if i >= len(line) {
  741. break
  742. }
  743. for {
  744. out.WriteByte(' ')
  745. column++
  746. if column%tabSize == 0 {
  747. break
  748. }
  749. }
  750. i++
  751. }
  752. }
  753. */
  754. // Find if a line counts as indented or not.
  755. // Returns number of characters the indent is (0 = not indented).
  756. func isIndented(data []byte, indentSize int) int {
  757. if len(data) == 0 {
  758. return 0
  759. }
  760. if data[0] == '\t' {
  761. return 1
  762. }
  763. if len(data) < indentSize {
  764. return 0
  765. }
  766. for i := 0; i < indentSize; i++ {
  767. if data[i] != ' ' {
  768. return 0
  769. }
  770. }
  771. return indentSize
  772. }
  773. // Create a url-safe slug for fragments
  774. func slugify(in []byte) []byte {
  775. if len(in) == 0 {
  776. return in
  777. }
  778. out := make([]byte, 0, len(in))
  779. sym := false
  780. for _, ch := range in {
  781. if IsAlnum(ch) {
  782. sym = false
  783. out = append(out, ch)
  784. } else if sym {
  785. continue
  786. } else {
  787. out = append(out, '-')
  788. sym = true
  789. }
  790. }
  791. var a, b int
  792. var ch byte
  793. for a, ch = range out {
  794. if ch != '-' {
  795. break
  796. }
  797. }
  798. for b = len(out) - 1; b > 0; b-- {
  799. if out[b] != '-' {
  800. break
  801. }
  802. }
  803. return out[a : b+1]
  804. }
  805. func isListItem(d ast.Node) bool {
  806. _, ok := d.(*ast.ListItem)
  807. return ok
  808. }
  809. func NormalizeNewlines(d []byte) []byte {
  810. wi := 0
  811. n := len(d)
  812. for i := 0; i < n; i++ {
  813. c := d[i]
  814. // 13 is CR
  815. if c != 13 {
  816. d[wi] = c
  817. wi++
  818. continue
  819. }
  820. // replace CR (mac / win) with LF (unix)
  821. d[wi] = 10
  822. wi++
  823. if i < n-1 && d[i+1] == 10 {
  824. // this was CRLF, so skip the LF
  825. i++
  826. }
  827. }
  828. return d[:wi]
  829. }