inline.go 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228
  1. //
  2. // Blackfriday Markdown Processor
  3. // Available at http://github.com/russross/blackfriday
  4. //
  5. // Copyright © 2011 Russ Ross <russ@russross.com>.
  6. // Distributed under the Simplified BSD License.
  7. // See README.md for details.
  8. //
  9. //
  10. // Functions to parse inline elements.
  11. //
  12. package blackfriday
  13. import (
  14. "bytes"
  15. "regexp"
  16. "strconv"
  17. )
var (
	// urlRe is the (uncompiled) pattern fragment for the URL forms accepted
	// inside an already-rendered anchor: an http, https or ftp URL, or a
	// path that starts with '/'.
	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

	// anchorRe matches, anchored at the start of its input, a complete
	// anchor element of the form <a href="URL" title="...">URL</a>, where
	// the title attribute is optional. autoLink uses it to avoid linking a
	// URL that already sits inside such an anchor.
	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)

	// https://www.w3.org/TR/html5/syntax.html#character-references
	// Unicode has 17 planes of 2^16 code points each (1,114,112 in total),
	// so a numeric reference needs at most 7 decimal digits or 6 hex digits.
	// Named entity references can be 2-31 characters with stuff like &lt;
	// at one end and &CounterClockwiseContourIntegral; at the other. There
	// are also sometimes numbers at the end, although this isn't inherent
	// in the specification; there are never numbers anywhere else in
	// current character references, though; see &frac34; and &blk12;, etc.
	// https://www.w3.org/TR/html5/syntax.html#named-character-references
	//
	// entity := "&" (named group | number ref) ";"
	// named group := [a-zA-Z]{2,31}[0-9]{0,2}
	// number ref := "#" (dec ref | hex ref)
	// dec ref := [0-9]{1,7}
	// hex ref := ("x" | "X") [0-9a-fA-F]{1,6}
	htmlEntityRe = regexp.MustCompile(`&([a-zA-Z]{2,31}[0-9]{0,2}|#([0-9]{1,7}|[xX][0-9a-fA-F]{1,6}));`)
)
  38. // Functions to parse text within a block
  39. // Each function returns the number of chars taken care of
  40. // data is the complete block being rendered
  41. // offset is the number of valid chars before the current cursor
  42. func (p *Markdown) inline(currBlock *Node, data []byte) {
  43. // handlers might call us recursively: enforce a maximum depth
  44. if p.nesting >= p.maxNesting || len(data) == 0 {
  45. return
  46. }
  47. p.nesting++
  48. beg, end := 0, 0
  49. for end < len(data) {
  50. handler := p.inlineCallback[data[end]]
  51. if handler != nil {
  52. if consumed, node := handler(p, data, end); consumed == 0 {
  53. // No action from the callback.
  54. end++
  55. } else {
  56. // Copy inactive chars into the output.
  57. currBlock.AppendChild(text(data[beg:end]))
  58. if node != nil {
  59. currBlock.AppendChild(node)
  60. }
  61. // Skip past whatever the callback used.
  62. beg = end + consumed
  63. end = beg
  64. }
  65. } else {
  66. end++
  67. }
  68. }
  69. if beg < len(data) {
  70. if data[end-1] == '\n' {
  71. end--
  72. }
  73. currBlock.AppendChild(text(data[beg:end]))
  74. }
  75. p.nesting--
  76. }
  77. // single and double emphasis parsing
  78. func emphasis(p *Markdown, data []byte, offset int) (int, *Node) {
  79. data = data[offset:]
  80. c := data[0]
  81. if len(data) > 2 && data[1] != c {
  82. // whitespace cannot follow an opening emphasis;
  83. // strikethrough only takes two characters '~~'
  84. if c == '~' || isspace(data[1]) {
  85. return 0, nil
  86. }
  87. ret, node := helperEmphasis(p, data[1:], c)
  88. if ret == 0 {
  89. return 0, nil
  90. }
  91. return ret + 1, node
  92. }
  93. if len(data) > 3 && data[1] == c && data[2] != c {
  94. if isspace(data[2]) {
  95. return 0, nil
  96. }
  97. ret, node := helperDoubleEmphasis(p, data[2:], c)
  98. if ret == 0 {
  99. return 0, nil
  100. }
  101. return ret + 2, node
  102. }
  103. if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
  104. if c == '~' || isspace(data[3]) {
  105. return 0, nil
  106. }
  107. ret, node := helperTripleEmphasis(p, data, 3, c)
  108. if ret == 0 {
  109. return 0, nil
  110. }
  111. return ret + 3, node
  112. }
  113. return 0, nil
  114. }
  115. func codeSpan(p *Markdown, data []byte, offset int) (int, *Node) {
  116. data = data[offset:]
  117. nb := 0
  118. // count the number of backticks in the delimiter
  119. for nb < len(data) && data[nb] == '`' {
  120. nb++
  121. }
  122. // find the next delimiter
  123. i, end := 0, 0
  124. for end = nb; end < len(data) && i < nb; end++ {
  125. if data[end] == '`' {
  126. i++
  127. } else {
  128. i = 0
  129. }
  130. }
  131. // no matching delimiter?
  132. if i < nb && end >= len(data) {
  133. return 0, nil
  134. }
  135. // trim outside whitespace
  136. fBegin := nb
  137. for fBegin < end && data[fBegin] == ' ' {
  138. fBegin++
  139. }
  140. fEnd := end - nb
  141. for fEnd > fBegin && data[fEnd-1] == ' ' {
  142. fEnd--
  143. }
  144. // render the code span
  145. if fBegin != fEnd {
  146. code := NewNode(Code)
  147. code.Literal = data[fBegin:fEnd]
  148. return end, code
  149. }
  150. return end, nil
  151. }
  152. // newline preceded by two spaces becomes <br>
  153. func maybeLineBreak(p *Markdown, data []byte, offset int) (int, *Node) {
  154. origOffset := offset
  155. for offset < len(data) && data[offset] == ' ' {
  156. offset++
  157. }
  158. if offset < len(data) && data[offset] == '\n' {
  159. if offset-origOffset >= 2 {
  160. return offset - origOffset + 1, NewNode(Hardbreak)
  161. }
  162. return offset - origOffset, nil
  163. }
  164. return 0, nil
  165. }
  166. // newline without two spaces works when HardLineBreak is enabled
  167. func lineBreak(p *Markdown, data []byte, offset int) (int, *Node) {
  168. if p.extensions&HardLineBreak != 0 {
  169. return 1, NewNode(Hardbreak)
  170. }
  171. return 0, nil
  172. }
// linkType identifies which bracketed construct the link parser is handling.
type linkType int

// The constructs recognized by link.
const (
	linkNormal           linkType = iota // [text] regular link
	linkImg                              // ![alt] image
	linkDeferredFootnote                 // [^refId] footnote defined elsewhere
	linkInlineFootnote                   // ^[text] footnote defined in place
)
  180. func isReferenceStyleLink(data []byte, pos int, t linkType) bool {
  181. if t == linkDeferredFootnote {
  182. return false
  183. }
  184. return pos < len(data)-1 && data[pos] == '[' && data[pos+1] != '^'
  185. }
  186. func maybeImage(p *Markdown, data []byte, offset int) (int, *Node) {
  187. if offset < len(data)-1 && data[offset+1] == '[' {
  188. return link(p, data, offset)
  189. }
  190. return 0, nil
  191. }
  192. func maybeInlineFootnote(p *Markdown, data []byte, offset int) (int, *Node) {
  193. if offset < len(data)-1 && data[offset+1] == '[' {
  194. return link(p, data, offset)
  195. }
  196. return 0, nil
  197. }
// '[': parse a link or an image or a footnote
//
// link is entered with data[offset] on '[' (regular link or deferred
// footnote), on '!' (image, via maybeImage) or on '^' (inline footnote, via
// maybeInlineFootnote). It recognizes three forms after the bracketed text:
// an inline "(destination "title")", a reference "[id]", and the shortcut
// form with nothing following (also used for both footnote kinds).
//
// It returns the number of bytes consumed and the Link or Image node, or
// (0, nil) when the text is not a valid construct: nested inside another
// link, no closing bracket, unterminated destination, unknown reference, or
// a regular link/image without a destination (or a regular link without
// text).
//
// Side effects visible here: footnotes are appended to p.notes, and
// p.insideLink is forced to true while the text of a regular link is parsed
// recursively.
func link(p *Markdown, data []byte, offset int) (int, *Node) {
	// no links allowed inside regular links, footnote, and deferred footnotes
	if p.insideLink && (offset > 0 && data[offset-1] == '[' || len(data)-1 > offset && data[offset+1] == '^') {
		return 0, nil
	}

	// Classify the construct from its first one or two bytes. For images and
	// inline footnotes offset is advanced past the marker so that it points
	// at the '['.
	var t linkType
	switch {
	// special case: ![^text] == deferred footnote (that follows something with
	// an exclamation point)
	case p.extensions&Footnotes != 0 && len(data)-1 > offset && data[offset+1] == '^':
		t = linkDeferredFootnote
	// ![alt] == image
	case offset >= 0 && data[offset] == '!':
		t = linkImg
		offset++
	// ^[text] == inline footnote
	// [^refId] == deferred footnote
	case p.extensions&Footnotes != 0:
		if offset >= 0 && data[offset] == '^' {
			t = linkInlineFootnote
			offset++
		} else if len(data)-1 > offset && data[offset+1] == '^' {
			t = linkDeferredFootnote
		}
	// [text] == regular link
	default:
		t = linkNormal
	}

	// From here on every index is relative to the opening '['.
	data = data[offset:]

	var (
		i                       = 1
		noteID                  int
		title, link, altContent []byte
		textHasNl               = false
	)

	// Step over the '^' of a deferred footnote.
	if t == linkDeferredFootnote {
		i++
	}

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case isBackslashEscaped(data, i):
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	if i >= len(data) {
		return 0, nil
	}

	// txtE is the index of the closing ']' of the bracketed text.
	txtE := i
	i++
	var footnoteNode *Node

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
	findlinkend:
		for i < len(data) {
			switch {
			case data[i] == '\\':
				i += 2

			case data[i] == ')' || data[i] == '\'' || data[i] == '"':
				break findlinkend

			default:
				i++
			}
		}

		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

		findtitleend:
			for i < len(data) {
				switch {
				case data[i] == '\\':
					i += 2

				case data[i] == ')':
					break findtitleend

				default:
					i++
				}
			}

			if i >= len(data) {
				return 0, nil
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence
			if data[titleE] != '\'' && data[titleE] != '"' {
				// No closing quote: there is no title, and everything up to
				// the ')' belongs to the destination.
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		// step over the closing ')'
		i++

	// reference style link
	case isReferenceStyleLink(data, i, t):
		var id []byte
		altContentConsidered := false

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			// Empty "[]": the bracketed text itself is the id.
			if textHasNl {
				// Build the id with each newline replaced by a single space
				// (or dropped when a space already precedes it).
				var b bytes.Buffer

				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
				altContentConsidered = true
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id
		lr, ok := p.getRef(string(id))
		if !ok {
			return 0, nil
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		if altContentConsidered {
			altContent = lr.text
		}
		i++

	// shortcut reference style link or reference or inline footnote
	default:
		var id []byte

		// craft the id
		if textHasNl {
			// Same newline folding as in the reference case.
			var b bytes.Buffer

			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			if t == linkDeferredFootnote {
				id = data[2:txtE] // get rid of the ^
			} else {
				id = data[1:txtE]
			}
		}

		footnoteNode = NewNode(Item)
		if t == linkInlineFootnote {
			// create a new reference
			noteID = len(p.notes) + 1

			var fragment []byte
			if len(id) > 0 {
				// The fragment is sized from id (capped at 16 bytes) and then
				// filled from slugify(id).
				// NOTE(review): if slugify can return fewer bytes than
				// len(id), the tail of fragment stays zero-filled - confirm
				// against slugify.
				if len(id) < 16 {
					fragment = make([]byte, len(id))
				} else {
					fragment = make([]byte, 16)
				}
				copy(fragment, slugify(id))
			} else {
				fragment = append([]byte("footnote-"), []byte(strconv.Itoa(noteID))...)
			}

			ref := &reference{
				noteID:   noteID,
				hasBlock: false,
				link:     fragment,
				title:    id,
				footnote: footnoteNode,
			}

			p.notes = append(p.notes, ref)

			link = ref.link
			title = ref.title
		} else {
			// find the reference with matching id
			lr, ok := p.getRef(string(id))
			if !ok {
				return 0, nil
			}

			if t == linkDeferredFootnote {
				lr.noteID = len(p.notes) + 1
				lr.footnote = footnoteNode
				p.notes = append(p.notes, lr)
			}

			// keep link and title from reference
			link = lr.link
			// if inline footnote, title == footnote contents
			title = lr.title
			noteID = lr.noteID
		}

		// rewind the whitespace
		i = txtE + 1
	}

	var uLink []byte
	if t == linkNormal || t == linkImg {
		if len(link) > 0 {
			var uLinkBuf bytes.Buffer
			unescapeText(&uLinkBuf, link)
			uLink = uLinkBuf.Bytes()
		}

		// links need something to click on and somewhere to go
		if len(uLink) == 0 || (t == linkNormal && txtE <= 1) {
			return 0, nil
		}
	}

	// call the relevant rendering function
	var linkNode *Node
	switch t {
	case linkNormal:
		linkNode = NewNode(Link)
		linkNode.Destination = normalizeURI(uLink)
		linkNode.Title = title
		if len(altContent) > 0 {
			linkNode.AppendChild(text(altContent))
		} else {
			// links cannot contain other links, so turn off link parsing
			// temporarily and recurse
			insideLink := p.insideLink
			p.insideLink = true
			p.inline(linkNode, data[1:txtE])
			p.insideLink = insideLink
		}

	case linkImg:
		linkNode = NewNode(Image)
		linkNode.Destination = uLink
		linkNode.Title = title
		linkNode.AppendChild(text(data[1:txtE]))
		// one more byte consumed: the '!' that offset was advanced past
		i++

	case linkInlineFootnote, linkDeferredFootnote:
		linkNode = NewNode(Link)
		linkNode.Destination = link
		linkNode.Title = title
		linkNode.NoteID = noteID
		linkNode.Footnote = footnoteNode
		if t == linkInlineFootnote {
			// one more byte consumed: the '^' that offset was advanced past
			i++
		}

	default:
		return 0, nil
	}

	return i, linkNode
}
  500. func (p *Markdown) inlineHTMLComment(data []byte) int {
  501. if len(data) < 5 {
  502. return 0
  503. }
  504. if data[0] != '<' || data[1] != '!' || data[2] != '-' || data[3] != '-' {
  505. return 0
  506. }
  507. i := 5
  508. // scan for an end-of-comment marker, across lines if necessary
  509. for i < len(data) && !(data[i-2] == '-' && data[i-1] == '-' && data[i] == '>') {
  510. i++
  511. }
  512. // no end-of-comment marker
  513. if i >= len(data) {
  514. return 0
  515. }
  516. return i + 1
  517. }
  518. func stripMailto(link []byte) []byte {
  519. if bytes.HasPrefix(link, []byte("mailto://")) {
  520. return link[9:]
  521. } else if bytes.HasPrefix(link, []byte("mailto:")) {
  522. return link[7:]
  523. } else {
  524. return link
  525. }
  526. }
// autolinkType specifies a kind of autolink that gets detected.
type autolinkType int

// These are the possible flag values for the autolink renderer.
const (
	notAutolink    autolinkType = iota // not an autolink (possibly a plain tag)
	normalAutolink                     // <scheme:...> style autolink
	emailAutolink                      // <user@host> style autolink
)
  535. // '<' when tags or autolinks are allowed
  536. func leftAngle(p *Markdown, data []byte, offset int) (int, *Node) {
  537. data = data[offset:]
  538. altype, end := tagLength(data)
  539. if size := p.inlineHTMLComment(data); size > 0 {
  540. end = size
  541. }
  542. if end > 2 {
  543. if altype != notAutolink {
  544. var uLink bytes.Buffer
  545. unescapeText(&uLink, data[1:end+1-2])
  546. if uLink.Len() > 0 {
  547. link := uLink.Bytes()
  548. node := NewNode(Link)
  549. node.Destination = link
  550. if altype == emailAutolink {
  551. node.Destination = append([]byte("mailto:"), link...)
  552. }
  553. node.AppendChild(text(stripMailto(link)))
  554. return end, node
  555. }
  556. } else {
  557. htmlTag := NewNode(HTMLSpan)
  558. htmlTag.Literal = data[:end]
  559. return end, htmlTag
  560. }
  561. }
  562. return end, nil
  563. }
  564. // '\\' backslash escape
  565. var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>~")
  566. func escape(p *Markdown, data []byte, offset int) (int, *Node) {
  567. data = data[offset:]
  568. if len(data) > 1 {
  569. if p.extensions&BackslashLineBreak != 0 && data[1] == '\n' {
  570. return 2, NewNode(Hardbreak)
  571. }
  572. if bytes.IndexByte(escapeChars, data[1]) < 0 {
  573. return 0, nil
  574. }
  575. return 2, text(data[1:2])
  576. }
  577. return 2, nil
  578. }
  579. func unescapeText(ob *bytes.Buffer, src []byte) {
  580. i := 0
  581. for i < len(src) {
  582. org := i
  583. for i < len(src) && src[i] != '\\' {
  584. i++
  585. }
  586. if i > org {
  587. ob.Write(src[org:i])
  588. }
  589. if i+1 >= len(src) {
  590. break
  591. }
  592. ob.WriteByte(src[i+1])
  593. i += 2
  594. }
  595. }
  596. // '&' escaped when it doesn't belong to an entity
  597. // valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
  598. func entity(p *Markdown, data []byte, offset int) (int, *Node) {
  599. data = data[offset:]
  600. end := 1
  601. if end < len(data) && data[end] == '#' {
  602. end++
  603. }
  604. for end < len(data) && isalnum(data[end]) {
  605. end++
  606. }
  607. if end < len(data) && data[end] == ';' {
  608. end++ // real entity
  609. } else {
  610. return 0, nil // lone '&'
  611. }
  612. ent := data[:end]
  613. // undo &amp; escaping or it will be converted to &amp;amp; by another
  614. // escaper in the renderer
  615. if bytes.Equal(ent, []byte("&amp;")) {
  616. ent = []byte{'&'}
  617. }
  618. return end, text(ent)
  619. }
  620. func linkEndsWithEntity(data []byte, linkEnd int) bool {
  621. entityRanges := htmlEntityRe.FindAllIndex(data[:linkEnd], -1)
  622. return entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd
  623. }
  624. // hasPrefixCaseInsensitive is a custom implementation of
  625. // strings.HasPrefix(strings.ToLower(s), prefix)
  626. // we rolled our own because ToLower pulls in a huge machinery of lowercasing
  627. // anything from Unicode and that's very slow. Since this func will only be
  628. // used on ASCII protocol prefixes, we can take shortcuts.
  629. func hasPrefixCaseInsensitive(s, prefix []byte) bool {
  630. if len(s) < len(prefix) {
  631. return false
  632. }
  633. delta := byte('a' - 'A')
  634. for i, b := range prefix {
  635. if b != s[i] && b != s[i]+delta {
  636. return false
  637. }
  638. }
  639. return true
  640. }
// protocolPrefixes lists the lower-case scheme prefixes that make
// maybeAutoLink hand a piece of text over to autoLink.
var protocolPrefixes = [][]byte{
	[]byte("http://"),
	[]byte("https://"),
	[]byte("ftp://"),
	[]byte("file://"),
	[]byte("mailto:"),
}

const shortestPrefix = 6 // len("ftp://"), the shortest of the above
  649. func maybeAutoLink(p *Markdown, data []byte, offset int) (int, *Node) {
  650. // quick check to rule out most false hits
  651. if p.insideLink || len(data) < offset+shortestPrefix {
  652. return 0, nil
  653. }
  654. for _, prefix := range protocolPrefixes {
  655. endOfHead := offset + 8 // 8 is the len() of the longest prefix
  656. if endOfHead > len(data) {
  657. endOfHead = len(data)
  658. }
  659. if hasPrefixCaseInsensitive(data[offset:endOfHead], prefix) {
  660. return autoLink(p, data, offset)
  661. }
  662. }
  663. return 0, nil
  664. }
// autoLink turns a bare URL found at data[offset] into a Link node.
//
// If the URL already sits inside a rendered anchor matching anchorRe, the
// remainder of that anchor is emitted as an HTMLSpan instead. Otherwise the
// start is rewound over any letters directly before offset, the candidate is
// vetted with isSafeLink, and the link is extended up to the first
// isEndOfLink byte. Trailing punctuation that should not belong to the URL
// ('.', ',', a ';' that does not close an entity, or a closing
// quote/bracket whose opener lies before the URL on the same line) is then
// trimmed.
//
// It returns (0, nil) when the candidate is rejected; otherwise it returns
// linkEnd, the link length measured from the rewound start, and the node.
func autoLink(p *Markdown, data []byte, offset int) (int, *Node) {
	// Now a more expensive check to see if we're not inside an anchor element
	anchorStart := offset
	offsetFromAnchor := 0
	for anchorStart > 0 && data[anchorStart] != '<' {
		anchorStart--
		offsetFromAnchor++
	}

	anchorStr := anchorRe.Find(data[anchorStart:])
	if anchorStr != nil {
		// Emit everything from offset to the end of the anchor verbatim.
		anchorClose := NewNode(HTMLSpan)
		anchorClose.Literal = anchorStr[offsetFromAnchor:]
		return len(anchorStr) - offsetFromAnchor, anchorClose
	}

	// scan backward for a word boundary
	rewind := 0
	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
		rewind++
	}
	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
		return 0, nil
	}

	// data now starts at the rewound position; origData keeps the full
	// buffer for the backward delimiter scan below.
	origData := data
	data = data[offset-rewind:]

	if !isSafeLink(data) {
		return 0, nil
	}

	linkEnd := 0
	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
		linkEnd++
	}

	// Skip punctuation at the end of the link
	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
		linkEnd--
	}

	// But don't skip semicolon if it's a part of escaped entity:
	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
		linkEnd--
	}

	// See if the link finishes with a punctuation sign that can be closed.
	var copen byte
	switch data[linkEnd-1] {
	case '"':
		copen = '"'
	case '\'':
		copen = '\''
	case ')':
		copen = '('
	case ']':
		copen = '['
	case '}':
		copen = '{'
	default:
		copen = 0
	}

	if copen != 0 {
		// bufEnd indexes origData at the byte just before the trailing
		// delimiter; the loop below walks backward from there.
		bufEnd := offset - rewind + linkEnd - 2

		openDelim := 1

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric))
		 *
		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
			if origData[bufEnd] == data[linkEnd-1] {
				openDelim++
			}

			if origData[bufEnd] == copen {
				openDelim--
			}

			bufEnd--
		}

		// Balanced: drop the trailing delimiter from the link.
		if openDelim == 0 {
			linkEnd--
		}
	}

	var uLink bytes.Buffer
	unescapeText(&uLink, data[:linkEnd])

	if uLink.Len() > 0 {
		node := NewNode(Link)
		node.Destination = uLink.Bytes()
		node.AppendChild(text(uLink.Bytes()))
		return linkEnd, node
	}

	return linkEnd, nil
}
  765. func isEndOfLink(char byte) bool {
  766. return isspace(char) || char == '<'
  767. }
  768. var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
  769. var validPaths = [][]byte{[]byte("/"), []byte("./"), []byte("../")}
  770. func isSafeLink(link []byte) bool {
  771. for _, path := range validPaths {
  772. if len(link) >= len(path) && bytes.Equal(link[:len(path)], path) {
  773. if len(link) == len(path) {
  774. return true
  775. } else if isalnum(link[len(path)]) {
  776. return true
  777. }
  778. }
  779. }
  780. for _, prefix := range validUris {
  781. // TODO: handle unicode here
  782. // case-insensitive prefix test
  783. if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
  784. return true
  785. }
  786. }
  787. return false
  788. }
  789. // return the length of the given tag, or 0 is it's not valid
  790. func tagLength(data []byte) (autolink autolinkType, end int) {
  791. var i, j int
  792. // a valid tag can't be shorter than 3 chars
  793. if len(data) < 3 {
  794. return notAutolink, 0
  795. }
  796. // begins with a '<' optionally followed by '/', followed by letter or number
  797. if data[0] != '<' {
  798. return notAutolink, 0
  799. }
  800. if data[1] == '/' {
  801. i = 2
  802. } else {
  803. i = 1
  804. }
  805. if !isalnum(data[i]) {
  806. return notAutolink, 0
  807. }
  808. // scheme test
  809. autolink = notAutolink
  810. // try to find the beginning of an URI
  811. for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
  812. i++
  813. }
  814. if i > 1 && i < len(data) && data[i] == '@' {
  815. if j = isMailtoAutoLink(data[i:]); j != 0 {
  816. return emailAutolink, i + j
  817. }
  818. }
  819. if i > 2 && i < len(data) && data[i] == ':' {
  820. autolink = normalAutolink
  821. i++
  822. }
  823. // complete autolink test: no whitespace or ' or "
  824. switch {
  825. case i >= len(data):
  826. autolink = notAutolink
  827. case autolink != notAutolink:
  828. j = i
  829. for i < len(data) {
  830. if data[i] == '\\' {
  831. i += 2
  832. } else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
  833. break
  834. } else {
  835. i++
  836. }
  837. }
  838. if i >= len(data) {
  839. return autolink, 0
  840. }
  841. if i > j && data[i] == '>' {
  842. return autolink, i + 1
  843. }
  844. // one of the forbidden chars has been found
  845. autolink = notAutolink
  846. }
  847. i += bytes.IndexByte(data[i:], '>')
  848. if i < 0 {
  849. return autolink, 0
  850. }
  851. return autolink, i + 1
  852. }
  853. // look for the address part of a mail autolink and '>'
  854. // this is less strict than the original markdown e-mail address matching
  855. func isMailtoAutoLink(data []byte) int {
  856. nb := 0
  857. // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
  858. for i := 0; i < len(data); i++ {
  859. if isalnum(data[i]) {
  860. continue
  861. }
  862. switch data[i] {
  863. case '@':
  864. nb++
  865. case '-', '.', '_':
  866. break
  867. case '>':
  868. if nb == 1 {
  869. return i + 1
  870. }
  871. return 0
  872. default:
  873. return 0
  874. }
  875. }
  876. return 0
  877. }
  878. // look for the next emph char, skipping other constructs
  879. func helperFindEmphChar(data []byte, c byte) int {
  880. i := 0
  881. for i < len(data) {
  882. for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
  883. i++
  884. }
  885. if i >= len(data) {
  886. return 0
  887. }
  888. // do not count escaped chars
  889. if i != 0 && data[i-1] == '\\' {
  890. i++
  891. continue
  892. }
  893. if data[i] == c {
  894. return i
  895. }
  896. if data[i] == '`' {
  897. // skip a code span
  898. tmpI := 0
  899. i++
  900. for i < len(data) && data[i] != '`' {
  901. if tmpI == 0 && data[i] == c {
  902. tmpI = i
  903. }
  904. i++
  905. }
  906. if i >= len(data) {
  907. return tmpI
  908. }
  909. i++
  910. } else if data[i] == '[' {
  911. // skip a link
  912. tmpI := 0
  913. i++
  914. for i < len(data) && data[i] != ']' {
  915. if tmpI == 0 && data[i] == c {
  916. tmpI = i
  917. }
  918. i++
  919. }
  920. i++
  921. for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
  922. i++
  923. }
  924. if i >= len(data) {
  925. return tmpI
  926. }
  927. if data[i] != '[' && data[i] != '(' { // not a link
  928. if tmpI > 0 {
  929. return tmpI
  930. }
  931. continue
  932. }
  933. cc := data[i]
  934. i++
  935. for i < len(data) && data[i] != cc {
  936. if tmpI == 0 && data[i] == c {
  937. return i
  938. }
  939. i++
  940. }
  941. if i >= len(data) {
  942. return tmpI
  943. }
  944. i++
  945. }
  946. }
  947. return 0
  948. }
  949. func helperEmphasis(p *Markdown, data []byte, c byte) (int, *Node) {
  950. i := 0
  951. // skip one symbol if coming from emph3
  952. if len(data) > 1 && data[0] == c && data[1] == c {
  953. i = 1
  954. }
  955. for i < len(data) {
  956. length := helperFindEmphChar(data[i:], c)
  957. if length == 0 {
  958. return 0, nil
  959. }
  960. i += length
  961. if i >= len(data) {
  962. return 0, nil
  963. }
  964. if i+1 < len(data) && data[i+1] == c {
  965. i++
  966. continue
  967. }
  968. if data[i] == c && !isspace(data[i-1]) {
  969. if p.extensions&NoIntraEmphasis != 0 {
  970. if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
  971. continue
  972. }
  973. }
  974. emph := NewNode(Emph)
  975. p.inline(emph, data[:i])
  976. return i + 1, emph
  977. }
  978. }
  979. return 0, nil
  980. }
  981. func helperDoubleEmphasis(p *Markdown, data []byte, c byte) (int, *Node) {
  982. i := 0
  983. for i < len(data) {
  984. length := helperFindEmphChar(data[i:], c)
  985. if length == 0 {
  986. return 0, nil
  987. }
  988. i += length
  989. if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
  990. nodeType := Strong
  991. if c == '~' {
  992. nodeType = Del
  993. }
  994. node := NewNode(nodeType)
  995. p.inline(node, data[:i])
  996. return i + 2, node
  997. }
  998. i++
  999. }
  1000. return 0, nil
  1001. }
  1002. func helperTripleEmphasis(p *Markdown, data []byte, offset int, c byte) (int, *Node) {
  1003. i := 0
  1004. origData := data
  1005. data = data[offset:]
  1006. for i < len(data) {
  1007. length := helperFindEmphChar(data[i:], c)
  1008. if length == 0 {
  1009. return 0, nil
  1010. }
  1011. i += length
  1012. // skip whitespace preceded symbols
  1013. if data[i] != c || isspace(data[i-1]) {
  1014. continue
  1015. }
  1016. switch {
  1017. case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
  1018. // triple symbol found
  1019. strong := NewNode(Strong)
  1020. em := NewNode(Emph)
  1021. strong.AppendChild(em)
  1022. p.inline(em, data[:i])
  1023. return i + 3, strong
  1024. case (i+1 < len(data) && data[i+1] == c):
  1025. // double symbol found, hand over to emph1
  1026. length, node := helperEmphasis(p, origData[offset-2:], c)
  1027. if length == 0 {
  1028. return 0, nil
  1029. }
  1030. return length - 2, node
  1031. default:
  1032. // single symbol found, hand over to emph2
  1033. length, node := helperDoubleEmphasis(p, origData[offset-1:], c)
  1034. if length == 0 {
  1035. return 0, nil
  1036. }
  1037. return length - 1, node
  1038. }
  1039. }
  1040. return 0, nil
  1041. }
  1042. func text(s []byte) *Node {
  1043. node := NewNode(Text)
  1044. node.Literal = s
  1045. return node
  1046. }
  1047. func normalizeURI(s []byte) []byte {
  1048. return s // TODO: implement
  1049. }