sanitize.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. // Copyright (c) 2014, David Kitchen <david@buro9.com>
  2. //
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice, this
  9. // list of conditions and the following disclaimer.
  10. //
  11. // * Redistributions in binary form must reproduce the above copyright notice,
  12. // this list of conditions and the following disclaimer in the documentation
  13. // and/or other materials provided with the distribution.
  14. //
  15. // * Neither the name of the organisation (Microcosm) nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. package bluemonday
  30. import (
  31. "bytes"
  32. "io"
  33. "net/url"
  34. "strings"
  35. "golang.org/x/net/html"
  36. )
  37. // Sanitize takes a string that contains a HTML fragment or document and applies
  38. // the given policy whitelist.
  39. //
  40. // It returns a HTML string that has been sanitized by the policy or an empty
  41. // string if an error has occurred (most likely as a consequence of extremely
  42. // malformed input)
  43. func (p *Policy) Sanitize(s string) string {
  44. if strings.TrimSpace(s) == "" {
  45. return s
  46. }
  47. return p.sanitize(strings.NewReader(s)).String()
  48. }
  49. // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
  50. // the given policy whitelist.
  51. //
  52. // It returns a []byte containing the HTML that has been sanitized by the policy
  53. // or an empty []byte if an error has occurred (most likely as a consequence of
  54. // extremely malformed input)
  55. func (p *Policy) SanitizeBytes(b []byte) []byte {
  56. if len(bytes.TrimSpace(b)) == 0 {
  57. return b
  58. }
  59. return p.sanitize(bytes.NewReader(b)).Bytes()
  60. }
  61. // SanitizeReader takes an io.Reader that contains a HTML fragment or document
  62. // and applies the given policy whitelist.
  63. //
  64. // It returns a bytes.Buffer containing the HTML that has been sanitized by the
  65. // policy. Errors during sanitization will merely return an empty result.
  66. func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
  67. return p.sanitize(r)
  68. }
  69. // Performs the actual sanitization process.
  70. func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
  71. // It is possible that the developer has created the policy via:
  72. // p := bluemonday.Policy{}
  73. // rather than:
  74. // p := bluemonday.NewPolicy()
  75. // If this is the case, and if they haven't yet triggered an action that
  76. // would initiliaze the maps, then we need to do that.
  77. p.init()
  78. var (
  79. buff bytes.Buffer
  80. skipElementContent bool
  81. skippingElementsCount int64
  82. skipClosingTag bool
  83. closingTagToSkipStack []string
  84. mostRecentlyStartedToken string
  85. )
  86. tokenizer := html.NewTokenizer(r)
  87. for {
  88. if tokenizer.Next() == html.ErrorToken {
  89. err := tokenizer.Err()
  90. if err == io.EOF {
  91. // End of input means end of processing
  92. return &buff
  93. }
  94. // Raw tokenizer error
  95. return &bytes.Buffer{}
  96. }
  97. token := tokenizer.Token()
  98. switch token.Type {
  99. case html.DoctypeToken:
  100. if p.allowDocType {
  101. buff.WriteString(token.String())
  102. }
  103. case html.CommentToken:
  104. // Comments are ignored by default
  105. case html.StartTagToken:
  106. mostRecentlyStartedToken = token.Data
  107. aps, ok := p.elsAndAttrs[token.Data]
  108. if !ok {
  109. if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
  110. skipElementContent = true
  111. skippingElementsCount++
  112. }
  113. if p.addSpaces {
  114. buff.WriteString(" ")
  115. }
  116. break
  117. }
  118. if len(token.Attr) != 0 {
  119. token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
  120. }
  121. if len(token.Attr) == 0 {
  122. if !p.allowNoAttrs(token.Data) {
  123. skipClosingTag = true
  124. closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
  125. if p.addSpaces {
  126. buff.WriteString(" ")
  127. }
  128. break
  129. }
  130. }
  131. if !skipElementContent {
  132. buff.WriteString(token.String())
  133. }
  134. case html.EndTagToken:
  135. if mostRecentlyStartedToken == token.Data {
  136. mostRecentlyStartedToken = ""
  137. }
  138. if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
  139. closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
  140. if len(closingTagToSkipStack) == 0 {
  141. skipClosingTag = false
  142. }
  143. if p.addSpaces {
  144. buff.WriteString(" ")
  145. }
  146. break
  147. }
  148. if _, ok := p.elsAndAttrs[token.Data]; !ok {
  149. if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
  150. skippingElementsCount--
  151. if skippingElementsCount == 0 {
  152. skipElementContent = false
  153. }
  154. }
  155. if p.addSpaces {
  156. buff.WriteString(" ")
  157. }
  158. break
  159. }
  160. if !skipElementContent {
  161. buff.WriteString(token.String())
  162. }
  163. case html.SelfClosingTagToken:
  164. aps, ok := p.elsAndAttrs[token.Data]
  165. if !ok {
  166. if p.addSpaces {
  167. buff.WriteString(" ")
  168. }
  169. break
  170. }
  171. if len(token.Attr) != 0 {
  172. token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
  173. }
  174. if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
  175. if p.addSpaces {
  176. buff.WriteString(" ")
  177. }
  178. break
  179. }
  180. if !skipElementContent {
  181. buff.WriteString(token.String())
  182. }
  183. case html.TextToken:
  184. if !skipElementContent {
  185. switch strings.ToLower(mostRecentlyStartedToken) {
  186. case "script":
  187. // not encouraged, but if a policy allows JavaScript we
  188. // should not HTML escape it as that would break the output
  189. buff.WriteString(token.Data)
  190. case "style":
  191. // not encouraged, but if a policy allows CSS styles we
  192. // should not HTML escape it as that would break the output
  193. buff.WriteString(token.Data)
  194. default:
  195. // HTML escape the text
  196. buff.WriteString(token.String())
  197. }
  198. }
  199. default:
  200. // A token that didn't exist in the html package when we wrote this
  201. return &bytes.Buffer{}
  202. }
  203. }
  204. }
  205. // sanitizeAttrs takes a set of element attribute policies and the global
  206. // attribute policies and applies them to the []html.Attribute returning a set
  207. // of html.Attributes that match the policies
  208. func (p *Policy) sanitizeAttrs(
  209. elementName string,
  210. attrs []html.Attribute,
  211. aps map[string]attrPolicy,
  212. ) []html.Attribute {
  213. if len(attrs) == 0 {
  214. return attrs
  215. }
  216. // Builds a new attribute slice based on the whether the attribute has been
  217. // whitelisted explicitly or globally.
  218. cleanAttrs := []html.Attribute{}
  219. for _, htmlAttr := range attrs {
  220. // Is there an element specific attribute policy that applies?
  221. if ap, ok := aps[htmlAttr.Key]; ok {
  222. if ap.regexp != nil {
  223. if ap.regexp.MatchString(htmlAttr.Val) {
  224. cleanAttrs = append(cleanAttrs, htmlAttr)
  225. continue
  226. }
  227. } else {
  228. cleanAttrs = append(cleanAttrs, htmlAttr)
  229. continue
  230. }
  231. }
  232. // Is there a global attribute policy that applies?
  233. if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
  234. if ap.regexp != nil {
  235. if ap.regexp.MatchString(htmlAttr.Val) {
  236. cleanAttrs = append(cleanAttrs, htmlAttr)
  237. }
  238. } else {
  239. cleanAttrs = append(cleanAttrs, htmlAttr)
  240. }
  241. }
  242. }
  243. if len(cleanAttrs) == 0 {
  244. // If nothing was allowed, let's get out of here
  245. return cleanAttrs
  246. }
  247. // cleanAttrs now contains the attributes that are permitted
  248. if linkable(elementName) {
  249. if p.requireParseableURLs {
  250. // Ensure URLs are parseable:
  251. // - a.href
  252. // - area.href
  253. // - link.href
  254. // - blockquote.cite
  255. // - q.cite
  256. // - img.src
  257. // - script.src
  258. tmpAttrs := []html.Attribute{}
  259. for _, htmlAttr := range cleanAttrs {
  260. switch elementName {
  261. case "a", "area", "link":
  262. if htmlAttr.Key == "href" {
  263. if u, ok := p.validURL(htmlAttr.Val); ok {
  264. htmlAttr.Val = u
  265. tmpAttrs = append(tmpAttrs, htmlAttr)
  266. }
  267. break
  268. }
  269. tmpAttrs = append(tmpAttrs, htmlAttr)
  270. case "blockquote", "q":
  271. if htmlAttr.Key == "cite" {
  272. if u, ok := p.validURL(htmlAttr.Val); ok {
  273. htmlAttr.Val = u
  274. tmpAttrs = append(tmpAttrs, htmlAttr)
  275. }
  276. break
  277. }
  278. tmpAttrs = append(tmpAttrs, htmlAttr)
  279. case "img", "script":
  280. if htmlAttr.Key == "src" {
  281. if u, ok := p.validURL(htmlAttr.Val); ok {
  282. htmlAttr.Val = u
  283. tmpAttrs = append(tmpAttrs, htmlAttr)
  284. }
  285. break
  286. }
  287. tmpAttrs = append(tmpAttrs, htmlAttr)
  288. default:
  289. tmpAttrs = append(tmpAttrs, htmlAttr)
  290. }
  291. }
  292. cleanAttrs = tmpAttrs
  293. }
  294. if (p.requireNoFollow ||
  295. p.requireNoFollowFullyQualifiedLinks ||
  296. p.addTargetBlankToFullyQualifiedLinks) &&
  297. len(cleanAttrs) > 0 {
  298. // Add rel="nofollow" if a "href" exists
  299. switch elementName {
  300. case "a", "area", "link":
  301. var hrefFound bool
  302. var externalLink bool
  303. for _, htmlAttr := range cleanAttrs {
  304. if htmlAttr.Key == "href" {
  305. hrefFound = true
  306. u, err := url.Parse(htmlAttr.Val)
  307. if err != nil {
  308. continue
  309. }
  310. if u.Host != "" {
  311. externalLink = true
  312. }
  313. continue
  314. }
  315. }
  316. if hrefFound {
  317. var (
  318. noFollowFound bool
  319. targetBlankFound bool
  320. )
  321. addNoFollow := (p.requireNoFollow ||
  322. externalLink && p.requireNoFollowFullyQualifiedLinks)
  323. addTargetBlank := (externalLink &&
  324. p.addTargetBlankToFullyQualifiedLinks)
  325. tmpAttrs := []html.Attribute{}
  326. for _, htmlAttr := range cleanAttrs {
  327. var appended bool
  328. if htmlAttr.Key == "rel" && addNoFollow {
  329. if strings.Contains(htmlAttr.Val, "nofollow") {
  330. noFollowFound = true
  331. tmpAttrs = append(tmpAttrs, htmlAttr)
  332. appended = true
  333. } else {
  334. htmlAttr.Val += " nofollow"
  335. noFollowFound = true
  336. tmpAttrs = append(tmpAttrs, htmlAttr)
  337. appended = true
  338. }
  339. }
  340. if elementName == "a" && htmlAttr.Key == "target" {
  341. if htmlAttr.Val == "_blank" {
  342. targetBlankFound = true
  343. }
  344. if addTargetBlank && !targetBlankFound {
  345. htmlAttr.Val = "_blank"
  346. targetBlankFound = true
  347. tmpAttrs = append(tmpAttrs, htmlAttr)
  348. appended = true
  349. }
  350. }
  351. if !appended {
  352. tmpAttrs = append(tmpAttrs, htmlAttr)
  353. }
  354. }
  355. if noFollowFound || targetBlankFound {
  356. cleanAttrs = tmpAttrs
  357. }
  358. if addNoFollow && !noFollowFound {
  359. rel := html.Attribute{}
  360. rel.Key = "rel"
  361. rel.Val = "nofollow"
  362. cleanAttrs = append(cleanAttrs, rel)
  363. }
  364. if elementName == "a" && addTargetBlank && !targetBlankFound {
  365. rel := html.Attribute{}
  366. rel.Key = "target"
  367. rel.Val = "_blank"
  368. targetBlankFound = true
  369. cleanAttrs = append(cleanAttrs, rel)
  370. }
  371. if targetBlankFound {
  372. // target="_blank" has a security risk that allows the
  373. // opened window/tab to issue JavaScript calls against
  374. // window.opener, which in effect allow the destination
  375. // of the link to control the source:
  376. // https://dev.to/ben/the-targetblank-vulnerability-by-example
  377. //
  378. // To mitigate this risk, we need to add a specific rel
  379. // attribute if it is not already present.
  380. // rel="noopener"
  381. //
  382. // Unfortunately this is processing the rel twice (we
  383. // already looked at it earlier ^^) as we cannot be sure
  384. // of the ordering of the href and rel, and whether we
  385. // have fully satisfied that we need to do this. This
  386. // double processing only happens *if* target="_blank"
  387. // is true.
  388. var noOpenerAdded bool
  389. tmpAttrs := []html.Attribute{}
  390. for _, htmlAttr := range cleanAttrs {
  391. var appended bool
  392. if htmlAttr.Key == "rel" {
  393. if strings.Contains(htmlAttr.Val, "noopener") {
  394. noOpenerAdded = true
  395. tmpAttrs = append(tmpAttrs, htmlAttr)
  396. } else {
  397. htmlAttr.Val += " noopener"
  398. noOpenerAdded = true
  399. tmpAttrs = append(tmpAttrs, htmlAttr)
  400. }
  401. appended = true
  402. }
  403. if !appended {
  404. tmpAttrs = append(tmpAttrs, htmlAttr)
  405. }
  406. }
  407. if noOpenerAdded {
  408. cleanAttrs = tmpAttrs
  409. } else {
  410. // rel attr was not found, or else noopener would
  411. // have been added already
  412. rel := html.Attribute{}
  413. rel.Key = "rel"
  414. rel.Val = "noopener"
  415. cleanAttrs = append(cleanAttrs, rel)
  416. }
  417. }
  418. }
  419. default:
  420. }
  421. }
  422. }
  423. return cleanAttrs
  424. }
  425. func (p *Policy) allowNoAttrs(elementName string) bool {
  426. _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
  427. return ok
  428. }
  429. func (p *Policy) validURL(rawurl string) (string, bool) {
  430. if p.requireParseableURLs {
  431. // URLs do not contain whitespace
  432. if strings.Contains(rawurl, " ") ||
  433. strings.Contains(rawurl, "\t") ||
  434. strings.Contains(rawurl, "\n") {
  435. return "", false
  436. }
  437. u, err := url.Parse(rawurl)
  438. if err != nil {
  439. return "", false
  440. }
  441. if u.Scheme != "" {
  442. urlPolicy, ok := p.allowURLSchemes[u.Scheme]
  443. if !ok {
  444. return "", false
  445. }
  446. if urlPolicy == nil || urlPolicy(u) == true {
  447. return u.String(), true
  448. }
  449. return "", false
  450. }
  451. if p.allowRelativeURLs {
  452. if u.String() != "" {
  453. return u.String(), true
  454. }
  455. }
  456. return "", false
  457. }
  458. return rawurl, true
  459. }
  // linkable reports whether elementName is an element with a URL carrying
  // attribute that sanitizeAttrs checks: href on a, area and link; cite on
  // blockquote and q; src on img and script.
  //
  // The comparison is case-sensitive; names are expected in lower case.
  func linkable(elementName string) bool {
  	switch elementName {
  	case "a", "area", "blockquote", "img", "link", "q", "script":
  		return true
  	default:
  		return false
  	}
  }