sanitize.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089
  1. // Copyright (c) 2014, David Kitchen <david@buro9.com>
  2. //
  3. // All rights reserved.
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice, this
  9. // list of conditions and the following disclaimer.
  10. //
  11. // * Redistributions in binary form must reproduce the above copyright notice,
  12. // this list of conditions and the following disclaimer in the documentation
  13. // and/or other materials provided with the distribution.
  14. //
  15. // * Neither the name of the organisation (Microcosm) nor the names of its
  16. // contributors may be used to endorse or promote products derived from
  17. // this software without specific prior written permission.
  18. //
  19. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22. // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  23. // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25. // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26. // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27. // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. package bluemonday
  30. import (
  31. "bytes"
  32. "fmt"
  33. "io"
  34. "net/url"
  35. "regexp"
  36. "strconv"
  37. "strings"
  38. "golang.org/x/net/html"
  39. "github.com/aymerick/douceur/parser"
  40. )
var (
	// dataAttribute matches attribute keys of the HTML5 data-* form.
	dataAttribute = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches the reserved "xml" prefix, which is not
	// permitted immediately after "data-" in a custom data attribute name.
	dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches characters that are not allowed in
	// the suffix of a data-* attribute name (upper case, semi-colons).
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	// cssUnicodeChar matches a CSS unicode escape sequence, e.g. `\0130 `
	// (backslash, 1-6 lowercase hex digits, optional trailing space).
	cssUnicodeChar = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	// dataURIbase64Prefix matches the prefix of a base64-encoded data URI.
	dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`)
)
  48. // Sanitize takes a string that contains a HTML fragment or document and applies
  49. // the given policy allowlist.
  50. //
  51. // It returns a HTML string that has been sanitized by the policy or an empty
  52. // string if an error has occurred (most likely as a consequence of extremely
  53. // malformed input)
  54. func (p *Policy) Sanitize(s string) string {
  55. if strings.TrimSpace(s) == "" {
  56. return s
  57. }
  58. return p.sanitizeWithBuff(strings.NewReader(s)).String()
  59. }
  60. // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
  61. // the given policy allowlist.
  62. //
  63. // It returns a []byte containing the HTML that has been sanitized by the policy
  64. // or an empty []byte if an error has occurred (most likely as a consequence of
  65. // extremely malformed input)
  66. func (p *Policy) SanitizeBytes(b []byte) []byte {
  67. if len(bytes.TrimSpace(b)) == 0 {
  68. return b
  69. }
  70. return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
  71. }
// SanitizeReader takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization will merely return an empty result.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	// Delegates to sanitizeWithBuff, which swallows errors and returns an
	// empty buffer in their place.
	return p.sanitizeWithBuff(r)
}
// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
// and applies the given policy allowlist and writes to the provided writer returning
// an error if there is one.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	// Unlike the other Sanitize* entry points, this one surfaces the
	// underlying tokenizer/write error to the caller.
	return p.sanitize(r, w)
}
  86. // Query represents a single part of the query string, a query param
  87. type Query struct {
  88. Key string
  89. Value string
  90. HasValue bool
  91. }
  92. func parseQuery(query string) (values []Query, err error) {
  93. // This is essentially a copy of parseQuery from
  94. // https://golang.org/src/net/url/url.go but adjusted to build our values
  95. // based on our type, which we need to preserve the ordering of the query
  96. // string
  97. for query != "" {
  98. key := query
  99. if i := strings.IndexAny(key, "&;"); i >= 0 {
  100. key, query = key[:i], key[i+1:]
  101. } else {
  102. query = ""
  103. }
  104. if key == "" {
  105. continue
  106. }
  107. value := ""
  108. hasValue := false
  109. if i := strings.Index(key, "="); i >= 0 {
  110. key, value = key[:i], key[i+1:]
  111. hasValue = true
  112. }
  113. key, err1 := url.QueryUnescape(key)
  114. if err1 != nil {
  115. if err == nil {
  116. err = err1
  117. }
  118. continue
  119. }
  120. value, err1 = url.QueryUnescape(value)
  121. if err1 != nil {
  122. if err == nil {
  123. err = err1
  124. }
  125. continue
  126. }
  127. values = append(values, Query{
  128. Key: key,
  129. Value: value,
  130. HasValue: hasValue,
  131. })
  132. }
  133. return values, err
  134. }
  135. func encodeQueries(queries []Query) string {
  136. var buff bytes.Buffer
  137. for i, query := range queries {
  138. buff.WriteString(url.QueryEscape(query.Key))
  139. if query.HasValue {
  140. buff.WriteString("=")
  141. buff.WriteString(url.QueryEscape(query.Value))
  142. }
  143. if i < len(queries)-1 {
  144. buff.WriteString("&")
  145. }
  146. }
  147. return buff.String()
  148. }
  149. func sanitizedURL(val string) (string, error) {
  150. u, err := url.Parse(val)
  151. if err != nil {
  152. return "", err
  153. }
  154. // we use parseQuery but not u.Query to keep the order not change because
  155. // url.Values is a map which has a random order.
  156. queryValues, err := parseQuery(u.RawQuery)
  157. if err != nil {
  158. return "", err
  159. }
  160. // sanitize the url query params
  161. for i, query := range queryValues {
  162. queryValues[i].Key = html.EscapeString(query.Key)
  163. }
  164. u.RawQuery = encodeQueries(queryValues)
  165. // u.String() will also sanitize host/scheme/user/pass
  166. return u.String(), nil
  167. }
  168. // Performs the actual sanitization process.
  169. func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
  170. var buff bytes.Buffer
  171. if err := p.sanitize(r, &buff); err != nil {
  172. return &bytes.Buffer{}
  173. }
  174. return &buff
  175. }
  176. type asStringWriter struct {
  177. io.Writer
  178. }
  179. func (a *asStringWriter) WriteString(s string) (int, error) {
  180. return a.Write([]byte(s))
  181. }
// sanitize performs the actual sanitization: it tokenizes the HTML read from
// r using golang.org/x/net/html and writes each token to w if, and only if,
// the policy allows it. It returns the first tokenizer or write error
// encountered, or nil once the input is exhausted.
func (p *Policy) sanitize(r io.Reader, w io.Writer) error {

	// It is possible that the developer has created the policy via:
	// p := bluemonday.Policy{}
	// rather than:
	// p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	// Use the writer's own WriteString when it has one; otherwise wrap it.
	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		skipElementContent       bool     // true while inside an element whose content is being dropped
		skippingElementsCount    int64    // nesting depth of skip-content elements
		skipClosingTag           bool     // true when a dropped start tag's end tag must also be dropped
		closingTagToSkipStack    []string // names of dropped start tags awaiting their end tags
		mostRecentlyStartedToken string   // normalised name of the last start tag seen
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				// NOTE(review): the write error is ignored here, unlike every
				// other write in this function — confirm whether intended.
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			// script/style content is only ever emitted when the policy has
			// explicitly opted in via allowUnsafe.
			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					// Element not allowed at all; if it is registered as a
					// skip-content element, start suppressing everything
					// inside it.
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					// Every attribute was stripped and the element is not
					// allowed bare: drop the tag and remember to drop its
					// matching end tag too.
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			// If this end tag matches the most recently dropped start tag,
			// drop it as well.
			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}

			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						// A regex-allowed element closes: stop skipping.
						skipElementContent = false
						match = true
						break
					}
				}
				// Leaving a skip-content element: decrement the nesting depth
				// and resume output once we are fully outside.
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					// NOTE(review): "!matched" is always true inside this
					// branch, so the extra condition is redundant.
					if p.addSpaces && !matched {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case "style":
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies
//
// elementName is the tag the attributes belong to, attrs are the raw
// attributes from the tokenizer, and aps are the element-specific attribute
// policies already resolved for this element.
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	// Determine whether a "style" attribute on this element would have any
	// style declaration policies to check against.
	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			} else {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		//
		// NOTE(review): unlike the element-specific loop above, this loop
		// does not "continue attrsLoop" after appending, so an attribute that
		// satisfies several global policies could be appended more than once
		// — confirm whether that is intended.
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						// An invalid URL drops the attribute entirely.
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							// Optionally let the policy rewrite the src URL.
							if p.srcRewriter != nil {
								parsedURL, err := url.Parse(u)
								if err != nil {
									// NOTE(review): on a parse error this only
									// prints to stdout, then still passes a
									// nil *url.URL to srcRewriter and calls
									// parsedURL.String(), which would panic —
									// confirm and consider propagating the
									// error instead.
									fmt.Println(err)
								}
								p.srcRewriter(parsedURL)
								u = parsedURL.String()
							}
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						// A non-empty host marks the link as fully qualified
						// (pointing at some origin rather than relative).
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					// First pass: extend any existing rel/target attributes.
					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {
						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					// No rel attribute existed to extend: add a fresh one.
					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					// No target attribute existed: add target="_blank".
					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allow the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present.
						// rel="noopener"
						//
						// Unfortunately this is processing the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								if strings.Contains(htmlAttr.Val, "noopener") {
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								} else {
									htmlAttr.Val += " noopener"
									noOpenerAdded = true
									tmpAttrs = append(tmpAttrs, htmlAttr)
								}

								appended = true
							}

							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		// Enforce crossorigin="anonymous" on elements that fetch remote
		// content.
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for _, htmlAttr := range cleanAttrs {
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					// NOTE(review): htmlAttr is the range loop's copy, so
					// this assignment does not modify cleanAttrs; an existing
					// non-"anonymous" value is left as-is — confirm whether
					// it should be rewritten in place.
					htmlAttr.Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				// Keep only the sandbox tokens the policy allows,
				// de-duplicated and in their original order.
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			// No sandbox attribute present: add an empty one, which applies
			// the full set of sandbox restrictions.
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
  732. func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
  733. sps := p.elsAndStyles[elementName]
  734. if len(sps) == 0 {
  735. sps = map[string][]stylePolicy{}
  736. // check for any matching elements, if we don't already have a policy found
  737. // if multiple matches are found they will be overwritten, it's best
  738. // to not have overlapping matchers
  739. for regex, policies := range p.elsMatchingAndStyles {
  740. if regex.MatchString(elementName) {
  741. for k, v := range policies {
  742. sps[k] = append(sps[k], v...)
  743. }
  744. }
  745. }
  746. }
  747. //Add semi-colon to end to fix parsing issue
  748. attr.Val = strings.TrimRight(attr.Val, " ")
  749. if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
  750. attr.Val = attr.Val + ";"
  751. }
  752. decs, err := parser.ParseDeclarations(attr.Val)
  753. if err != nil {
  754. attr.Val = ""
  755. return attr
  756. }
  757. clean := []string{}
  758. prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
  759. decLoop:
  760. for _, dec := range decs {
  761. tempProperty := strings.ToLower(dec.Property)
  762. tempValue := removeUnicode(strings.ToLower(dec.Value))
  763. for _, i := range prefixes {
  764. tempProperty = strings.TrimPrefix(tempProperty, i)
  765. }
  766. if spl, ok := sps[tempProperty]; ok {
  767. for _, sp := range spl {
  768. if sp.handler != nil {
  769. if sp.handler(tempValue) {
  770. clean = append(clean, dec.Property+": "+dec.Value)
  771. continue decLoop
  772. }
  773. } else if len(sp.enum) > 0 {
  774. if stringInSlice(tempValue, sp.enum) {
  775. clean = append(clean, dec.Property+": "+dec.Value)
  776. continue decLoop
  777. }
  778. } else if sp.regexp != nil {
  779. if sp.regexp.MatchString(tempValue) {
  780. clean = append(clean, dec.Property+": "+dec.Value)
  781. continue decLoop
  782. }
  783. }
  784. }
  785. }
  786. if spl, ok := p.globalStyles[tempProperty]; ok {
  787. for _, sp := range spl {
  788. if sp.handler != nil {
  789. if sp.handler(tempValue) {
  790. clean = append(clean, dec.Property+": "+dec.Value)
  791. continue decLoop
  792. }
  793. } else if len(sp.enum) > 0 {
  794. if stringInSlice(tempValue, sp.enum) {
  795. clean = append(clean, dec.Property+": "+dec.Value)
  796. continue decLoop
  797. }
  798. } else if sp.regexp != nil {
  799. if sp.regexp.MatchString(tempValue) {
  800. clean = append(clean, dec.Property+": "+dec.Value)
  801. continue decLoop
  802. }
  803. }
  804. }
  805. }
  806. }
  807. if len(clean) > 0 {
  808. attr.Val = strings.Join(clean, "; ")
  809. } else {
  810. attr.Val = ""
  811. }
  812. return attr
  813. }
  814. func (p *Policy) allowNoAttrs(elementName string) bool {
  815. _, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
  816. if !ok {
  817. for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
  818. if r.MatchString(elementName) {
  819. ok = true
  820. break
  821. }
  822. }
  823. }
  824. return ok
  825. }
  826. func (p *Policy) validURL(rawurl string) (string, bool) {
  827. if p.requireParseableURLs {
  828. // URLs are valid if when space is trimmed the URL is valid
  829. rawurl = strings.TrimSpace(rawurl)
  830. // URLs cannot contain whitespace, unless it is a data-uri
  831. if strings.Contains(rawurl, " ") ||
  832. strings.Contains(rawurl, "\t") ||
  833. strings.Contains(rawurl, "\n") {
  834. if !strings.HasPrefix(rawurl, `data:`) {
  835. return "", false
  836. }
  837. // Remove \r and \n from base64 encoded data to pass url.Parse.
  838. matched := dataURIbase64Prefix.FindString(rawurl)
  839. if matched != "" {
  840. rawurl = matched + strings.Replace(
  841. strings.Replace(
  842. rawurl[len(matched):],
  843. "\r",
  844. "",
  845. -1,
  846. ),
  847. "\n",
  848. "",
  849. -1,
  850. )
  851. }
  852. }
  853. // URLs are valid if they parse
  854. u, err := url.Parse(rawurl)
  855. if err != nil {
  856. return "", false
  857. }
  858. if u.Scheme != "" {
  859. urlPolicies, ok := p.allowURLSchemes[u.Scheme]
  860. if !ok {
  861. for _, r := range p.allowURLSchemeRegexps {
  862. if r.MatchString(u.Scheme) {
  863. return u.String(), true
  864. }
  865. }
  866. return "", false
  867. }
  868. if len(urlPolicies) == 0 {
  869. return u.String(), true
  870. }
  871. for _, urlPolicy := range urlPolicies {
  872. if urlPolicy(u) {
  873. return u.String(), true
  874. }
  875. }
  876. return "", false
  877. }
  878. if p.allowRelativeURLs {
  879. if u.String() != "" {
  880. return u.String(), true
  881. }
  882. }
  883. return "", false
  884. }
  885. return rawurl, true
  886. }
  887. func linkable(elementName string) bool {
  888. switch elementName {
  889. case "a", "area", "base", "link":
  890. // elements that allow .href
  891. return true
  892. case "blockquote", "del", "ins", "q":
  893. // elements that allow .cite
  894. return true
  895. case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
  896. // elements that allow .src
  897. return true
  898. default:
  899. return false
  900. }
  901. }
  902. // stringInSlice returns true if needle exists in haystack
  903. func stringInSlice(needle string, haystack []string) bool {
  904. for _, straw := range haystack {
  905. if strings.EqualFold(straw, needle) {
  906. return true
  907. }
  908. }
  909. return false
  910. }
  911. func isDataAttribute(val string) bool {
  912. if !dataAttribute.MatchString(val) {
  913. return false
  914. }
  915. rest := strings.Split(val, "data-")
  916. if len(rest) == 1 {
  917. return false
  918. }
  919. // data-xml* is invalid.
  920. if dataAttributeXMLPrefix.MatchString(rest[1]) {
  921. return false
  922. }
  923. // no uppercase or semi-colons allowed.
  924. if dataAttributeInvalidChars.MatchString(rest[1]) {
  925. return false
  926. }
  927. return true
  928. }
// removeUnicode substitutes CSS unicode escape sequences (e.g. `\0130 `) in
// value with the characters they encode, so that style values can be checked
// against the policy allowlists. An escape that cannot be decoded causes the
// entire value to be discarded (empty string returned).
func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		// The escape's hex digits sit between the backslash and the optional
		// trailing space.
		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		if len(character) < 4 {
			// Left-pad with zeros to the 4 hex digits that \uXXXX requires.
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			for len(character) > 4 {
				// More than 4 digits only decodes when the extra leading
				// digits are zero; otherwise the escape is cleared, which
				// forces the Unquote error path below.
				if character[0] != '0' {
					character = ""
					break
				} else {
					character = character[1:]
				}
			}
		}
		// Decode the \uXXXX escape by round-tripping through a quoted Go
		// string literal.
		character = "\\u" + character
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		// Splice the decoded character over the escape and rescan, as the
		// replacement may shift the positions of later escapes.
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
  958. func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
  959. aps := make(map[string][]attrPolicy, 0)
  960. matched := false
  961. for regex, attrs := range p.elsMatchingAndAttrs {
  962. if regex.MatchString(elementName) {
  963. matched = true
  964. for k, v := range attrs {
  965. aps[k] = append(aps[k], v...)
  966. }
  967. }
  968. }
  969. return aps, matched
  970. }
  971. // normaliseElementName takes a HTML element like <script> which is user input
  972. // and returns a lower case version of it that is immune to UTF-8 to ASCII
  973. // conversion tricks (like the use of upper case cyrillic i scrİpt which a
  974. // strings.ToLower would convert to script). Instead this func will preserve
  975. // all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
  976. // characters when lower cased
  977. func normaliseElementName(str string) string {
  978. // that useful QuoteToASCII put quote marks at the start and end
  979. // so those are trimmed off
  980. return strings.TrimSuffix(
  981. strings.TrimPrefix(
  982. strings.ToLower(
  983. strconv.QuoteToASCII(str),
  984. ),
  985. `"`),
  986. `"`,
  987. )
  988. }