html.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. // Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html.
  2. package html
import (
	"bytes"
	"fmt"
	"io"
	"os"

	"github.com/tdewolff/minify/v2"
	"github.com/tdewolff/parse/v2"
	"github.com/tdewolff/parse/v2/buffer"
	"github.com/tdewolff/parse/v2/html"
)
  12. var (
  13. gtBytes = []byte(">")
  14. isBytes = []byte("=")
  15. spaceBytes = []byte(" ")
  16. doctypeBytes = []byte("<!doctype html>")
  17. jsMimeBytes = []byte("application/javascript")
  18. cssMimeBytes = []byte("text/css")
  19. htmlMimeBytes = []byte("text/html")
  20. svgMimeBytes = []byte("image/svg+xml")
  21. formMimeBytes = []byte("application/x-www-form-urlencoded")
  22. mathMimeBytes = []byte("application/mathml+xml")
  23. dataSchemeBytes = []byte("data:")
  24. jsSchemeBytes = []byte("javascript:")
  25. httpBytes = []byte("http")
  26. radioBytes = []byte("radio")
  27. onBytes = []byte("on")
  28. textBytes = []byte("text")
  29. noneBytes = []byte("none")
  30. submitBytes = []byte("submit")
  31. allBytes = []byte("all")
  32. rectBytes = []byte("rect")
  33. dataBytes = []byte("data")
  34. getBytes = []byte("get")
  35. autoBytes = []byte("auto")
  36. oneBytes = []byte("one")
  37. inlineParams = map[string]string{"inline": "1"}
  38. )
  39. ////////////////////////////////////////////////////////////////
  40. var GoTemplateDelims = [2]string{"{{", "}}"}
  41. var HandlebarsTemplateDelims = [2]string{"{{", "}}"}
  42. var MustacheTemplateDelims = [2]string{"{{", "}}"}
  43. var EJSTemplateDelims = [2]string{"<%", "%>"}
  44. var ASPTemplateDelims = [2]string{"<%", "%>"}
  45. var PHPTemplateDelims = [2]string{"<?", "?>"}
  46. // Minifier is an HTML minifier.
  47. type Minifier struct {
  48. KeepComments bool
  49. KeepConditionalComments bool
  50. KeepSpecialComments bool
  51. KeepDefaultAttrVals bool
  52. KeepDocumentTags bool
  53. KeepEndTags bool
  54. KeepQuotes bool
  55. KeepWhitespace bool
  56. TemplateDelims [2]string
  57. }
  58. // Minify minifies HTML data, it reads from r and writes to w.
  59. func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error {
  60. return (&Minifier{}).Minify(m, w, r, params)
  61. }
  62. // Minify minifies HTML data, it reads from r and writes to w.
  63. func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error {
  64. var rawTagHash Hash
  65. var rawTagMediatype []byte
  66. if o.KeepConditionalComments {
  67. fmt.Println("DEPRECATED: KeepConditionalComments is replaced by KeepSpecialComments")
  68. o.KeepSpecialComments = true
  69. }
  70. omitSpace := true // if true the next leading space is omitted
  71. inPre := false
  72. attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64))
  73. attrByteBuffer := make([]byte, 0, 64)
  74. z := parse.NewInput(r)
  75. defer z.Restore()
  76. l := html.NewTemplateLexer(z, o.TemplateDelims)
  77. tb := NewTokenBuffer(z, l)
  78. for {
  79. t := *tb.Shift()
  80. switch t.TokenType {
  81. case html.ErrorToken:
  82. if _, err := w.Write(nil); err != nil {
  83. return err
  84. }
  85. if l.Err() == io.EOF {
  86. return nil
  87. }
  88. return l.Err()
  89. case html.DoctypeToken:
  90. w.Write(doctypeBytes)
  91. case html.CommentToken:
  92. if o.KeepComments {
  93. w.Write(t.Data)
  94. } else if o.KeepSpecialComments {
  95. if 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) {
  96. // [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed
  97. // see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax
  98. if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && bytes.HasSuffix(t.Data, []byte("<![endif]-->")) { // downlevel-hidden
  99. begin := bytes.IndexByte(t.Data, '>') + 1
  100. end := len(t.Data) - len("<![endif]-->")
  101. if begin < end {
  102. w.Write(t.Data[:begin])
  103. if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil {
  104. return minify.UpdateErrorPosition(err, z, t.Offset)
  105. }
  106. w.Write(t.Data[end:])
  107. } else {
  108. w.Write(t.Data) // malformed
  109. }
  110. } else {
  111. w.Write(t.Data) // downlevel-revealed or short downlevel-hidden
  112. }
  113. } else if 1 < len(t.Text) && t.Text[0] == '#' {
  114. // SSI tags
  115. w.Write(t.Data)
  116. }
  117. }
  118. case html.SvgToken:
  119. if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
  120. if err != minify.ErrNotExist {
  121. return minify.UpdateErrorPosition(err, z, t.Offset)
  122. }
  123. w.Write(t.Data)
  124. }
  125. omitSpace = false
  126. case html.MathToken:
  127. if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
  128. if err != minify.ErrNotExist {
  129. return minify.UpdateErrorPosition(err, z, t.Offset)
  130. }
  131. w.Write(t.Data)
  132. }
  133. omitSpace = false
  134. case html.TextToken:
  135. if t.HasTemplate {
  136. w.Write(t.Data)
  137. } else if rawTagHash != 0 {
  138. if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe {
  139. var mimetype []byte
  140. var params map[string]string
  141. if rawTagHash == Iframe {
  142. mimetype = htmlMimeBytes
  143. } else if 0 < len(rawTagMediatype) {
  144. mimetype, params = parse.Mediatype(rawTagMediatype)
  145. } else if rawTagHash == Script {
  146. mimetype = jsMimeBytes
  147. } else if rawTagHash == Style {
  148. mimetype = cssMimeBytes
  149. }
  150. if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil {
  151. if err != minify.ErrNotExist {
  152. return minify.UpdateErrorPosition(err, z, t.Offset)
  153. }
  154. w.Write(t.Data)
  155. }
  156. } else {
  157. w.Write(t.Data)
  158. }
  159. } else if inPre {
  160. w.Write(t.Data)
  161. } else {
  162. t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap)
  163. // whitespace removal; trim left
  164. if omitSpace && parse.IsWhitespace(t.Data[0]) {
  165. t.Data = t.Data[1:]
  166. }
  167. // whitespace removal; trim right
  168. omitSpace = false
  169. if len(t.Data) == 0 {
  170. omitSpace = true
  171. } else if parse.IsWhitespace(t.Data[len(t.Data)-1]) {
  172. omitSpace = true
  173. i := 0
  174. for {
  175. next := tb.Peek(i)
  176. // trim if EOF, text token with leading whitespace or block token
  177. if next.TokenType == html.ErrorToken {
  178. t.Data = t.Data[:len(t.Data)-1]
  179. omitSpace = false
  180. break
  181. } else if next.TokenType == html.TextToken && !parse.IsAllWhitespace(next.Data) {
  182. // stop looking when text encountered
  183. break
  184. } else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken {
  185. if o.KeepWhitespace {
  186. break
  187. }
  188. // remove when followed by a block tag
  189. if next.Traits&blockTag != 0 {
  190. t.Data = t.Data[:len(t.Data)-1]
  191. omitSpace = false
  192. break
  193. } else if next.TokenType == html.StartTagToken {
  194. break
  195. }
  196. }
  197. i++
  198. }
  199. }
  200. w.Write(t.Data)
  201. }
  202. case html.StartTagToken, html.EndTagToken:
  203. rawTagHash = 0
  204. hasAttributes := false
  205. if t.TokenType == html.StartTagToken {
  206. if next := tb.Peek(0); next.TokenType == html.AttributeToken {
  207. hasAttributes = true
  208. }
  209. if t.Traits&rawTag != 0 {
  210. // ignore empty script and style tags
  211. if !hasAttributes && (t.Hash == Script || t.Hash == Style) {
  212. if next := tb.Peek(1); next.TokenType == html.EndTagToken {
  213. tb.Shift()
  214. tb.Shift()
  215. break
  216. }
  217. }
  218. rawTagHash = t.Hash
  219. rawTagMediatype = nil
  220. // do not minify content of <style amp-boilerplate>
  221. if hasAttributes && t.Hash == Style {
  222. if attrs := tb.Attributes(Amp_Boilerplate); attrs[0] != nil {
  223. rawTagHash = 0
  224. }
  225. }
  226. }
  227. } else if t.Hash == Template {
  228. omitSpace = true // EndTagToken
  229. }
  230. if t.Hash == Pre {
  231. inPre = t.TokenType == html.StartTagToken
  232. }
  233. // remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set
  234. if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == Html || t.Hash == Head || t.Hash == Body) || t.Hash == Colgroup) {
  235. break
  236. } else if t.TokenType == html.EndTagToken {
  237. omitEndTag := false
  238. if !o.KeepEndTags {
  239. if t.Hash == Thead || t.Hash == Tbody || t.Hash == Tfoot || t.Hash == Tr || t.Hash == Th ||
  240. t.Hash == Td || t.Hash == Option || t.Hash == Dd || t.Hash == Dt || t.Hash == Li ||
  241. t.Hash == Rb || t.Hash == Rt || t.Hash == Rtc || t.Hash == Rp {
  242. omitEndTag = true // omit end tags
  243. } else if t.Hash == P {
  244. i := 0
  245. for {
  246. next := tb.Peek(i)
  247. i++
  248. // continue if text token is empty or whitespace
  249. if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) {
  250. continue
  251. }
  252. if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 {
  253. omitEndTag = true // omit p end tag
  254. }
  255. break
  256. }
  257. } else if t.Hash == Optgroup {
  258. i := 0
  259. for {
  260. next := tb.Peek(i)
  261. i++
  262. // continue if text token
  263. if next.TokenType == html.TextToken {
  264. continue
  265. }
  266. if next.TokenType == html.ErrorToken || next.Hash != Option {
  267. omitEndTag = true // omit optgroup end tag
  268. }
  269. break
  270. }
  271. }
  272. }
  273. if !omitEndTag {
  274. if o.KeepWhitespace || t.Traits&objectTag != 0 {
  275. omitSpace = false
  276. } else if t.Traits&blockTag != 0 {
  277. omitSpace = true // omit spaces after block elements
  278. }
  279. if 3+len(t.Text) < len(t.Data) {
  280. t.Data[2+len(t.Text)] = '>'
  281. t.Data = t.Data[:3+len(t.Text)]
  282. }
  283. w.Write(t.Data)
  284. }
  285. // skip text in select and optgroup tags
  286. if t.Hash == Option || t.Hash == Optgroup {
  287. if next := tb.Peek(0); next.TokenType == html.TextToken {
  288. tb.Shift()
  289. }
  290. }
  291. break
  292. }
  293. if o.KeepWhitespace || t.Traits&objectTag != 0 {
  294. omitSpace = false
  295. } else if t.Traits&blockTag != 0 {
  296. omitSpace = true // omit spaces after block elements
  297. }
  298. w.Write(t.Data)
  299. if hasAttributes {
  300. if t.Hash == Meta {
  301. attrs := tb.Attributes(Content, Http_Equiv, Charset, Name)
  302. if content := attrs[0]; content != nil {
  303. if httpEquiv := attrs[1]; httpEquiv != nil {
  304. httpEquiv.AttrVal = parse.TrimWhitespace(httpEquiv.AttrVal)
  305. if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) {
  306. content.AttrVal = minify.Mediatype(content.AttrVal)
  307. if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) {
  308. httpEquiv.Text = nil
  309. content.Text = []byte("charset")
  310. content.Hash = Charset
  311. content.AttrVal = []byte("utf-8")
  312. }
  313. }
  314. }
  315. if name := attrs[3]; name != nil {
  316. name.AttrVal = parse.TrimWhitespace(name.AttrVal)
  317. if parse.EqualFold(name.AttrVal, []byte("keywords")) {
  318. content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(", "), []byte(","))
  319. } else if parse.EqualFold(name.AttrVal, []byte("viewport")) {
  320. content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(" "), []byte(""))
  321. for i := 0; i < len(content.AttrVal); i++ {
  322. if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) {
  323. i++
  324. if n := parse.Number(content.AttrVal[i:]); 0 < n {
  325. minNum := minify.Number(content.AttrVal[i:i+n], -1)
  326. if len(minNum) < n {
  327. copy(content.AttrVal[i:i+len(minNum)], minNum)
  328. copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:])
  329. content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n]
  330. }
  331. i += len(minNum)
  332. }
  333. i-- // mitigate for-loop increase
  334. }
  335. }
  336. }
  337. }
  338. }
  339. } else if t.Hash == Script {
  340. attrs := tb.Attributes(Src, Charset)
  341. if attrs[0] != nil && attrs[1] != nil {
  342. attrs[1].Text = nil
  343. }
  344. } else if t.Hash == Input {
  345. attrs := tb.Attributes(Type, Value)
  346. if t, value := attrs[0], attrs[1]; t != nil && value != nil {
  347. isRadio := parse.EqualFold(t.AttrVal, radioBytes)
  348. if !isRadio && len(value.AttrVal) == 0 {
  349. value.Text = nil
  350. } else if isRadio && parse.EqualFold(value.AttrVal, onBytes) {
  351. value.Text = nil
  352. }
  353. }
  354. } else if t.Hash == A {
  355. attrs := tb.Attributes(Id, Name)
  356. if id, name := attrs[0], attrs[1]; id != nil && name != nil {
  357. if bytes.Equal(id.AttrVal, name.AttrVal) {
  358. name.Text = nil
  359. }
  360. }
  361. }
  362. // write attributes
  363. for {
  364. attr := *tb.Shift()
  365. if attr.TokenType != html.AttributeToken {
  366. break
  367. } else if attr.Text == nil {
  368. continue // removed attribute
  369. } else if attr.HasTemplate {
  370. w.Write(attr.Data)
  371. continue // don't minify attributes that contain templates
  372. }
  373. val := attr.AttrVal
  374. if attr.Traits&trimAttr != 0 {
  375. val = parse.ReplaceMultipleWhitespaceAndEntities(val, EntitiesMap, nil)
  376. val = parse.TrimWhitespace(val)
  377. } else {
  378. val = parse.ReplaceEntities(val, EntitiesMap, nil)
  379. }
  380. if t.Traits != 0 {
  381. if len(val) == 0 && (attr.Hash == Class ||
  382. attr.Hash == Dir ||
  383. attr.Hash == Id ||
  384. attr.Hash == Name ||
  385. attr.Hash == Action && t.Hash == Form) {
  386. continue // omit empty attribute values
  387. }
  388. if rawTagHash != 0 && attr.Hash == Type {
  389. rawTagMediatype = parse.Copy(val)
  390. }
  391. if attr.Hash == Enctype ||
  392. attr.Hash == Formenctype ||
  393. attr.Hash == Accept ||
  394. attr.Hash == Type && (t.Hash == A || t.Hash == Link || t.Hash == Embed || t.Hash == Object || t.Hash == Source || t.Hash == Script) {
  395. val = minify.Mediatype(val)
  396. }
  397. // default attribute values can be omitted
  398. if !o.KeepDefaultAttrVals && (attr.Hash == Type && (t.Hash == Script && jsMimetypes[string(parse.ToLower(parse.Copy(val)))] ||
  399. t.Hash == Style && parse.EqualFold(val, cssMimeBytes) ||
  400. t.Hash == Link && parse.EqualFold(val, cssMimeBytes) ||
  401. t.Hash == Input && parse.EqualFold(val, textBytes) ||
  402. t.Hash == Button && parse.EqualFold(val, submitBytes)) ||
  403. attr.Hash == Method && parse.EqualFold(val, getBytes) ||
  404. attr.Hash == Enctype && parse.EqualFold(val, formMimeBytes) ||
  405. attr.Hash == Colspan && bytes.Equal(val, oneBytes) ||
  406. attr.Hash == Rowspan && bytes.Equal(val, oneBytes) ||
  407. attr.Hash == Shape && parse.EqualFold(val, rectBytes) ||
  408. attr.Hash == Span && bytes.Equal(val, oneBytes) ||
  409. attr.Hash == Media && t.Hash == Style && parse.EqualFold(val, allBytes)) {
  410. continue
  411. }
  412. if attr.Hash == Style {
  413. // CSS minifier for attribute inline code
  414. val = parse.TrimWhitespace(val)
  415. attrMinifyBuffer.Reset()
  416. if err := m.MinifyMimetype(cssMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil {
  417. val = attrMinifyBuffer.Bytes()
  418. } else if err != minify.ErrNotExist {
  419. return minify.UpdateErrorPosition(err, z, attr.Offset)
  420. }
  421. if len(val) == 0 {
  422. continue
  423. }
  424. } else if 2 < len(attr.Text) && attr.Text[0] == 'o' && attr.Text[1] == 'n' {
  425. // JS minifier for attribute inline code
  426. val = parse.TrimWhitespace(val)
  427. if 11 <= len(val) && parse.EqualFold(val[:11], jsSchemeBytes) {
  428. val = val[11:]
  429. }
  430. attrMinifyBuffer.Reset()
  431. if err := m.MinifyMimetype(jsMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil {
  432. val = attrMinifyBuffer.Bytes()
  433. } else if err != minify.ErrNotExist {
  434. return minify.UpdateErrorPosition(err, z, attr.Offset)
  435. }
  436. if len(val) == 0 {
  437. continue
  438. }
  439. } else if attr.Traits&urlAttr != 0 { // anchors are already handled
  440. val = parse.TrimWhitespace(val)
  441. if 5 < len(val) {
  442. if parse.EqualFold(val[:4], httpBytes) {
  443. if val[4] == ':' {
  444. if m.URL != nil && m.URL.Scheme == "http" {
  445. val = val[5:]
  446. } else {
  447. parse.ToLower(val[:4])
  448. }
  449. } else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' {
  450. if m.URL != nil && m.URL.Scheme == "https" {
  451. val = val[6:]
  452. } else {
  453. parse.ToLower(val[:5])
  454. }
  455. }
  456. } else if parse.EqualFold(val[:5], dataSchemeBytes) {
  457. val = minify.DataURI(m, val)
  458. }
  459. }
  460. }
  461. }
  462. w.Write(spaceBytes)
  463. w.Write(attr.Text)
  464. if 0 < len(val) && attr.Traits&booleanAttr == 0 {
  465. w.Write(isBytes)
  466. // use double quotes for RDFa attributes
  467. isXML := attr.Hash == Vocab || attr.Hash == Typeof || attr.Hash == Property || attr.Hash == Resource || attr.Hash == Prefix || attr.Hash == Content || attr.Hash == About || attr.Hash == Rev || attr.Hash == Datatype || attr.Hash == Inlist
  468. // no quotes if possible, else prefer single or double depending on which occurs more often in value
  469. var quote byte
  470. if 0 < len(attr.Data) && (attr.Data[len(attr.Data)-1] == '\'' || attr.Data[len(attr.Data)-1] == '"') {
  471. quote = attr.Data[len(attr.Data)-1]
  472. }
  473. val = html.EscapeAttrVal(&attrByteBuffer, val, quote, o.KeepQuotes, isXML)
  474. w.Write(val)
  475. }
  476. }
  477. } else {
  478. _ = tb.Shift() // StartTagClose
  479. }
  480. w.Write(gtBytes)
  481. // skip text in select and optgroup tags
  482. if t.Hash == Select || t.Hash == Optgroup {
  483. if next := tb.Peek(0); next.TokenType == html.TextToken {
  484. tb.Shift()
  485. }
  486. }
  487. // keep space after phrasing tags (<i>, <span>, ...) FontAwesome etc.
  488. if t.TokenType == html.StartTagToken && t.Traits == normalTag {
  489. if next := tb.Peek(0); next.Hash == t.Hash && next.TokenType == html.EndTagToken {
  490. omitSpace = false
  491. }
  492. }
  493. }
  494. }
  495. }