sitemap.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. // Package sitemap implements the Sitemap Protocol.
  2. // Reference: https://www.sitemaps.org/protocol.html
  3. package sitemap
  4. import (
  5. "encoding/xml"
  6. "fmt"
  7. "log"
  8. "net/http"
  9. "strings"
  10. "time"
  11. )
  // MaxURLsPerSitemap is the maximum number of URL entries stored in a single sitemap.
  // Once a sitemap holds this many entries, the Builder starts a new one, and
  // Build then serves the sitemaps together with a sitemap index.
  // Defaults to 50000, as the Sitemap Protocol specifies.
  var MaxURLsPerSitemap = 50000
  // Valid values for URL.ChangeFreq.
  const (
  	Always  = "always"
  	Hourly  = "hourly"
  	Daily   = "daily"
  	Weekly  = "weekly"
  	Monthly = "monthly"
  	Yearly  = "yearly"
  	Never   = "never"
  )
  // URL is the parent tag for each URL entry.
  type URL struct {
  	// Loc is required. It defines the URL of the page.
  	// This URL must begin with the protocol (such as http) and end with a trailing slash,
  	// if your web server requires it. This value must be less than 2,048 characters.
  	// Read more at: https://www.sitemaps.org/protocol.html#location
  	Loc string `xml:"loc"`
  	// LastMod is optional. It is the date of last modification of the file.
  	// It is not marshaled directly (tag "-"); LastModStr carries its formatted value.
  	LastMod time.Time `xml:"-"`
  	// LastModStr is the formatted form of LastMod that is written to the XML.
  	// Do NOT set it directly: the internal `sitemap#Add` fills it from LastMod
  	// when LastMod is not the zero time.
  	//
  	// The alternative would be a pointer or a custom time marshaler,
  	// but that would hurt the API's expressiveness.
  	LastModStr string `xml:"lastmod,omitempty"`
  	// ChangeFreq is optional. Defines how frequently the page is likely to change.
  	// This value provides general information to search engines and may not correlate exactly to how often they crawl the page.
  	// Valid values are:
  	// "always"
  	// "hourly"
  	// "daily"
  	// "weekly"
  	// "monthly"
  	// "yearly"
  	// "never"
  	ChangeFreq string `xml:"changefreq,omitempty"`
  	// Priority is optional. It defines the priority of this URL relative to other URLs on your site.
  	// Valid values range from 0.0 to 1.0.
  	//
  	// The default priority of a page is 0.5.
  	Priority float32 `xml:"priority,omitempty"`
  	// Links is optional. Each entry is written as an "xhtml:link" child element
  	// of this URL. Use AddLink to append to it.
  	Links []Link `xml:"xhtml:link,omitempty"`
  }
  // AddLink appends the given link to this URL's Links.
  func (u *URL) AddLink(link Link) {
  	u.Links = append(u.Links, link)
  }
  // Link is the optional child element of a URL.
  // It can be used to list every alternate version of the page.
  //
  // Read more at: https://support.google.com/webmasters/answer/189077?hl=en.
  type Link struct {
  	// Rel is written as the "rel" attribute.
  	// Builder.URL sets it to "alternate" when it is empty and Hreflang is set.
  	Rel string `xml:"rel,attr"`
  	// Hreflang is written as the "hreflang" attribute.
  	Hreflang string `xml:"hreflang,attr"`
  	// Href is written as the "href" attribute.
  	// Builder.URL prefixes it with the Builder's start URL.
  	Href string `xml:"href,attr"`
  }
  const (
  	// xmlSchemaURL is the value of the "xmlns" attribute of the root element.
  	xmlSchemaURL = "http://www.sitemaps.org/schemas/sitemap/0.9"
  	// xmlnsXhtmlURL is the value of the "xmlns:xhtml" attribute,
  	// set on a sitemap that contains at least one URL with links.
  	xmlnsXhtmlURL = "http://www.w3.org/1999/xhtml"
  	// xmlTimeFormat is the layout used to format URL.LastMod.
  	xmlTimeFormat = "2006-01-02T15:04:05-07:00" // W3C Datetime.
  )
  // sitemap is the XML document of a single sitemap file;
  // it is marshaled as the "urlset" root element.
  type sitemap struct {
  	XMLName xml.Name `xml:"urlset"`
  	// Xmlns is the "xmlns" attribute; newSitemap sets it to xmlSchemaURL.
  	Xmlns string `xml:"xmlns,attr"`
  	// XmlnsXhtml is the "xmlns:xhtml" attribute, omitted when empty.
  	// Builder.URL sets it when a registered URL carries links.
  	XmlnsXhtml string `xml:"xmlns:xhtml,attr,omitempty"`
  	// URLs are the "url" child elements.
  	URLs []URL `xml:"url"`
  }
  82. func newSitemap() *sitemap {
  83. return &sitemap{
  84. Xmlns: xmlSchemaURL,
  85. }
  86. }
  87. func (s *sitemap) Add(url URL) {
  88. if !url.LastMod.IsZero() {
  89. url.LastModStr = url.LastMod.Format(xmlTimeFormat)
  90. }
  91. s.URLs = append(s.URLs, url)
  92. }
  // sitemapIndex is the XML document of a sitemap index file;
  // it is marshaled as the "sitemapindex" root element.
  type sitemapIndex struct {
  	XMLName xml.Name `xml:"sitemapindex"`
  	// Xmlns is the "xmlns" attribute; newSitemapIndex sets it to xmlSchemaURL.
  	Xmlns string `xml:"xmlns,attr"`
  	// XmlnsXhtml is the "xmlns:xhtml" attribute, omitted when empty.
  	XmlnsXhtml string `xml:"xmlns:xhtml,attr,omitempty"`
  	// URLs are the "sitemap" child elements, one per sitemap file.
  	// Builder.Build fills only their Loc.
  	URLs []URL `xml:"sitemap"`
  }
  99. func newSitemapIndex() *sitemapIndex {
  100. return &sitemapIndex{Xmlns: xmlSchemaURL}
  101. }
  // Builder is the sitemaps Builder.
  // Create one with New, register entries with URL and call Build once.
  type Builder struct {
  	// startURL is the base URL (scheme and host) that is prefixed
  	// to every registered Loc and link Href, and to the sitemap paths of the index.
  	startURL string
  	// currentIndex is the position, inside sitemaps, of the sitemap being filled.
  	currentIndex int
  	// sitemaps holds every sitemap; a new one is appended
  	// when the current one reaches MaxURLsPerSitemap entries.
  	sitemaps []*sitemap
  	// defaultLang is the "hreflang" of the self-referencing link that URL
  	// adds to entries with links; empty disables that link.
  	defaultLang string
  	// errorHandler receives errors raised by Build.
  	// Its result reports whether the error was handled.
  	errorHandler func(err error) (handled bool)
  }
  // DefaultLang is the default "hreflang" attribute of a self-included Link child element of URL.
  // A Builder created by New starts with this value; change it with Builder.DefaultLang.
  const DefaultLang = "en"
  112. // New returns a new sitemaps Builder.
  113. // Use its `Add` to add one or more urls and `Build` once.
  114. func New(startURL string) *Builder {
  115. return &Builder{
  116. startURL: withScheme(startURL),
  117. currentIndex: 0,
  118. sitemaps: []*sitemap{newSitemap()},
  119. defaultLang: DefaultLang,
  120. errorHandler: func(err error) bool {
  121. log.Fatal(err)
  122. return false
  123. },
  124. }
  125. }
  126. // ErrorHandler sets the error handler.
  127. func (b *Builder) ErrorHandler(fn func(err error) (handled bool)) *Builder {
  128. if fn == nil {
  129. fn = func(error) bool {
  130. return true
  131. }
  132. }
  133. b.errorHandler = fn
  134. return b
  135. }
  // DefaultLang sets the default "hreflang" attribute of a self-included URL Link
  // and returns the Builder.
  // With an empty langCode, URL does not add the self-referencing link.
  func (b *Builder) DefaultLang(langCode string) *Builder {
  	b.defaultLang = langCode
  	return b
  }
  // alternateLinkAttrName is the "rel" value that marks a Link
  // as an alternate version of its URL.
  const alternateLinkAttrName = "alternate"
  142. // URL adds a location of a Sitemap file determines the set of URLs that can be included in that Sitemap.
  143. func (b *Builder) URL(sitemapURLs ...URL) *Builder {
  144. for _, sitemapURL := range sitemapURLs {
  145. if sitemapURL.Loc == "" {
  146. continue
  147. }
  148. sitemapURL.Loc = concat(b.startURL, sitemapURL.Loc)
  149. sm := b.sitemaps[b.currentIndex]
  150. if len(sm.URLs) >= MaxURLsPerSitemap {
  151. // If static pages are more than 50000 then
  152. // a sitemap index should be served because each sitemap.xml has a limit of 50000 url elements.
  153. sm = newSitemap()
  154. b.currentIndex++
  155. b.sitemaps = append(b.sitemaps, sm)
  156. }
  157. if len(sitemapURL.Links) > 0 {
  158. sm.XmlnsXhtml = xmlnsXhtmlURL
  159. hasItself := false
  160. for idx, link := range sitemapURL.Links {
  161. link.Href = concat(b.startURL, link.Href)
  162. if link.Rel == "" && link.Hreflang != "" {
  163. link.Rel = alternateLinkAttrName
  164. }
  165. if !hasItself {
  166. // Check if the user provided the translated link to that URL itself.
  167. // the links, if not empty, should provide the URL loc itself.
  168. if link.Rel == alternateLinkAttrName && link.Hreflang == b.defaultLang {
  169. hasItself = true
  170. }
  171. }
  172. sitemapURL.Links[idx] = link
  173. }
  174. if !hasItself && b.defaultLang != "" {
  175. sitemapURL.AddLink(Link{
  176. Rel: alternateLinkAttrName,
  177. Hreflang: b.defaultLang,
  178. Href: sitemapURL.Loc,
  179. })
  180. }
  181. }
  182. sm.Add(sitemapURL)
  183. }
  184. return b
  185. }
  // xmlHeaderElem is the XML declaration that newSitemapHandler
  // puts in front of every marshaled document.
  var xmlHeaderElem = []byte("<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\"?>")
  // Handler is a sitemap handler. The result of `Builder#Build`.
  // It serves its Content through ServeHTTP.
  type Handler struct {
  	// Content is the raw xml data, XML declaration included.
  	Content []byte
  	// Pos is the position, starting from 0.
  	// Position 0 belongs to the handler served at "/sitemap.xml".
  	Pos int
  	// Path is the request path that this handler should be listening on.
  	Path string
  	// IsSitemapIndex reports whether this handler serves a Sitemap Index File.
  	IsSitemapIndex bool
  }
  // indexPath is the request path of the handler at position 0.
  const indexPath = "/sitemap.xml"
  199. func newSitemapHandler(v interface{}, pos int) (*Handler, error) {
  200. b, err := xml.Marshal(v)
  201. if err != nil {
  202. return nil, err
  203. }
  204. sitemapContent := append(xmlHeaderElem, b...)
  205. path := indexPath
  206. if pos > 0 {
  207. path = fmt.Sprintf("/sitemap%d.xml", pos)
  208. }
  209. _, isSitemapIndex := v.(*sitemapIndex)
  210. handler := &Handler{
  211. Content: sitemapContent,
  212. Pos: pos,
  213. Path: path,
  214. IsSitemapIndex: isSitemapIndex,
  215. }
  216. return handler, nil
  217. }
  218. func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  219. w.Header().Set("Content-Type", "text/xml; charset=utf-8")
  220. w.Write(h.Content)
  221. }
  222. // Build builds the sitemaps based on previous `Builder#URL` calls.
  223. // It returns a list of sitemap Handlers. Each `Handler` is compatible with `net/http#Handler`
  224. // and it contains further like the `Path`, `Pos` and if it's a sitemaps index handler.
  225. func (b *Builder) Build() (handlers []*Handler) {
  226. pos := 0
  227. if len(b.sitemaps) == 1 {
  228. // write single sitemap.
  229. handler, err := newSitemapHandler(b.sitemaps[pos], pos)
  230. if err != nil {
  231. b.errorHandler(err)
  232. } else {
  233. handlers = append(handlers, handler)
  234. }
  235. return
  236. }
  237. index := newSitemapIndex()
  238. for _, sitemap := range b.sitemaps {
  239. pos++
  240. handler, err := newSitemapHandler(sitemap, pos)
  241. if err != nil {
  242. pos--
  243. if !b.errorHandler(err) {
  244. break
  245. }
  246. continue
  247. }
  248. index.URLs = append(index.URLs, URL{
  249. Loc: b.startURL + handler.Path,
  250. })
  251. handlers = append(handlers, handler)
  252. }
  253. indexHandler, err := newSitemapHandler(index, 0)
  254. if err != nil {
  255. if !b.errorHandler(err) {
  256. return
  257. }
  258. }
  259. // prepend index sitemap.
  260. handlers = append([]*Handler{indexHandler}, handlers...)
  261. return
  262. }
  // withScheme normalizes the start URL s.
  // An empty s yields "http://localhost:8080". Otherwise "https://" is put in
  // front of s when it starts with neither "http://" nor "https://", and a
  // single trailing slash, if present, is removed.
  func withScheme(s string) string {
  	if s == "" {
  		return "http://localhost:8080"
  	}

  	hasScheme := strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://")
  	if !hasScheme {
  		s = "https://" + s
  	}

  	return strings.TrimSuffix(s, "/")
  }
  // concat joins startURL and loc with exactly one slash between them,
  // adding the slash when loc does not start with one.
  // An empty loc yields startURL itself instead of panicking on loc[0].
  func concat(startURL, loc string) string {
  	switch {
  	case loc == "":
  		return startURL
  	case loc[0] == '/':
  		return startURL + loc
  	default:
  		return startURL + "/" + loc
  	}
  }
  280. }