123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338 |
- // Package sitemap implements the Sitemap Protocol.
- // Reference: https://www.sitemaps.org/protocol.html
- package sitemap
- import (
- "encoding/xml"
- "fmt"
- "log"
- "net/http"
- "strings"
- "time"
- )
// MaxURLsPerSitemap is the maximum number of URLs stored in a single sitemap.
// When more URLs than this are registered, the Builder automatically splits
// them across several sitemaps and serves a sitemap index.
// Defaults to 50000, as the Sitemap Protocol specifies.
var MaxURLsPerSitemap = 50000
// Valid values for URL.ChangeFreq.
const (
	Always  = "always"
	Hourly  = "hourly"
	Daily   = "daily"
	Weekly  = "weekly"
	Monthly = "monthly"
	Yearly  = "yearly"
	Never   = "never"
)
// URL is the parent tag for each URL entry.
type URL struct {
	// Loc is required. It defines the URL of the page.
	// This URL must begin with the protocol (such as http) and end with a trailing slash,
	// if your web server requires it. This value must be less than 2,048 characters.
	// Read more at: https://www.sitemaps.org/protocol.html#location
	Loc string `xml:"loc"`
	// LastMod is optional. It is the date of last modification of the file.
	// It is not marshaled directly; see LastModStr.
	LastMod time.Time `xml:"-"`
	// LastModStr is the formatted form of LastMod written to the "lastmod" element.
	// Do NOT set it directly: it is filled in from LastMod when the URL is added
	// to a sitemap (see the internal `sitemap#Add`).
	// The alternatives would be a pointer or a custom time marshaler,
	// but those would hurt the API's expressiveness.
	LastModStr string `xml:"lastmod,omitempty"`
	// ChangeFreq is optional. Defines how frequently the page is likely to change.
	// This value provides general information to search engines and may not correlate exactly to how often they crawl the page.
	// Valid values are:
	// "always"
	// "hourly"
	// "daily"
	// "weekly"
	// "monthly"
	// "yearly"
	// "never"
	ChangeFreq string `xml:"changefreq,omitempty"`
	// Priority is optional. It defines the priority of this URL relative to other URLs on your site.
	// Valid values range from 0.0 to 1.0.
	//
	// The default priority of a page is 0.5.
	Priority float32 `xml:"priority,omitempty"`

	// Links is optional. Each entry is written as an "xhtml:link" child element.
	// See Link.
	Links []Link `xml:"xhtml:link,omitempty"`
}
- // AddLink adds a link to this URL.
- func (u *URL) AddLink(link Link) {
- u.Links = append(u.Links, link)
- }
// Link is the optional child element of a URL.
// It can be used to list every alternate version of the page.
// All three fields are marshaled as XML attributes.
//
// Read more at: https://support.google.com/webmasters/answer/189077?hl=en.
type Link struct {
	// Rel is the "rel" attribute. The Builder sets it to "alternate"
	// when it is empty and Hreflang is not.
	Rel string `xml:"rel,attr"`
	// Hreflang is the "hreflang" attribute, the language code of the linked version.
	Hreflang string `xml:"hreflang,attr"`
	// Href is the "href" attribute. The Builder prefixes it with its start URL.
	Href string `xml:"href,attr"`
}
const (
	// xmlSchemaURL is the value of the "xmlns" attribute of sitemap and sitemap index roots.
	xmlSchemaURL = "http://www.sitemaps.org/schemas/sitemap/0.9"
	// xmlnsXhtmlURL is the value of the "xmlns:xhtml" attribute, set when a URL carries links.
	xmlnsXhtmlURL = "http://www.w3.org/1999/xhtml"
	// xmlTimeFormat is the layout used to format URL.LastMod.
	xmlTimeFormat = "2006-01-02T15:04:05-07:00" // W3C Datetime.
)
// sitemap is the XML model of a single sitemap file:
// a "urlset" root element holding one "url" element per registered URL.
type sitemap struct {
	XMLName xml.Name `xml:"urlset"`
	// Xmlns is the sitemap schema namespace.
	Xmlns string `xml:"xmlns,attr"`
	// XmlnsXhtml is omitted from the output while empty;
	// the Builder sets it once a URL with links is added.
	XmlnsXhtml string `xml:"xmlns:xhtml,attr,omitempty"`
	// URLs are the entries of this sitemap.
	URLs []URL `xml:"url"`
}
- func newSitemap() *sitemap {
- return &sitemap{
- Xmlns: xmlSchemaURL,
- }
- }
- func (s *sitemap) Add(url URL) {
- if !url.LastMod.IsZero() {
- url.LastModStr = url.LastMod.Format(xmlTimeFormat)
- }
- s.URLs = append(s.URLs, url)
- }
// sitemapIndex is the XML model of a sitemap index file:
// a "sitemapindex" root element holding one "sitemap" element per sitemap file.
type sitemapIndex struct {
	XMLName xml.Name `xml:"sitemapindex"`
	// Xmlns is the sitemap schema namespace.
	Xmlns string `xml:"xmlns,attr"`
	// XmlnsXhtml is omitted from the output while empty.
	XmlnsXhtml string `xml:"xmlns:xhtml,attr,omitempty"`
	// URLs are the index entries; they reuse the URL type,
	// each one is marshaled as a "sitemap" element.
	URLs []URL `xml:"sitemap"`
}
- func newSitemapIndex() *sitemapIndex {
- return &sitemapIndex{Xmlns: xmlSchemaURL}
- }
// Builder is the sitemaps Builder.
// Create one with `New`, register URLs with `URL` and call `Build` once.
type Builder struct {
	// startURL is prepended to every URL location and link href.
	startURL string
	// currentIndex is the position, inside sitemaps, of the sitemap being filled.
	currentIndex int
	// sitemaps holds every sitemap created so far.
	sitemaps []*sitemap
	// defaultLang is the "hreflang" of the self-included alternate link.
	defaultLang string
	// errorHandler is called by Build with any error raised while
	// rendering a sitemap; it reports whether the error was handled.
	errorHandler func(err error) (handled bool)
}
// DefaultLang is the default "hreflang" attribute of a self-included Link child element of URL.
// A Builder created by `New` starts with this value.
const DefaultLang = "en"
- // New returns a new sitemaps Builder.
- // Use its `Add` to add one or more urls and `Build` once.
- func New(startURL string) *Builder {
- return &Builder{
- startURL: withScheme(startURL),
- currentIndex: 0,
- sitemaps: []*sitemap{newSitemap()},
- defaultLang: DefaultLang,
- errorHandler: func(err error) bool {
- log.Fatal(err)
- return false
- },
- }
- }
- // ErrorHandler sets the error handler.
- func (b *Builder) ErrorHandler(fn func(err error) (handled bool)) *Builder {
- if fn == nil {
- fn = func(error) bool {
- return true
- }
- }
- b.errorHandler = fn
- return b
- }
- // DefaultLang sets the default "hreflang" attribute of a self-included URL Link.
- func (b *Builder) DefaultLang(langCode string) *Builder {
- b.defaultLang = langCode
- return b
- }
// alternateLinkAttrName is the "rel" attribute value of a link
// that points to an alternate version of a page.
const alternateLinkAttrName = "alternate"
- // URL adds a location of a Sitemap file determines the set of URLs that can be included in that Sitemap.
- func (b *Builder) URL(sitemapURLs ...URL) *Builder {
- for _, sitemapURL := range sitemapURLs {
- if sitemapURL.Loc == "" {
- continue
- }
- sitemapURL.Loc = concat(b.startURL, sitemapURL.Loc)
- sm := b.sitemaps[b.currentIndex]
- if len(sm.URLs) >= MaxURLsPerSitemap {
- // If static pages are more than 50000 then
- // a sitemap index should be served because each sitemap.xml has a limit of 50000 url elements.
- sm = newSitemap()
- b.currentIndex++
- b.sitemaps = append(b.sitemaps, sm)
- }
- if len(sitemapURL.Links) > 0 {
- sm.XmlnsXhtml = xmlnsXhtmlURL
- hasItself := false
- for idx, link := range sitemapURL.Links {
- link.Href = concat(b.startURL, link.Href)
- if link.Rel == "" && link.Hreflang != "" {
- link.Rel = alternateLinkAttrName
- }
- if !hasItself {
- // Check if the user provided the translated link to that URL itself.
- // the links, if not empty, should provide the URL loc itself.
- if link.Rel == alternateLinkAttrName && link.Hreflang == b.defaultLang {
- hasItself = true
- }
- }
- sitemapURL.Links[idx] = link
- }
- if !hasItself && b.defaultLang != "" {
- sitemapURL.AddLink(Link{
- Rel: alternateLinkAttrName,
- Hreflang: b.defaultLang,
- Href: sitemapURL.Loc,
- })
- }
- }
- sm.Add(sitemapURL)
- }
- return b
- }
// xmlHeaderElem is the XML declaration placed before the marshaled content
// of every sitemap and sitemap index.
var xmlHeaderElem = []byte("<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\"?>")
// Handler is a sitemap handler. The result of `Builder#Build`.
// It serves its Content as XML through `ServeHTTP`.
type Handler struct {
	// Content is the raw xml data, XML declaration included.
	Content []byte
	// Pos is the position, starting from 0.
	Pos int
	// Path is the request path that this handler should be listening on:
	// "/sitemap.xml" when Pos is 0, "/sitemap<Pos>.xml" otherwise.
	Path string
	// IsSitemapIndex reports whether this handler serves a Sitemap Index File.
	IsSitemapIndex bool
}
// indexPath is the request path of the handler at position 0.
const indexPath = "/sitemap.xml"
- func newSitemapHandler(v interface{}, pos int) (*Handler, error) {
- b, err := xml.Marshal(v)
- if err != nil {
- return nil, err
- }
- sitemapContent := append(xmlHeaderElem, b...)
- path := indexPath
- if pos > 0 {
- path = fmt.Sprintf("/sitemap%d.xml", pos)
- }
- _, isSitemapIndex := v.(*sitemapIndex)
- handler := &Handler{
- Content: sitemapContent,
- Pos: pos,
- Path: path,
- IsSitemapIndex: isSitemapIndex,
- }
- return handler, nil
- }
- func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
- w.Header().Set("Content-Type", "text/xml; charset=utf-8")
- w.Write(h.Content)
- }
- // Build builds the sitemaps based on previous `Builder#URL` calls.
- // It returns a list of sitemap Handlers. Each `Handler` is compatible with `net/http#Handler`
- // and it contains further like the `Path`, `Pos` and if it's a sitemaps index handler.
- func (b *Builder) Build() (handlers []*Handler) {
- pos := 0
- if len(b.sitemaps) == 1 {
- // write single sitemap.
- handler, err := newSitemapHandler(b.sitemaps[pos], pos)
- if err != nil {
- b.errorHandler(err)
- } else {
- handlers = append(handlers, handler)
- }
- return
- }
- index := newSitemapIndex()
- for _, sitemap := range b.sitemaps {
- pos++
- handler, err := newSitemapHandler(sitemap, pos)
- if err != nil {
- pos--
- if !b.errorHandler(err) {
- break
- }
- continue
- }
- index.URLs = append(index.URLs, URL{
- Loc: b.startURL + handler.Path,
- })
- handlers = append(handlers, handler)
- }
- indexHandler, err := newSitemapHandler(index, 0)
- if err != nil {
- if !b.errorHandler(err) {
- return
- }
- }
- // prepend index sitemap.
- handlers = append([]*Handler{indexHandler}, handlers...)
- return
- }
// withScheme normalizes s into a start URL:
// an empty s becomes "http://localhost:8080",
// "https://" is prepended when s starts with neither "http://" nor "https://",
// and a single trailing slash, if any, is removed.
func withScheme(s string) string {
	if s == "" {
		return "http://localhost:8080"
	}

	hasScheme := strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://")
	if !hasScheme {
		s = "https://" + s
	}

	return strings.TrimSuffix(s, "/")
}
// concat joins startURL and loc with exactly one slash between them,
// given that startURL has no trailing slash.
// An empty loc yields startURL followed by a slash.
func concat(startURL, loc string) string {
	// HasPrefix, unlike indexing loc[0], is safe on an empty loc.
	if strings.HasPrefix(loc, "/") {
		return startURL + loc
	}

	return startURL + "/" + loc
}
|