list.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go
  5. // Package publicsuffix provides a public suffix list based on data from
  6. // https://publicsuffix.org/
  7. //
  8. // A public suffix is one under which Internet users can directly register
  9. // names. It is related to, but different from, a TLD (top level domain).
  10. //
  11. // "com" is a TLD (top level domain). Top level means it has no dots.
  12. //
  13. // "com" is also a public suffix. Amazon and Google have registered different
  14. // siblings under that domain: "amazon.com" and "google.com".
  15. //
  16. // "au" is another TLD, again because it has no dots. But it's not "amazon.au".
  17. // Instead, it's "amazon.com.au".
  18. //
  19. // "com.au" isn't an actual TLD, because it's not at the top level (it has
  20. // dots). But it is an eTLD (effective TLD), because that's the branching point
  21. // for domain name registrars.
  22. //
  23. // Another name for "an eTLD" is "a public suffix". Often, what's more of
  24. // interest is the eTLD+1, or one more label than the public suffix. For
  25. // example, browsers partition read/write access to HTTP cookies according to
  26. // the eTLD+1. Web pages served from "amazon.com.au" can't read cookies from
  27. // "google.com.au", but web pages served from "maps.google.com" can share
  28. // cookies from "www.google.com", so you don't have to sign into Google Maps
  29. // separately from signing into Google Web Search. Note that all four of those
  30. // domains have 3 labels and 2 dots. The first two domains are each an eTLD+1,
  31. // the last two are not (but share the same eTLD+1: "google.com").
  32. //
  33. // All of these domains have the same eTLD+1:
  34. // - "www.books.amazon.co.uk"
  35. // - "books.amazon.co.uk"
  36. // - "amazon.co.uk"
  37. //
  38. // Specifically, the eTLD+1 is "amazon.co.uk", because the eTLD is "co.uk".
  39. //
  40. // There is no closed form algorithm to calculate the eTLD of a domain.
  41. // Instead, the calculation is data driven. This package provides a
  42. // pre-compiled snapshot of Mozilla's PSL (Public Suffix List) data at
  43. // https://publicsuffix.org/
  44. package publicsuffix // import "golang.org/x/net/publicsuffix"
  45. // TODO: specify case sensitivity and leading/trailing dot behavior for
  46. // func PublicSuffix and func EffectiveTLDPlusOne.
  47. import (
  48. "fmt"
  49. "net/http/cookiejar"
  50. "strings"
  51. )
// List implements the cookiejar.PublicSuffixList interface by calling the
// PublicSuffix function.
var List cookiejar.PublicSuffixList = list{}

// list is the field-less type behind List. Its PublicSuffix method forwards to
// the package-level PublicSuffix function and its String method returns the
// package-level version value.
type list struct{}
  56. func (list) PublicSuffix(domain string) string {
  57. ps, _ := PublicSuffix(domain)
  58. return ps
  59. }
// String returns the package-level version value. version is not declared in
// this part of the file; presumably it describes the compiled-in list
// snapshot — confirm against its declaration.
func (list) String() string {
	return version
}
// PublicSuffix returns the public suffix of the domain using a copy of the
// publicsuffix.org database compiled into the library.
//
// icann is whether the public suffix is managed by the Internet Corporation
// for Assigned Names and Numbers. If not, the public suffix is either a
// privately managed domain (and in practice, not a top level domain) or an
// unmanaged top level domain (and not explicitly mentioned in the
// publicsuffix.org list). For example, "foo.org" and "foo.co.uk" are ICANN
// domains, "foo.dyndns.org" and "foo.blogspot.co.uk" are private domains and
// "cromulent" is an unmanaged top level domain.
//
// Use cases for distinguishing ICANN domains like "foo.com" from private
// domains like "foo.appspot.com" can be found at
// https://wiki.mozilla.org/Public_Suffix_List/Use_Cases
//
// Implementation notes: the domain is walked label by label from right to
// left. Each label is looked up with find, which compares strings with Go's
// == and < operators, so matching against the compiled-in table is
// case-sensitive. If no label matches a rule, the last label of domain is
// returned (the implicit "*" rule).
func PublicSuffix(domain string) (publicSuffix string, icann bool) {
	// [lo, hi) is the range of node indexes searched for the current label.
	// numTLD is declared outside this part of the file; presumably [0, numTLD)
	// covers the top-level entries.
	lo, hi := uint32(0), uint32(numTLD)
	// s is the part of domain not yet consumed (labels are stripped from the
	// right). suffix is the byte offset in domain at which the public suffix
	// starts; len(domain) means "nothing matched yet". icannNode holds the
	// ICANN bit of the most recently matched node. wildcard holds the wildcard
	// bit of that node's children entry.
	s, suffix, icannNode, wildcard := domain, len(domain), false, false
loop:
	for {
		// The current label is s[1+dot:]; dot is -1 when s is a single label.
		dot := strings.LastIndex(s, ".")
		if wildcard {
			// The previously matched node carries a wildcard, so the current
			// label joins the suffix whatever it is, and the ICANN flag is
			// that of the node carrying the wildcard.
			icann = icannNode
			suffix = 1 + dot
		}
		if lo == hi {
			// The previously matched node has no children to search.
			break
		}
		f := find(s[1+dot:], lo, hi)
		if f == notFound {
			break
		}
		// Unpack node f. Shifting away the text offset and length fields
		// leaves the ICANN bit followed by the index into children. The field
		// widths are the nodesBits* constants, declared outside this part of
		// the file.
		u := uint32(nodes.get(f) >> (nodesBitsTextOffset + nodesBitsTextLength))
		icannNode = u&(1<<nodesBitsICANN-1) != 0
		u >>= nodesBitsICANN
		// The children entry packs, from the low bits upwards: lo, hi, the
		// node type and the wildcard bit.
		u = children.get(u & (1<<nodesBitsChildren - 1))
		lo = u & (1<<childrenBitsLo - 1)
		u >>= childrenBitsLo
		hi = u & (1<<childrenBitsHi - 1)
		u >>= childrenBitsHi
		switch u & (1<<childrenBitsNodeType - 1) {
		case nodeTypeNormal:
			// The matched label is part of the public suffix.
			suffix = 1 + dot
		case nodeTypeException:
			// s ends with the matched label, so 1+len(s) is the offset just
			// past the dot that follows it: the matched label is left out of
			// the suffix and the walk stops here.
			suffix = 1 + len(s)
			break loop
		}
		// Any other node type leaves suffix unchanged.
		u >>= childrenBitsNodeType
		wildcard = u&(1<<childrenBitsWildcard-1) != 0
		if !wildcard {
			// With a wildcard, icann is instead assigned at the top of the
			// next iteration, once the wildcard has consumed a label.
			icann = icannNode
		}
		if dot == -1 {
			// s was the leftmost label; nothing remains to examine.
			break
		}
		s = s[:dot]
	}
	if suffix == len(domain) {
		// If no rules match, the prevailing rule is "*".
		return domain[1+strings.LastIndex(domain, "."):], icann
	}
	return domain[suffix:], icann
}
// notFound is the sentinel index that find returns when no node in the
// searched range has the requested label. Its value is the largest uint32.
const notFound uint32 = 1<<32 - 1
  126. // find returns the index of the node in the range [lo, hi) whose label equals
  127. // label, or notFound if there is no such node. The range is assumed to be in
  128. // strictly increasing node label order.
  129. func find(label string, lo, hi uint32) uint32 {
  130. for lo < hi {
  131. mid := lo + (hi-lo)/2
  132. s := nodeLabel(mid)
  133. if s < label {
  134. lo = mid + 1
  135. } else if s == label {
  136. return mid
  137. } else {
  138. hi = mid
  139. }
  140. }
  141. return notFound
  142. }
  143. // nodeLabel returns the label for the i'th node.
  144. func nodeLabel(i uint32) string {
  145. x := nodes.get(i)
  146. length := x & (1<<nodesBitsTextLength - 1)
  147. x >>= nodesBitsTextLength
  148. offset := x & (1<<nodesBitsTextOffset - 1)
  149. return text[offset : offset+length]
  150. }
  151. // EffectiveTLDPlusOne returns the effective top level domain plus one more
  152. // label. For example, the eTLD+1 for "foo.bar.golang.org" is "golang.org".
  153. func EffectiveTLDPlusOne(domain string) (string, error) {
  154. if strings.HasPrefix(domain, ".") || strings.HasSuffix(domain, ".") || strings.Contains(domain, "..") {
  155. return "", fmt.Errorf("publicsuffix: empty label in domain %q", domain)
  156. }
  157. suffix, _ := PublicSuffix(domain)
  158. if len(domain) <= len(suffix) {
  159. return "", fmt.Errorf("publicsuffix: cannot derive eTLD+1 for domain %q", domain)
  160. }
  161. i := len(domain) - len(suffix) - 1
  162. if domain[i] != '.' {
  163. return "", fmt.Errorf("publicsuffix: invalid public suffix %q for domain %q", suffix, domain)
  164. }
  165. return domain[1+strings.LastIndex(domain[:i], "."):], nil
  166. }
  167. type uint32String string
  168. func (u uint32String) get(i uint32) uint32 {
  169. off := i * 4
  170. return (uint32(u[off])<<24 |
  171. uint32(u[off+1])<<16 |
  172. uint32(u[off+2])<<8 |
  173. uint32(u[off+3]))
  174. }
  175. type uint40String string
  176. func (u uint40String) get(i uint32) uint64 {
  177. off := uint64(i * (nodesBits / 8))
  178. return uint64(u[off])<<32 |
  179. uint64(u[off+1])<<24 |
  180. uint64(u[off+2])<<16 |
  181. uint64(u[off+3])<<8 |
  182. uint64(u[off+4])
  183. }