sum_generic.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. // Copyright 2018 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This file provides the generic implementation of Sum and MAC. Other files
  5. // might provide optimized assembly implementations of some of this code.
  6. package poly1305
  7. import (
  8. "encoding/binary"
  9. "math/bits"
  10. )
  11. // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
  12. // for a 64 bytes message is approximately
  13. //
  14. // s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
  15. //
  16. // for some secret r and s. It can be computed sequentially like
  17. //
  18. // for len(msg) > 0:
  19. // h += read(msg, 16)
  20. // h *= r
  21. // h %= 2¹³⁰ - 5
  22. // return h + s
  23. //
  24. // All the complexity is about doing performant constant-time math on numbers
  25. // larger than any available numeric type.
  26. func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
  27. h := newMACGeneric(key)
  28. h.Write(msg)
  29. h.Sum(out)
  30. }
  31. func newMACGeneric(key *[32]byte) macGeneric {
  32. m := macGeneric{}
  33. initialize(key, &m.macState)
  34. return m
  35. }
  36. // macState holds numbers in saturated 64-bit little-endian limbs. That is,
  37. // the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
  38. type macState struct {
  39. // h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
  40. // can grow larger during and after rounds. It must, however, remain below
  41. // 2 * (2¹³⁰ - 5).
  42. h [3]uint64
  43. // r and s are the private key components.
  44. r [2]uint64
  45. s [2]uint64
  46. }
  47. type macGeneric struct {
  48. macState
  49. buffer [TagSize]byte
  50. offset int
  51. }
  52. // Write splits the incoming message into TagSize chunks, and passes them to
  53. // update. It buffers incomplete chunks.
  54. func (h *macGeneric) Write(p []byte) (int, error) {
  55. nn := len(p)
  56. if h.offset > 0 {
  57. n := copy(h.buffer[h.offset:], p)
  58. if h.offset+n < TagSize {
  59. h.offset += n
  60. return nn, nil
  61. }
  62. p = p[n:]
  63. h.offset = 0
  64. updateGeneric(&h.macState, h.buffer[:])
  65. }
  66. if n := len(p) - (len(p) % TagSize); n > 0 {
  67. updateGeneric(&h.macState, p[:n])
  68. p = p[n:]
  69. }
  70. if len(p) > 0 {
  71. h.offset += copy(h.buffer[h.offset:], p)
  72. }
  73. return nn, nil
  74. }
  75. // Sum flushes the last incomplete chunk from the buffer, if any, and generates
  76. // the MAC output. It does not modify its state, in order to allow for multiple
  77. // calls to Sum, even if no Write is allowed after Sum.
  78. func (h *macGeneric) Sum(out *[TagSize]byte) {
  79. state := h.macState
  80. if h.offset > 0 {
  81. updateGeneric(&state, h.buffer[:h.offset])
  82. }
  83. finalize(out, &state.h, &state.s)
  84. }
  85. // [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
  86. // clears some bits of the secret coefficient to make it possible to implement
  87. // multiplication more efficiently.
  88. const (
  89. rMask0 = 0x0FFFFFFC0FFFFFFF
  90. rMask1 = 0x0FFFFFFC0FFFFFFC
  91. )
  92. // initialize loads the 256-bit key into the two 128-bit secret values r and s.
  93. func initialize(key *[32]byte, m *macState) {
  94. m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
  95. m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
  96. m.s[0] = binary.LittleEndian.Uint64(key[16:24])
  97. m.s[1] = binary.LittleEndian.Uint64(key[24:32])
  98. }
  99. // uint128 holds a 128-bit number as two 64-bit limbs, for use with the
  100. // bits.Mul64 and bits.Add64 intrinsics.
  101. type uint128 struct {
  102. lo, hi uint64
  103. }
  104. func mul64(a, b uint64) uint128 {
  105. hi, lo := bits.Mul64(a, b)
  106. return uint128{lo, hi}
  107. }
  108. func add128(a, b uint128) uint128 {
  109. lo, c := bits.Add64(a.lo, b.lo, 0)
  110. hi, c := bits.Add64(a.hi, b.hi, c)
  111. if c != 0 {
  112. panic("poly1305: unexpected overflow")
  113. }
  114. return uint128{lo, hi}
  115. }
  116. func shiftRightBy2(a uint128) uint128 {
  117. a.lo = a.lo>>2 | (a.hi&3)<<62
  118. a.hi = a.hi >> 2
  119. return a
  120. }
  121. // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
  122. // 128 bits of message, it computes
  123. //
  124. // h₊ = (h + m) * r mod 2¹³⁰ - 5
  125. //
  126. // If the msg length is not a multiple of TagSize, it assumes the last
  127. // incomplete chunk is the final one.
  128. func updateGeneric(state *macState, msg []byte) {
  129. h0, h1, h2 := state.h[0], state.h[1], state.h[2]
  130. r0, r1 := state.r[0], state.r[1]
  131. for len(msg) > 0 {
  132. var c uint64
  133. // For the first step, h + m, we use a chain of bits.Add64 intrinsics.
  134. // The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
  135. // reduced at the end of the multiplication below.
  136. //
  137. // The spec requires us to set a bit just above the message size, not to
  138. // hide leading zeroes. For full chunks, that's 1 << 128, so we can just
  139. // add 1 to the most significant (2¹²⁸) limb, h2.
  140. if len(msg) >= TagSize {
  141. h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
  142. h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
  143. h2 += c + 1
  144. msg = msg[TagSize:]
  145. } else {
  146. var buf [TagSize]byte
  147. copy(buf[:], msg)
  148. buf[len(msg)] = 1
  149. h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
  150. h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
  151. h2 += c
  152. msg = nil
  153. }
  154. // Multiplication of big number limbs is similar to elementary school
  155. // columnar multiplication. Instead of digits, there are 64-bit limbs.
  156. //
  157. // We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
  158. //
  159. // h2 h1 h0 x
  160. // r1 r0 =
  161. // ----------------
  162. // h2r0 h1r0 h0r0 <-- individual 128-bit products
  163. // + h2r1 h1r1 h0r1
  164. // ------------------------
  165. // m3 m2 m1 m0 <-- result in 128-bit overlapping limbs
  166. // ------------------------
  167. // m3.hi m2.hi m1.hi m0.hi <-- carry propagation
  168. // + m3.lo m2.lo m1.lo m0.lo
  169. // -------------------------------
  170. // t4 t3 t2 t1 t0 <-- final result in 64-bit limbs
  171. //
  172. // The main difference from pen-and-paper multiplication is that we do
  173. // carry propagation in a separate step, as if we wrote two digit sums
  174. // at first (the 128-bit limbs), and then carried the tens all at once.
  175. h0r0 := mul64(h0, r0)
  176. h1r0 := mul64(h1, r0)
  177. h2r0 := mul64(h2, r0)
  178. h0r1 := mul64(h0, r1)
  179. h1r1 := mul64(h1, r1)
  180. h2r1 := mul64(h2, r1)
  181. // Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
  182. // top 4 bits cleared by rMask{0,1}, we know that their product is not going
  183. // to overflow 64 bits, so we can ignore the high part of the products.
  184. //
  185. // This also means that the product doesn't have a fifth limb (t4).
  186. if h2r0.hi != 0 {
  187. panic("poly1305: unexpected overflow")
  188. }
  189. if h2r1.hi != 0 {
  190. panic("poly1305: unexpected overflow")
  191. }
  192. m0 := h0r0
  193. m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
  194. m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
  195. m3 := h2r1
  196. t0 := m0.lo
  197. t1, c := bits.Add64(m1.lo, m0.hi, 0)
  198. t2, c := bits.Add64(m2.lo, m1.hi, c)
  199. t3, _ := bits.Add64(m3.lo, m2.hi, c)
  200. // Now we have the result as 4 64-bit limbs, and we need to reduce it
  201. // modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
  202. // a cheap partial reduction according to the reduction identity
  203. //
  204. // c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5
  205. //
  206. // because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
  207. // likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
  208. // assumptions we make about h in the rest of the code.
  209. //
  210. // See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
  211. // We split the final result at the 2¹³⁰ mark into h and cc, the carry.
  212. // Note that the carry bits are effectively shifted left by 2, in other
  213. // words, cc = c * 4 for the c in the reduction identity.
  214. h0, h1, h2 = t0, t1, t2&maskLow2Bits
  215. cc := uint128{t2 & maskNotLow2Bits, t3}
  216. // To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
  217. h0, c = bits.Add64(h0, cc.lo, 0)
  218. h1, c = bits.Add64(h1, cc.hi, c)
  219. h2 += c
  220. cc = shiftRightBy2(cc)
  221. h0, c = bits.Add64(h0, cc.lo, 0)
  222. h1, c = bits.Add64(h1, cc.hi, c)
  223. h2 += c
  224. // h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
  225. //
  226. // 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
  227. }
  228. state.h[0], state.h[1], state.h[2] = h0, h1, h2
  229. }
  230. const (
  231. maskLow2Bits uint64 = 0x0000000000000003
  232. maskNotLow2Bits uint64 = ^maskLow2Bits
  233. )
  234. // select64 returns x if v == 1 and y if v == 0, in constant time.
  235. func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
  236. // [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
  237. const (
  238. p0 = 0xFFFFFFFFFFFFFFFB
  239. p1 = 0xFFFFFFFFFFFFFFFF
  240. p2 = 0x0000000000000003
  241. )
  242. // finalize completes the modular reduction of h and computes
  243. //
  244. // out = h + s mod 2¹²⁸
  245. func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
  246. h0, h1, h2 := h[0], h[1], h[2]
  247. // After the partial reduction in updateGeneric, h might be more than
  248. // 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
  249. // in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
  250. // result if the subtraction underflows, and t otherwise.
  251. hMinusP0, b := bits.Sub64(h0, p0, 0)
  252. hMinusP1, b := bits.Sub64(h1, p1, b)
  253. _, b = bits.Sub64(h2, p2, b)
  254. // h = h if h < p else h - p
  255. h0 = select64(b, h0, hMinusP0)
  256. h1 = select64(b, h1, hMinusP1)
  257. // Finally, we compute the last Poly1305 step
  258. //
  259. // tag = h + s mod 2¹²⁸
  260. //
  261. // by just doing a wide addition with the 128 low bits of h and discarding
  262. // the overflow.
  263. h0, c := bits.Add64(h0, s[0], 0)
  264. h1, _ = bits.Add64(h1, s[1], c)
  265. binary.LittleEndian.PutUint64(out[0:8], h0)
  266. binary.LittleEndian.PutUint64(out[8:16], h1)
  267. }