// xxh32zero_arm.s — xxHash32 (seed 0) inner loops for 32-bit ARM,
// written in Go (Plan 9) assembler syntax.
// +build !noasm

#include "go_asm.h"
#include "textflag.h"

// Register allocation.
//
//   p      — pointer to the next input byte (advanced by the post-increment
//            loads in the round macros and loops below)
//   n      — bytes remaining; deliberately allowed to go negative so the
//            loops can test with SUB.S instead of a separate CMP
//   h      — running hash / result
//   v1-v4  — the four xxHash32 lane accumulators; v1 aliases h because the
//            merged hash is only formed after the lanes are dead
//   x1-x4  — scratch registers for loaded input words/bytes
#define p R0
#define n R1
#define h R2
#define v1 R2 // Alias for h.
#define v2 R3
#define v3 R4
#define v4 R5
#define x1 R6
#define x2 R7
#define x3 R8
#define x4 R9
// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
#define prime1r R11
#define prime2r R12
#define prime3r R3 // The rest can alias v{2-4}.
#define prime4r R4
#define prime5r R5
// Update round macros. These read from and increment p.
//
// round16aligned: one 16-byte xxHash32 round on 4-byte-aligned input.
// A single post-incremented load-multiple fetches four words, then each
// lane does vi = rotl32(vi + xi*prime2, 13) * prime1.
// (ARM's @> is rotate RIGHT, so @> 19 is a rotate left by 13; and
// MULA a, b, c, d computes d = a*b + c.)
// No comments inside the macro: the \-continued body must stay clean.
#define round16aligned \
	MOVM.IA.W (p), [x1, x2, x3, x4] \
\
	MULA x1, prime2r, v1, v1 \
	MULA x2, prime2r, v2, v2 \
	MULA x3, prime2r, v3, v3 \
	MULA x4, prime2r, v4, v4 \
\
	MOVW v1 @> 19, v1 \
	MOVW v2 @> 19, v2 \
	MOVW v3 @> 19, v3 \
	MOVW v4 @> 19, v4 \
\
	MUL prime1r, v1 \
	MUL prime1r, v2 \
	MUL prime1r, v3 \
	MUL prime1r, v4 \

// round16unaligned: the same 16-byte round for input that is not 4-byte
// aligned. Each 32-bit word is assembled little-endian from four byte
// loads (word loads would require alignment). The first MOVBU.P advances
// p by the whole 16 bytes up front; the remaining fifteen bytes are then
// read at negative offsets -15..-1 from the new p.
#define round16unaligned \
	MOVBU.P 16(p), x1 \
	MOVBU -15(p), x2 \
	ORR x2 << 8, x1 \
	MOVBU -14(p), x3 \
	MOVBU -13(p), x4 \
	ORR x4 << 8, x3 \
	ORR x3 << 16, x1 \
\
	MULA x1, prime2r, v1, v1 \
	MOVW v1 @> 19, v1 \
	MUL prime1r, v1 \
\
	MOVBU -12(p), x1 \
	MOVBU -11(p), x2 \
	ORR x2 << 8, x1 \
	MOVBU -10(p), x3 \
	MOVBU -9(p), x4 \
	ORR x4 << 8, x3 \
	ORR x3 << 16, x1 \
\
	MULA x1, prime2r, v2, v2 \
	MOVW v2 @> 19, v2 \
	MUL prime1r, v2 \
\
	MOVBU -8(p), x1 \
	MOVBU -7(p), x2 \
	ORR x2 << 8, x1 \
	MOVBU -6(p), x3 \
	MOVBU -5(p), x4 \
	ORR x4 << 8, x3 \
	ORR x3 << 16, x1 \
\
	MULA x1, prime2r, v3, v3 \
	MOVW v3 @> 19, v3 \
	MUL prime1r, v3 \
\
	MOVBU -4(p), x1 \
	MOVBU -3(p), x2 \
	ORR x2 << 8, x1 \
	MOVBU -2(p), x3 \
	MOVBU -1(p), x4 \
	ORR x4 << 8, x3 \
	ORR x3 << 16, x1 \
\
	MULA x1, prime2r, v4, v4 \
	MOVW v4 @> 19, v4 \
	MUL prime1r, v4 \

// func ChecksumZero([]byte) uint32
//
// One-shot xxHash32 of the input with seed 0 (hence "Zero": the lane
// initializers below are the standard ones with seed = 0).
// Register roles are the #defines at the top of the file. Throughout,
// @> k is ARM rotate-right by k, i.e. rotate-left by 32-k.
TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
	MOVW input_base+0(FP), p
	MOVW input_len+4(FP), n

	MOVW $const_prime1, prime1r
	MOVW $const_prime2, prime2r

	// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
	// here, but that's a pseudo-op that generates a load through R11.
	MOVW $const_prime5, prime5r
	ADD prime5r, n, h // h = len + prime5, the short-input initialization

	CMP $0, n
	BEQ end // empty input: straight to the avalanche

	// We let n go negative so we can do comparisons with SUB.S
	// instead of separate CMP.
	SUB.S $16, n
	BMI loop16done // fewer than 16 bytes: skip the lane loops

	// Lane accumulators for seed 0.
	ADD prime1r, prime2r, v1 // v1 = prime1 + prime2
	MOVW prime2r, v2         // v2 = prime2
	MOVW $0, v3              // v3 = 0
	RSB $0, prime1r, v4      // v4 = 0 - prime1

	TST $3, p
	BNE loop16unaligned

loop16aligned:
	SUB.S $16, n
	round16aligned
	BPL loop16aligned
	B loop16finish

loop16unaligned:
	SUB.S $16, n
	round16unaligned
	BPL loop16unaligned

loop16finish:
	// Merge lanes: h = rotl(v1,1) + rotl(v2,7) + rotl(v3,12) + rotl(v4,18).
	MOVW v1 @> 31, h
	ADD v2 @> 25, h
	ADD v3 @> 20, h
	ADD v4 @> 14, h

	// h += len(input) with v2 as temporary.
	MOVW input_len+4(FP), v2
	ADD v2, h

loop16done:
	ADD $16, n // Restore number of bytes left.

	// 4-byte rounds: h = rotl(h + word*prime3, 17) * prime4.
	SUB.S $4, n
	MOVW $const_prime3, prime3r
	BMI loop4done
	MOVW $const_prime4, prime4r

	TST $3, p
	BNE loop4unaligned

loop4aligned:
	SUB.S $4, n
	MOVW.P 4(p), x1
	MULA prime3r, x1, h, h
	MOVW h @> 15, h // rotate left by 17
	MUL prime4r, h
	BPL loop4aligned
	B loop4done

loop4unaligned:
	SUB.S $4, n
	// Assemble a little-endian word from four byte loads (p unaligned).
	MOVBU.P 4(p), x1
	MOVBU -3(p), x2
	ORR x2 << 8, x1
	MOVBU -2(p), x3
	ORR x3 << 16, x1
	MOVBU -1(p), x4
	ORR x4 << 24, x1
	MULA prime3r, x1, h, h
	MOVW h @> 15, h
	MUL prime4r, h
	BPL loop4unaligned

loop4done:
	ADD.S $4, n // Restore number of bytes left.
	BEQ end

	// Trailing bytes, one at a time: h = rotl(h + b*prime5, 11) * prime1.
	MOVW $const_prime5, prime5r

loop1:
	SUB.S $1, n
	MOVBU.P 1(p), x1
	MULA prime5r, x1, h, h
	MOVW h @> 21, h // rotate left by 11
	MUL prime1r, h
	BNE loop1

end:
	// Final avalanche: xor-shift / multiply mixing of h.
	MOVW $const_prime3, prime3r
	EOR h >> 15, h
	MUL prime2r, h
	EOR h >> 13, h
	MUL prime3r, h
	EOR h >> 16, h

	MOVW h, ret+12(FP)
	RET
// func update(v *[4]uint64, buf *[16]byte, p []byte)
//
// NOTE(review): the MOVM.IA instructions below move exactly four 32-bit
// registers (16 bytes) in and out of *v, so the state looks like
// [4]uint32; the uint64 in the signature comment above appears to be a
// typo — confirm against the Go declaration.
//
// Streaming helper: runs one 16-byte round over *buf (if buf != nil),
// then over every complete 16-byte block of p, updating the lane
// accumulators in *v. Leftover bytes (< 16) are not consumed here.
// buf is fed to the aligned round — presumably the caller guarantees
// its 4-byte alignment; verify at the call site.
TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
	MOVW v+0(FP), p
	MOVM.IA (p), [v1, v2, v3, v4] // load the four lane accumulators

	MOVW $const_prime1, prime1r
	MOVW $const_prime2, prime2r

	// Process buf, if not nil.
	MOVW buf+4(FP), p
	CMP $0, p
	BEQ noBuffered
	round16aligned
noBuffered:
	MOVW input_base+8(FP), p
	MOVW input_len+12(FP), n

	SUB.S $16, n
	BMI end // less than one full block: nothing to do

	TST $3, p
	BNE loop16unaligned

loop16aligned:
	SUB.S $16, n
	round16aligned
	BPL loop16aligned
	B end

loop16unaligned:
	SUB.S $16, n
	round16unaligned
	BPL loop16unaligned

end:
	MOVW v+0(FP), p
	MOVM.IA [v1, v2, v3, v4], (p) // store the updated accumulators
	RET