decode_arm64.s 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. // +build gc
  2. // +build !noasm
  3. // This implementation assumes that strict alignment checking is turned off.
  4. // The Go compiler makes the same assumption.
  5. #include "go_asm.h"
  6. #include "textflag.h"
  7. // Register allocation.
  // Symbolic names for the general-purpose registers used by decodeBlock.
  // NOTE(review): the leading "N." on each line looks like a listing artifact
  // rather than assembler syntax; confirm against the original file.
  8. #define dst R0 // Output cursor: address of the next byte to write.
  9. #define dstorig R1 // Start of dst; used for the return value and to detect matches that begin before dst.
  10. #define src R2 // Input cursor: address of the next byte to read.
  11. #define dstend R3 // dst base + len(dst), i.e. one past the last output byte.
  12. #define dstend16 R4 // max(dstend - 16, 0); limit for the 16-byte over-copy of literals.
  13. #define srcend R5 // src base + len(src), i.e. one past the last input byte.
  14. #define srcend16 R6 // max(srcend - 16, 0); limit for the 16-byte over-read of literals.
  15. #define match R7 // Match address: where the bytes of the current match are read from.
  16. #define dict R8 // Base address of dict.
  17. #define dictlen R9 // len(dict).
  18. #define dictend R10 // dict + dictlen.
  19. #define token R11 // Current token byte: literal length in the high nibble, match length in the low nibble.
  20. #define len R12 // Literal and match lengths (reused for both in turn).
  21. #define lenRem R13 // len & 7, kept for the tail of the doubleword match copy.
  22. #define offset R14 // Match offset: distance back from dst to the match start.
  23. #define tmp1 R15 // Scratch.
  24. #define tmp2 R16 // Scratch.
  25. #define tmp3 R17 // Scratch.
  26. #define tmp4 R19 // Scratch.
  27. // func decodeBlock(dst, src, dict []byte) int
  //
  // decodeBlock decodes src into dst as a series of sequences. Each sequence
  // is a token byte, an optional run of literal bytes and, unless the input
  // ends right after the literals, a 16-bit match offset plus a match length.
  // A match copies bytes that were already written to dst; when it reaches
  // back before the start of dst, the bytes come from the end of dict.
  //
  // Arguments are read from the frame as three slices (base and length at
  // +0, +24 and +48; the capacities are not read); the result is stored at
  // ret+72(FP).
  //
  // Returns the number of bytes written to dst, or -1 if src is empty or
  // truncated, dst is too small, a match reaches before the start of dict,
  // or the data is otherwise invalid (zero offset, or a non-zero match
  // length nibble in the final token).
  28. TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
  29. LDP dst_base+0(FP), (dst, dstend) // dst = base pointer, dstend = len(dst)
  30. ADD dst, dstend // dstend = dst + len(dst)
  31. MOVD dst, dstorig // remember where the output starts
  32. LDP src_base+24(FP), (src, srcend) // src = base pointer, srcend = len(src)
  33. CBZ srcend, shortSrc // empty input is reported as an error
  34. ADD src, srcend // srcend = src + len(src)
  35. // dstend16 = max(dstend-16, 0) and similarly for srcend16.
  // SUBS sets LO when the subtraction borrows; CSEL then substitutes zero.
  36. SUBS $16, dstend, dstend16
  37. CSEL LO, ZR, dstend16, dstend16
  38. SUBS $16, srcend, srcend16
  39. CSEL LO, ZR, srcend16, srcend16
  40. LDP dict_base+48(FP), (dict, dictlen) // dict = base pointer, dictlen = len(dict)
  41. ADD dict, dictlen, dictend // dictend = dict + len(dict)
  42. loop:
  43. // Read token. Extract literal length.
  44. MOVBU.P 1(src), token // token = *src++
  45. LSR $4, token, len // len = token >> 4
  46. CMP $15, len
  47. BNE readLitlenDone // a nibble below 15 is the complete literal length
  48. readLitlenLoop:
  // Nibble 15: add extension bytes until one is not 255.
  49. CMP src, srcend
  50. BEQ shortSrc // input ended inside the length
  51. MOVBU.P 1(src), tmp1
  52. ADDS tmp1, len
  53. BVS shortDst // accumulated length overflowed (signed)
  54. CMP $255, tmp1
  55. BEQ readLitlenLoop // 255 means another length byte follows
  56. readLitlenDone:
  57. CBZ len, copyLiteralDone // no literals in this sequence
  58. // Bounds check dst+len and src+len.
  59. ADDS dst, len, tmp1 // tmp1 = dst + len
  60. BCS shortSrc // dst+len wrapped around; all error labels share one exit
  61. ADDS src, len, tmp2 // tmp2 = src + len
  62. BCS shortSrc // src+len wrapped around
  63. CMP dstend, tmp1
  64. BHI shortDst // dst + len > dstend
  65. CMP srcend, tmp2
  66. BHI shortSrc // src + len > srcend
  67. // Copy literal.
  // len is biased by -16 here: the loop copies 16 bytes per pass while the
  // biased value stays non-negative, leaving the remainder in its low 4 bits.
  68. SUBS $16, len
  69. BLO copyLiteralShort // fewer than 16 literal bytes
  70. copyLiteralLoop:
  71. LDP.P 16(src), (tmp1, tmp2) // load 16 bytes, src += 16
  72. STP.P (tmp1, tmp2), 16(dst) // store 16 bytes, dst += 16
  73. SUBS $16, len
  74. BPL copyLiteralLoop
  75. // Copy (final part of) literal of length 0-15.
  76. // If we have >=16 bytes left in src and dst, just copy 16 bytes.
  77. copyLiteralShort:
  // Take the slow path unless dst < dstend16 and src < srcend16.
  78. CMP dstend16, dst
  79. CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
  80. BHS copyLiteralShortEnd
  81. AND $15, len // remaining literal bytes (0-15)
  // Copy a full 16 bytes but advance by len only; the excess bytes lie
  // inside the buffers per the check above.
  82. LDP (src), (tmp1, tmp2)
  83. ADD len, src
  84. STP (tmp1, tmp2), (dst)
  85. ADD len, dst
  86. B copyLiteralDone
  87. // Safe but slow copy near the end of src, dst.
  // Copies 8, 4, 2 and 1 bytes according to bits 3..0 of len. Each TBZ
  // skips its load/store pair (3(PC)) when the tested bit is clear.
  88. copyLiteralShortEnd:
  89. TBZ $3, len, 3(PC)
  90. MOVD.P 8(src), tmp1
  91. MOVD.P tmp1, 8(dst)
  92. TBZ $2, len, 3(PC)
  93. MOVW.P 4(src), tmp2
  94. MOVW.P tmp2, 4(dst)
  95. TBZ $1, len, 3(PC)
  96. MOVH.P 2(src), tmp3
  97. MOVH.P tmp3, 2(dst)
  98. TBZ $0, len, 3(PC)
  99. MOVBU.P 1(src), tmp4
  100. MOVB.P tmp4, 1(dst)
  101. copyLiteralDone:
  102. // Initial part of match length.
  103. AND $15, token, len // len = token & 15
  104. CMP src, srcend
  105. BEQ end // input ends after the literals: no match follows
  106. // Read offset.
  // Advance src past the two offset bytes first, check bounds, then load
  // them from behind the cursor.
  107. ADDS $2, src
  108. BCS shortSrc // src+2 wrapped around
  109. CMP srcend, src
  110. BHI shortSrc // fewer than two bytes were left
  111. MOVHU -2(src), offset // 16-bit load, zero-extended
  112. CBZ offset, corrupt // an offset of zero is rejected
  113. // Read rest of match length.
  114. CMP $15, len
  115. BNE readMatchlenDone // a nibble below 15 needs no extension bytes
  116. readMatchlenLoop:
  // Same scheme as readLitlenLoop: add bytes until one is not 255.
  117. CMP src, srcend
  118. BEQ shortSrc // input ended inside the length
  119. MOVBU.P 1(src), tmp1
  120. ADDS tmp1, len
  121. BVS shortDst // accumulated length overflowed (signed)
  122. CMP $255, tmp1
  123. BEQ readMatchlenLoop
  124. readMatchlenDone:
  // const_minMatch is presumably generated into go_asm.h from a Go constant
  // named minMatch — TODO confirm.
  125. ADD $const_minMatch, len
  126. // Bounds check dst+len.
  127. ADDS dst, len, tmp2 // tmp2 = dst + len
  128. BCS shortDst // dst+len wrapped around
  129. CMP dstend, tmp2
  130. BHI shortDst // dst + len > dstend
  131. SUB offset, dst, match // match = dst - offset
  132. CMP dstorig, match
  133. BHS copyMatchTry8 // match >= dstorig: the match lies entirely in dst
  134. // match < dstorig means the match starts in the dictionary,
  135. // at len(dict) - offset + (dst - dstorig).
  136. SUB dstorig, dst, tmp1 // tmp1 = dst - dstorig
  137. SUB offset, dictlen, tmp2 // tmp2 = len(dict) - offset
  138. ADDS tmp2, tmp1 // tmp1 = index of the match start within dict
  139. BMI shortDict // negative index: the match starts before dict
  140. ADD dict, tmp1, match
  141. copyDict:
  // Byte copy out of dict; continues while len != 0 and match != dictend.
  142. MOVBU.P 1(match), tmp3
  143. MOVB.P tmp3, 1(dst)
  144. SUBS $1, len
  145. CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
  146. BNE copyDict
  147. CBZ len, copyMatchDone // the whole match came from dict
  148. // If the match extends beyond the dictionary, the rest is at dstorig.
  149. // Recompute the offset for the next check.
  150. MOVD dstorig, match
  151. SUB dstorig, dst, offset // offset = dst - match
  152. copyMatchTry8:
  153. // Copy doublewords if both len and offset are at least eight.
  154. // A 16-at-a-time loop doesn't provide a further speedup.
  // The CCMP compares offset with 8 only when len >= 8; otherwise it clears
  // the flags, which reads as LO.
  155. CMP $8, len
  156. CCMP HS, offset, $8, $0
  157. BLO copyMatchLoop1
  158. AND $7, len, lenRem // lenRem = len & 7
  159. SUB $8, len // bias len by -8 so the loop stops before the final doubleword
  160. copyMatchLoop8:
  161. MOVD.P 8(match), tmp1
  162. MOVD.P tmp1, 8(dst)
  163. SUBS $8, len
  164. BPL copyMatchLoop8
  // Tail: copy the last 8 bytes of the match so that they end exactly at
  // the final dst position.
  165. MOVD (match)(len), tmp2 // match+len == match+lenRem-8.
  166. ADD lenRem, dst
  167. MOVD $0, len // the match is fully copied
  168. MOVD tmp2, -8(dst)
  169. B copyMatchDone
  170. copyMatchLoop1:
  171. // Byte-at-a-time copy for small offsets.
  172. MOVBU.P 1(match), tmp2
  173. MOVB.P tmp2, 1(dst)
  174. SUBS $1, len
  175. BNE copyMatchLoop1
  176. copyMatchDone:
  177. CMP src, srcend
  178. BNE loop // more input: decode the next sequence
  179. end:
  // Success exit. len is non-zero here only when the input ended right
  // after a literal run whose token carried a non-zero match length nibble.
  180. CBNZ len, corrupt
  181. SUB dstorig, dst, tmp1 // number of bytes written
  182. MOVD tmp1, ret+72(FP)
  183. RET
  184. // The error cases have distinct labels so we can put different
  185. // return codes here when debugging, or if the error returns need to
  186. // be changed.
  187. shortDict:
  188. shortDst:
  189. shortSrc:
  190. corrupt:
  191. MOVD $-1, tmp1 // every error returns -1
  192. MOVD tmp1, ret+72(FP)
  193. RET
  193. RET