// decode_arm.s: 32-bit ARM implementation of decodeBlock for the Go assembler.
  1. // +build gc
  2. // +build !noasm
  3. #include "go_asm.h"
  4. #include "textflag.h"
  5. // Register allocation.
  6. #define dst R0
  7. #define dstorig R1
  8. #define src R2
  9. #define dstend R3
  10. #define srcend R4
  11. #define match R5 // Match address.
  12. #define dictend R6
  13. #define token R7
  14. #define len R8 // Literal and match lengths.
  15. #define offset R7 // Match offset; overlaps with token.
  16. #define tmp1 R9
  17. #define tmp2 R11
  18. #define tmp3 R12
  19. // func decodeBlock(dst, src, dict []byte) int
  20. TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40
  21. MOVW dst_base +0(FP), dst
  22. MOVW dst_len +4(FP), dstend
  23. MOVW src_base +12(FP), src
  24. MOVW src_len +16(FP), srcend
  25. CMP $0, srcend
  26. BEQ shortSrc
  27. ADD dst, dstend
  28. ADD src, srcend
  29. MOVW dst, dstorig
  30. loop:
  31. // Read token. Extract literal length.
  32. MOVBU.P 1(src), token
  33. MOVW token >> 4, len
  34. CMP $15, len
  35. BNE readLitlenDone
  36. readLitlenLoop:
  37. CMP src, srcend
  38. BEQ shortSrc
  39. MOVBU.P 1(src), tmp1
  40. ADD.S tmp1, len
  41. BVS shortDst
  42. CMP $255, tmp1
  43. BEQ readLitlenLoop
  44. readLitlenDone:
  45. CMP $0, len
  46. BEQ copyLiteralDone
  47. // Bounds check dst+len and src+len.
  48. ADD.S dst, len, tmp1
  49. ADD.CC.S src, len, tmp2
  50. BCS shortSrc
  51. CMP dstend, tmp1
  52. //BHI shortDst // Uncomment for distinct error codes.
  53. CMP.LS srcend, tmp2
  54. BHI shortSrc
  55. // Copy literal.
  56. CMP $4, len
  57. BLO copyLiteralFinish
  58. // Copy 0-3 bytes until src is aligned.
  59. TST $1, src
  60. MOVBU.NE.P 1(src), tmp1
  61. MOVB.NE.P tmp1, 1(dst)
  62. SUB.NE $1, len
  63. TST $2, src
  64. MOVHU.NE.P 2(src), tmp2
  65. MOVB.NE.P tmp2, 1(dst)
  66. MOVW.NE tmp2 >> 8, tmp1
  67. MOVB.NE.P tmp1, 1(dst)
  68. SUB.NE $2, len
  69. B copyLiteralLoopCond
  70. copyLiteralLoop:
  71. // Aligned load, unaligned write.
  72. MOVW.P 4(src), tmp1
  73. MOVW tmp1 >> 8, tmp2
  74. MOVB tmp2, 1(dst)
  75. MOVW tmp1 >> 16, tmp3
  76. MOVB tmp3, 2(dst)
  77. MOVW tmp1 >> 24, tmp2
  78. MOVB tmp2, 3(dst)
  79. MOVB.P tmp1, 4(dst)
  80. copyLiteralLoopCond:
  81. // Loop until len-4 < 0.
  82. SUB.S $4, len
  83. BPL copyLiteralLoop
  84. copyLiteralFinish:
  85. // Copy remaining 0-3 bytes.
  86. // At this point, len may be < 0, but len&3 is still accurate.
  87. TST $1, len
  88. MOVB.NE.P 1(src), tmp3
  89. MOVB.NE.P tmp3, 1(dst)
  90. TST $2, len
  91. MOVB.NE.P 2(src), tmp1
  92. MOVB.NE.P tmp1, 2(dst)
  93. MOVB.NE -1(src), tmp2
  94. MOVB.NE tmp2, -1(dst)
  95. copyLiteralDone:
  96. // Initial part of match length.
  97. // This frees up the token register for reuse as offset.
  98. AND $15, token, len
  99. CMP src, srcend
  100. BEQ end
  101. // Read offset.
  102. ADD.S $2, src
  103. BCS shortSrc
  104. CMP srcend, src
  105. BHI shortSrc
  106. MOVBU -2(src), offset
  107. MOVBU -1(src), tmp1
  108. ORR.S tmp1 << 8, offset
  109. BEQ corrupt
  110. // Read rest of match length.
  111. CMP $15, len
  112. BNE readMatchlenDone
  113. readMatchlenLoop:
  114. CMP src, srcend
  115. BEQ shortSrc
  116. MOVBU.P 1(src), tmp1
  117. ADD.S tmp1, len
  118. BVS shortDst
  119. CMP $255, tmp1
  120. BEQ readMatchlenLoop
  121. readMatchlenDone:
  122. // Bounds check dst+len+minMatch.
  123. ADD.S dst, len, tmp1
  124. ADD.CC.S $const_minMatch, tmp1
  125. BCS shortDst
  126. CMP dstend, tmp1
  127. BHI shortDst
  128. RSB dst, offset, match
  129. CMP dstorig, match
  130. BGE copyMatch4
  131. // match < dstorig means the match starts in the dictionary,
  132. // at len(dict) - offset + (dst - dstorig).
  133. MOVW dict_base+24(FP), match
  134. MOVW dict_len +28(FP), dictend
  135. ADD $const_minMatch, len
  136. RSB dst, dstorig, tmp1
  137. RSB dictend, offset, tmp2
  138. ADD.S tmp2, tmp1
  139. BMI shortDict
  140. ADD match, dictend
  141. ADD tmp1, match
  142. copyDict:
  143. MOVBU.P 1(match), tmp1
  144. MOVB.P tmp1, 1(dst)
  145. SUB.S $1, len
  146. CMP.NE match, dictend
  147. BNE copyDict
  148. // If the match extends beyond the dictionary, the rest is at dstorig.
  149. CMP $0, len
  150. BEQ copyMatchDone
  151. MOVW dstorig, match
  152. B copyMatch
  153. // Copy a regular match.
  154. // Since len+minMatch is at least four, we can do a 4× unrolled
  155. // byte copy loop. Using MOVW instead of four byte loads is faster,
  156. // but to remain portable we'd have to align match first, which is
  157. // too expensive. By alternating loads and stores, we also handle
  158. // the case offset < 4.
  159. copyMatch4:
  160. SUB.S $4, len
  161. MOVBU.P 4(match), tmp1
  162. MOVB.P tmp1, 4(dst)
  163. MOVBU -3(match), tmp2
  164. MOVB tmp2, -3(dst)
  165. MOVBU -2(match), tmp3
  166. MOVB tmp3, -2(dst)
  167. MOVBU -1(match), tmp1
  168. MOVB tmp1, -1(dst)
  169. BPL copyMatch4
  170. // Restore len, which is now negative.
  171. ADD.S $4, len
  172. BEQ copyMatchDone
  173. copyMatch:
  174. // Finish with a byte-at-a-time copy.
  175. SUB.S $1, len
  176. MOVBU.P 1(match), tmp2
  177. MOVB.P tmp2, 1(dst)
  178. BNE copyMatch
  179. copyMatchDone:
  180. CMP src, srcend
  181. BNE loop
  182. end:
  183. CMP $0, len
  184. BNE corrupt
  185. SUB dstorig, dst, tmp1
  186. MOVW tmp1, ret+36(FP)
  187. RET
  188. // The error cases have distinct labels so we can put different
  189. // return codes here when debugging, or if the error returns need to
  190. // be changed.
  191. shortDict:
  192. shortDst:
  193. shortSrc:
  194. corrupt:
  195. MOVW $-1, tmp1
  196. MOVW tmp1, ret+36(FP)
  197. RET