decode_amd64.s

// +build !appengine
// +build gc
// +build !noasm

#include "go_asm.h"
#include "textflag.h"

// AX  scratch
// BX  scratch
// CX  literal and match lengths
// DX  token, match offset
//
// DI  &dst
// SI  &src
// R8  &dst + len(dst)
// R9  &src + len(src)
// R11 &dst
// R12 short output end
// R13 short input end
// R14 &dict
// R15 len(dict)
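
// The Go side of this routine lives in a companion .go file; a minimal
// sketch of that declaration, matching the signature comment below
// (the //go:noescape directive is an assumption, not confirmed here):
//
//	//go:noescape
//	func decodeBlock(dst, src, dict []byte) int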
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOSPLIT, $48-80
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	CMPQ R9, $0
	JE   err_corrupt
	ADDQ SI, R9

	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13

	XORL CX, CX
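
	// Each LZ4 sequence begins with a one-byte token: the high nibble is
	// the literal length, the low nibble is the match length (biased by
	// minMatch), and 0xF in either nibble means the length continues in
	// subsequent bytes. A rough Go rendering of the split, for reference:
	//
	//	token := uint32(src[si])
	//	litLen := token >> 4 // 0xF: extended literal length follows
	//	mLen := token & 0xF  // 0xF: extended match length follows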
loop:
	// token := uint32(src[si])
	MOVBLZX (SI), DX
	INCQ SI

	// CX = lit_len = token >> 4
	MOVL DX, CX
	SHRL $4, CX

	// if lit_len == 0xF, read the extended literal length
	CMPL CX, $0xF
	JEQ  lit_len_loop

	// fall back to the safe path near the ends of dst and src
	CMPQ DI, R12
	JAE  copy_literal
	CMPQ SI, R13
	JAE  copy_literal

	// copy shortcut
	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space, enter
	//    the shortcut and copy 16 bytes on behalf of the literals (in the
	//    fast mode, only 8 bytes can be safely copied this way).
	// 2) Further, if the match length is 4..18, copy 18 bytes in a similar
	//    manner; we ensure earlier, upon entering the shortcut, that there
	//    is enough space in the output for those 18 bytes (in other words,
	//    there is a combined check for both stages).
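	//
	// A rough Go equivalent of the shortcut, for reference (bounds checks
	// and the dictionary case omitted; litLen, mLen, offset as above):
	//
	//	copy(dst[di:di+16], src[si:si+16]) // stage 1: blind 16-byte literal copy
	//	di += litLen
	//	si += litLen
	//	offset := int(src[si]) | int(src[si+1])<<8
	//	si += 2
	//	if mLen != 0xF && offset >= 8 {
	//		// stage 2: 18 bytes as 8+8+2 chunks; offset >= 8 keeps each
	//		// chunk's source and destination from overlapping
	//		copy(dst[di:di+8], dst[di-offset:])
	//		copy(dst[di+8:di+16], dst[di+8-offset:])
	//		copy(dst[di+16:di+18], dst[di+16-offset:])
	//		di += mLen + minMatch
	//	}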
	// copy literal
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	MOVL DX, CX
	ANDL $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(src[si]) | uint16(src[si+1])<<8
	MOVWLZX (SI), DX
	TESTL DX, DX
	JE   err_corrupt
	ADDQ $2, SI
	JC   err_short_buf

	MOVQ DI, AX
	SUBQ DX, AX
	JC   err_corrupt
	CMPQ AX, DI
	JA   err_short_buf

	// If we can't do the second stage, jump straight to reading the
	// match length; we already have the offset.
	CMPL CX, $0xF
	JEQ  match_len_loop_pre
	CMPL DX, $8
	JLT  match_len_loop_pre
	CMPQ AX, R11
	JB   match_len_loop_pre

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	// di += match_len + minMatch
	LEAQ const_minMatch(DI)(CX*1), DI

	// shortcut complete, load next token
	JMP loopcheck

	// Read the rest of the literal length:
	// do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
lit_len_loop:
	CMPQ SI, R9
	JAE  err_short_buf

	MOVBLZX (SI), BX
	INCQ SI
	ADDQ BX, CX

	CMPB BX, $0xFF
	JE   lit_len_loop
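
	// Go equivalent of the loop above, for reference: the length simply
	// accumulates bytes for as long as they read 0xFF.
	//
	//	for {
	//		b := src[si]
	//		si++
	//		litLen += int(b)
	//		if b != 0xFF {
	//			break
	//		}
	//	}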
copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	JC   err_short_buf
	CMPQ AX, R9
	JA   err_short_buf

	MOVQ DI, BX
	ADDQ CX, BX
	JC   err_short_buf
	CMPQ BX, R8
	JA   err_short_buf

	// Copy literals of <=48 bytes through the XMM registers.
	CMPQ CX, $48
	JGT  memmove_lit

	// if len(dst[di:]) < 48
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $48
	JLT  memmove_lit

	// if len(src[si:]) < 48
	MOVQ R9, BX
	SUBQ SI, BX
	CMPQ BX, $48
	JLT  memmove_lit

	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU 32(SI), X2
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)

	ADDQ CX, SI
	ADDQ CX, DI

	JMP finish_lit_copy

memmove_lit:
	// memmove(to, from, len)
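	// runtime·memmove is func memmove(to, from unsafe.Pointer, n uintptr).
	// Called from assembly it uses the ABI0 convention: the three arguments
	// are passed on the stack at 0(SP), 8(SP), and 16(SP), and the callee
	// may clobber every general-purpose register, hence the spills below.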
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)

	// Spill registers. Increment SI, DI now so we don't need to save CX.
	ADDQ CX, DI
	ADDQ CX, SI
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVL DX, 40(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVL 40(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

finish_lit_copy:
	// CX := mLen
	// free up DX to use for offset
	MOVL DX, CX
	ANDL $0xF, CX

	CMPQ SI, R9
	JAE  end

	// offset
	// si += 2
	// DX := int(src[si-2]) | int(src[si-1])<<8
	ADDQ $2, SI
	JC   err_short_buf
	CMPQ SI, R9
	JA   err_short_buf
	MOVWQZX -2(SI), DX

	// 0 offset is invalid
	TESTL DX, DX
	JEQ  err_corrupt

match_len_loop_pre:
	// if mlen != 0xF, we already have the full match length
	CMPB CX, $0xF
	JNE  copy_match

	// do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
match_len_loop:
	CMPQ SI, R9
	JAE  err_short_buf

	MOVBLZX (SI), BX
	INCQ SI
	ADDQ BX, CX

	CMPB BX, $0xFF
	JE   match_len_loop

copy_match:
	ADDQ $const_minMatch, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	JC   err_short_buf
	CMPQ AX, R8
	JA   err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	JC   copy_match_from_dict
	CMPQ BX, R11
	JBE  copy_match_from_dict

	// if di-offset+match_len < di, the match source ends before the
	// destination starts and the regions don't overlap
	LEAQ (BX)(CX*1), AX
	CMPQ DI, AX
	JA   copy_interior_match

	// The match overlaps the bytes being written: copy byte by byte so
	// that each iteration can read bytes produced by the previous one.
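	// Example: offset == 1 and match_len == 4 produce dst[di-1] repeated
	// four times; every byte read here was written by the previous
	// iteration, which a single bulk memmove would not reproduce.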
copy_match_loop:
	// for match_len > 0
	// dst[di] = dst[i]
	// di++
	// i++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX
	JNZ  copy_match_loop

	JMP loopcheck

copy_interior_match:
	CMPQ CX, $16
	JGT  memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT  memmove_match

	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	XORL CX, CX
	JMP  loopcheck

copy_match_from_dict:
	// CX = match_len
	// BX = &dst + (di - offset)

	// AX = offset - di = dict_bytes_available => count of bytes potentially
	// covered by the dictionary
	MOVQ R11, AX
	SUBQ BX, AX

	// BX = len(dict) - dict_bytes_available
	MOVQ R15, BX
	SUBQ AX, BX
	JS   err_short_dict
	ADDQ R14, BX

	// if match_len < dict_bytes_available, the match fits entirely within
	// the external dictionary: just copy
	CMPQ CX, AX
	JLT  memmove_match

	// The match stretches over the dictionary and our block
	// 1) copy what comes from the dictionary
	// AX = dict_bytes_available = copy_size
	// BX = &dict_end - copy_size
	// CX = match_len
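	//
	// A rough Go sketch of the split copy (variable names are illustrative,
	// not taken from the Go sources):
	//
	//	tail := dict[len(dict)-dictAvail:]  // tail of the dictionary
	//	copy(dst[di:di+dictAvail], tail)    // 1) the dictionary part
	//	di += dictAvail
	//	restLen := matchLen - dictAvail
	//	// 2) the rest comes from the start of this block, dst[0:restLen],
	//	// byte by byte if it overlaps the bytes being written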
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ AX, 16(SP)

	// spill the registers we need after the call
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 16(SP), AX // copy_size
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX // match_len

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	// di += copy_size
	ADDQ AX, DI

	// 2) copy the rest from the current block
	// CX = match_len - copy_size = rest_size
	SUBQ AX, CX
	MOVQ R11, BX

	// check if we have a copy overlap
	// AX = &dst + rest_size
	MOVQ CX, AX
	ADDQ BX, AX
	// if &dst + rest_size > di, copy byte by byte
	CMPQ AX, DI
	JA   copy_match_loop

memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)

	// Spill registers. Increment DI now so we don't need to save CX.
	ADDQ CX, DI
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)

	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13
	MOVQ dict_base+48(FP), R14
	MOVQ dict_len+56(FP), R15

	XORL CX, CX

loopcheck:
	// for si < len(src)
	CMPQ SI, R9
	JB   loop

end:
	// Remaining length must be zero.
	TESTQ CX, CX
	JNE   err_corrupt

	SUBQ R11, DI
	MOVQ DI, ret+72(FP)
	RET

err_corrupt:
	MOVQ $-1, ret+72(FP)
	RET

err_short_buf:
	MOVQ $-2, ret+72(FP)
	RET

err_short_dict:
	MOVQ $-3, ret+72(FP)
	RET
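
// The return value is interpreted by the Go caller: >= 0 is the number of
// bytes written to dst; -1 means corrupt input, -2 a too-short buffer, and
// -3 a too-short dictionary. A minimal sketch of such a wrapper (error
// names assumed, not taken from the Go sources):
//
//	func uncompressBlock(dst, src, dict []byte) (int, error) {
//		switch n := decodeBlock(dst, src, dict); {
//		case n >= 0:
//			return n, nil
//		case n == -2:
//			return 0, errShortBuffer
//		case n == -3:
//			return 0, errShortDict
//		default:
//			return 0, errCorruptInput
//		}
//	}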