seqdec_amd64.s 82 KB


  1. // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  4. // Requires: CMOV
  //
  // Decodes sequences in a loop. Each iteration writes one 24-byte record through
  // the pointer loaded from 104(ctx): literal length at +0, match length at +8,
  // offset at +16. The loop repeats until the counter at 96(ctx) goes negative.
  //
  // Register roles, as used below:
  //   DX            bit buffer (loaded from and stored back to 24(br))
  //   BX            number of bits already consumed from DX (byte at 32(br))
  //   SI            value of 8(br); decremented for every input byte fetched
  //   (SP) / R14    input read pointer, initialised to (br) + 8(br); it moves downwards
  //   DI, R8, R9    literal-length, match-length and offset state entries (72/80/88(ctx))
  //   R10           output record pointer
  //   R11, R12, R13 three most recent offsets (144/152/160(s)), newest in R11
  //
  // State entry layout, as read by this code: bits 0-7 = bit count for the state
  // update, bits 8-15 = bit count for the value, bits 16-31 = base of the next
  // state index, bits 32-63 = value baseline.
  //
  // Return values: 0 success; 1 zero offset with non-zero match length;
  // 2 match length above 0x20002; 4 the literal budget at 128(ctx) went negative;
  // 6 bit reader overread.
  5. TEXT ·sequenceDecs_decode_amd64(SB), $8-32
  6. MOVQ br+8(FP), CX
  7. MOVQ 24(CX), DX // DX = bit buffer
  8. MOVBQZX 32(CX), BX // BX = bits already consumed from DX
  9. MOVQ (CX), AX
  10. MOVQ 8(CX), SI
  11. ADDQ SI, AX
  12. MOVQ AX, (SP) // read pointer = (br) + 8(br)
  13. MOVQ ctx+16(FP), AX
  14. MOVQ 72(AX), DI // literal-length state entry
  15. MOVQ 80(AX), R8 // match-length state entry
  16. MOVQ 88(AX), R9 // offset state entry
  17. MOVQ 104(AX), R10 // output record pointer
  18. MOVQ s+0(FP), AX
  19. MOVQ 144(AX), R11 // most recent offset
  20. MOVQ 152(AX), R12 // second most recent offset
  21. MOVQ 160(AX), R13 // third most recent offset
  22. sequenceDecs_decode_amd64_main_loop:
  23. MOVQ (SP), R14 // R14 = read pointer for this iteration
  24. // Fill bitreader to have enough for the offset and match length.
  25. CMPQ SI, $0x08
  26. JL sequenceDecs_decode_amd64_fill_byte_by_byte // SI below 8: refill one byte at a time
  27. MOVQ BX, AX
  28. SHRQ $0x03, AX // whole bytes consumed so far
  29. SUBQ AX, R14 // step the read pointer back by that many bytes
  30. MOVQ (R14), DX // reload a full 8-byte window
  31. SUBQ AX, SI
  32. ANDQ $0x07, BX // keep only the sub-byte bit position
  33. JMP sequenceDecs_decode_amd64_fill_end
  34. sequenceDecs_decode_amd64_fill_byte_by_byte:
  35. CMPQ SI, $0x00
  36. JLE sequenceDecs_decode_amd64_fill_check_overread // no input left to fetch
  37. CMPQ BX, $0x07
  38. JLE sequenceDecs_decode_amd64_fill_end // less than one whole byte consumed: done
  39. SHLQ $0x08, DX // make room for one byte at the low end
  40. SUBQ $0x01, R14
  41. SUBQ $0x01, SI
  42. SUBQ $0x08, BX
  43. MOVBQZX (R14), AX
  44. ORQ AX, DX
  45. JMP sequenceDecs_decode_amd64_fill_byte_by_byte
  46. sequenceDecs_decode_amd64_fill_check_overread:
  47. CMPQ BX, $0x40
  48. JA error_overread // more than 64 bits consumed with nothing left to refill
  49. sequenceDecs_decode_amd64_fill_end:
  50. // Update offset
  51. MOVQ R9, AX // AX = offset state entry
  52. MOVQ BX, CX
  53. MOVQ DX, R15
  54. SHLQ CL, R15 // discard the BX bits already consumed
  55. MOVB AH, CL // CL = value bit count (byte 1 of the entry)
  56. SHRQ $0x20, AX // AX = baseline (upper 32 bits of the entry)
  57. TESTQ CX, CX
  58. JZ sequenceDecs_decode_amd64_of_update_zero // no extra bits: the baseline is the value
  59. ADDQ CX, BX // consume the bits
  60. CMPQ BX, $0x40
  61. JA sequenceDecs_decode_amd64_of_update_zero // would run past the 64-bit buffer: keep baseline only
  62. CMPQ CX, $0x40
  63. JAE sequenceDecs_decode_amd64_of_update_zero
  64. NEGQ CX
  65. SHRQ CL, R15 // count is -n mod 64, so the top n bits move to the bottom
  66. ADDQ R15, AX
  67. sequenceDecs_decode_amd64_of_update_zero:
  68. MOVQ AX, 16(R10) // record.offset (raw, adjusted further below)
  69. // Update match length
  70. MOVQ R8, AX // same steps as the offset read, using the match-length entry
  71. MOVQ BX, CX
  72. MOVQ DX, R15
  73. SHLQ CL, R15
  74. MOVB AH, CL
  75. SHRQ $0x20, AX
  76. TESTQ CX, CX
  77. JZ sequenceDecs_decode_amd64_ml_update_zero
  78. ADDQ CX, BX
  79. CMPQ BX, $0x40
  80. JA sequenceDecs_decode_amd64_ml_update_zero
  81. CMPQ CX, $0x40
  82. JAE sequenceDecs_decode_amd64_ml_update_zero
  83. NEGQ CX
  84. SHRQ CL, R15
  85. ADDQ R15, AX
  86. sequenceDecs_decode_amd64_ml_update_zero:
  87. MOVQ AX, 8(R10) // record.matchLength
  88. // Fill bitreader to have enough for the remaining
  89. CMPQ SI, $0x08
  90. JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
  91. MOVQ BX, AX // same refill procedure as at the top of the loop
  92. SHRQ $0x03, AX
  93. SUBQ AX, R14
  94. MOVQ (R14), DX
  95. SUBQ AX, SI
  96. ANDQ $0x07, BX
  97. JMP sequenceDecs_decode_amd64_fill_2_end
  98. sequenceDecs_decode_amd64_fill_2_byte_by_byte:
  99. CMPQ SI, $0x00
  100. JLE sequenceDecs_decode_amd64_fill_2_check_overread
  101. CMPQ BX, $0x07
  102. JLE sequenceDecs_decode_amd64_fill_2_end
  103. SHLQ $0x08, DX
  104. SUBQ $0x01, R14
  105. SUBQ $0x01, SI
  106. SUBQ $0x08, BX
  107. MOVBQZX (R14), AX
  108. ORQ AX, DX
  109. JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
  110. sequenceDecs_decode_amd64_fill_2_check_overread:
  111. CMPQ BX, $0x40
  112. JA error_overread
  113. sequenceDecs_decode_amd64_fill_2_end:
  114. // Update literal length
  115. MOVQ DI, AX // same steps as the offset read, using the literal-length entry
  116. MOVQ BX, CX
  117. MOVQ DX, R15
  118. SHLQ CL, R15
  119. MOVB AH, CL
  120. SHRQ $0x20, AX
  121. TESTQ CX, CX
  122. JZ sequenceDecs_decode_amd64_ll_update_zero
  123. ADDQ CX, BX
  124. CMPQ BX, $0x40
  125. JA sequenceDecs_decode_amd64_ll_update_zero
  126. CMPQ CX, $0x40
  127. JAE sequenceDecs_decode_amd64_ll_update_zero
  128. NEGQ CX
  129. SHRQ CL, R15
  130. ADDQ R15, AX
  131. sequenceDecs_decode_amd64_ll_update_zero:
  132. MOVQ AX, (R10) // record.literalLength
  133. // Fill bitreader for state updates
  134. MOVQ R14, (SP) // save the read pointer; R14 is reused as scratch below
  135. MOVQ R9, AX
  136. SHRQ $0x08, AX
  137. MOVBQZX AL, AX // AX = value bit count of the current offset entry, kept for "Adjust offset"
  138. MOVQ ctx+16(FP), CX
  139. CMPQ 96(CX), $0x00
  140. JZ sequenceDecs_decode_amd64_skip_update // counter is zero: no state update on this pass
  141. // Update Literal Length State
  142. MOVBQZX DI, R14 // R14 = state bit count (low byte of the entry)
  143. SHRQ $0x10, DI
  144. MOVWQZX DI, DI // DI = bits 16-31 of the entry (base of the next state index)
  145. LEAQ (BX)(R14*1), CX
  146. MOVQ DX, R15
  147. MOVQ CX, BX // BX += R14 (bits consumed)
  148. ROLQ CL, R15 // rotate the just-consumed bits down to the low end
  149. MOVL $0x00000001, BP
  150. MOVB R14, CL
  151. SHLL CL, BP
  152. DECL BP // BP = (1 << R14) - 1
  153. ANDQ BP, R15
  154. ADDQ R15, DI // next state index
  155. // Load ctx.llTable
  156. MOVQ ctx+16(FP), CX
  157. MOVQ (CX), CX
  158. MOVQ (CX)(DI*8), DI // fetch the 8-byte entry for the new state
  159. // Update Match Length State
  160. MOVBQZX R8, R14 // same steps as the literal-length state update
  161. SHRQ $0x10, R8
  162. MOVWQZX R8, R8
  163. LEAQ (BX)(R14*1), CX
  164. MOVQ DX, R15
  165. MOVQ CX, BX
  166. ROLQ CL, R15
  167. MOVL $0x00000001, BP
  168. MOVB R14, CL
  169. SHLL CL, BP
  170. DECL BP
  171. ANDQ BP, R15
  172. ADDQ R15, R8
  173. // Load ctx.mlTable
  174. MOVQ ctx+16(FP), CX
  175. MOVQ 24(CX), CX
  176. MOVQ (CX)(R8*8), R8
  177. // Update Offset State
  178. MOVBQZX R9, R14 // same steps as the literal-length state update
  179. SHRQ $0x10, R9
  180. MOVWQZX R9, R9
  181. LEAQ (BX)(R14*1), CX
  182. MOVQ DX, R15
  183. MOVQ CX, BX
  184. ROLQ CL, R15
  185. MOVL $0x00000001, BP
  186. MOVB R14, CL
  187. SHLL CL, BP
  188. DECL BP
  189. ANDQ BP, R15
  190. ADDQ R15, R9
  191. // Load ctx.ofTable
  192. MOVQ ctx+16(FP), CX
  193. MOVQ 48(CX), CX
  194. MOVQ (CX)(R9*8), R9
  195. sequenceDecs_decode_amd64_skip_update:
  196. // Adjust offset
  197. MOVQ 16(R10), CX // CX = raw offset value stored above
  198. CMPQ AX, $0x01
  199. JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 // bit count 0 or 1: go through the recent-offset logic
  200. MOVQ R12, R13 // otherwise CX is used as-is and pushed onto the history
  201. MOVQ R11, R12
  202. MOVQ CX, R11
  203. JMP sequenceDecs_decode_amd64_after_adjust
  204. sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
  205. CMPQ (R10), $0x00000000
  206. JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
  207. INCQ CX // literal length is zero: the selector is shifted by one
  208. JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
  209. sequenceDecs_decode_amd64_adjust_offset_maybezero:
  210. TESTQ CX, CX
  211. JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
  212. MOVQ R11, CX // selector 0: reuse the newest offset, history unchanged
  213. JMP sequenceDecs_decode_amd64_after_adjust
  214. sequenceDecs_decode_amd64_adjust_offset_nonzero:
  215. CMPQ CX, $0x01
  216. JB sequenceDecs_decode_amd64_adjust_zero
  217. JEQ sequenceDecs_decode_amd64_adjust_one
  218. CMPQ CX, $0x02
  219. JA sequenceDecs_decode_amd64_adjust_three
  220. JMP sequenceDecs_decode_amd64_adjust_two
  221. sequenceDecs_decode_amd64_adjust_zero:
  222. MOVQ R11, AX
  223. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  224. sequenceDecs_decode_amd64_adjust_one:
  225. MOVQ R12, AX // selector 1: second most recent offset
  226. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  227. sequenceDecs_decode_amd64_adjust_two:
  228. MOVQ R13, AX // selector 2: third most recent offset
  229. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  230. sequenceDecs_decode_amd64_adjust_three:
  231. LEAQ -1(R11), AX // selector above 2: newest offset minus one
  232. sequenceDecs_decode_amd64_adjust_test_temp_valid:
  233. TESTQ AX, AX
  234. JNZ sequenceDecs_decode_amd64_adjust_temp_valid
  235. MOVQ $0x00000001, AX // a zero candidate is replaced by 1
  236. sequenceDecs_decode_amd64_adjust_temp_valid:
  237. CMPQ CX, $0x01
  238. CMOVQNE R12, R13 // unless the selector is 1, the second offset moves to third place
  239. MOVQ R11, R12
  240. MOVQ AX, R11 // the chosen offset becomes the newest
  241. MOVQ AX, CX
  242. sequenceDecs_decode_amd64_after_adjust:
  243. MOVQ CX, 16(R10) // record.offset = final offset
  244. // Check values
  245. MOVQ 8(R10), AX // AX = match length
  246. MOVQ (R10), R14 // R14 = literal length
  247. LEAQ (AX)(R14*1), R15
  248. MOVQ s+0(FP), BP
  249. ADDQ R15, 256(BP) // accumulate literal length + match length into 256(s)
  250. MOVQ ctx+16(FP), R15
  251. SUBQ R14, 128(R15) // charge the literal length against the budget at 128(ctx)
  252. JS error_not_enough_literals
  253. CMPQ AX, $0x00020002
  254. JA sequenceDecs_decode_amd64_error_match_len_too_big
  255. TESTQ CX, CX
  256. JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
  257. TESTQ AX, AX
  258. JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // offset 0 is only accepted with match length 0
  259. sequenceDecs_decode_amd64_match_len_ofs_ok:
  260. ADDQ $0x18, R10 // advance to the next 24-byte record
  261. MOVQ ctx+16(FP), AX
  262. DECQ 96(AX)
  263. JNS sequenceDecs_decode_amd64_main_loop // loop while the counter stays non-negative
  264. MOVQ s+0(FP), AX
  265. MOVQ R11, 144(AX) // write the offset history back to s
  266. MOVQ R12, 152(AX)
  267. MOVQ R13, 160(AX)
  268. MOVQ br+8(FP), AX
  269. MOVQ DX, 24(AX) // write the bit reader state back to br
  270. MOVB BL, 32(AX)
  271. MOVQ SI, 8(AX)
  272. // Return success
  273. MOVQ $0x00000000, ret+24(FP)
  274. RET
  275. // Return with match length error
  276. sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
  277. MOVQ $0x00000001, ret+24(FP)
  278. RET
  279. // Return with match too long error
  280. sequenceDecs_decode_amd64_error_match_len_too_big:
  281. MOVQ $0x00000002, ret+24(FP)
  282. RET
  283. // Return with match offset too long error
  284. MOVQ $0x00000003, ret+24(FP) // no label precedes this return; nothing in this function jumps here
  285. RET
  286. // Return with not enough literals error
  287. error_not_enough_literals:
  288. MOVQ $0x00000004, ret+24(FP)
  289. RET
  290. // Return with overread error
  291. error_overread:
  292. MOVQ $0x00000006, ret+24(FP)
  293. RET
  294. // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  295. // Requires: CMOV
  //
  // Decodes sequences in a loop, writing one 24-byte record per iteration through
  // the pointer loaded from 104(ctx) (literal length +0, match length +8,
  // offset +16). The bit buffer is refilled once per iteration, before offset,
  // match length and literal length are all read.
  // NOTE(review): the "_56" suffix presumably means callers guarantee the three
  // values need at most 56 bits together - confirm against the Go caller.
  //
  // Register roles, as used below:
  //   DX            bit buffer (24(br));  BX  bits consumed (byte at 32(br))
  //   SI            value of 8(br), decremented per input byte fetched
  //   (SP) / R14    input read pointer, initialised to (br) + 8(br); moves downwards
  //   DI, R8, R9    literal-length, match-length and offset state entries
  //   R10           output record pointer
  //   R11, R12, R13 three most recent offsets, newest in R11
  //
  // State entry layout, as read by this code: bits 0-7 state bit count,
  // bits 8-15 value bit count, bits 16-31 next-state base, bits 32-63 baseline.
  //
  // Return values: 0 success; 1 zero offset with non-zero match length;
  // 2 match length above 0x20002; 4 literal budget at 128(ctx) went negative;
  // 6 bit reader overread.
  296. TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
  297. MOVQ br+8(FP), CX
  298. MOVQ 24(CX), DX // DX = bit buffer
  299. MOVBQZX 32(CX), BX // BX = bits already consumed from DX
  300. MOVQ (CX), AX
  301. MOVQ 8(CX), SI
  302. ADDQ SI, AX
  303. MOVQ AX, (SP) // read pointer = (br) + 8(br)
  304. MOVQ ctx+16(FP), AX
  305. MOVQ 72(AX), DI // literal-length state entry
  306. MOVQ 80(AX), R8 // match-length state entry
  307. MOVQ 88(AX), R9 // offset state entry
  308. MOVQ 104(AX), R10 // output record pointer
  309. MOVQ s+0(FP), AX
  310. MOVQ 144(AX), R11 // offset history, newest first
  311. MOVQ 152(AX), R12
  312. MOVQ 160(AX), R13
  313. sequenceDecs_decode_56_amd64_main_loop:
  314. MOVQ (SP), R14 // R14 = read pointer for this iteration
  315. // Fill bitreader to have enough for the offset and match length.
  316. CMPQ SI, $0x08
  317. JL sequenceDecs_decode_56_amd64_fill_byte_by_byte // SI below 8: refill one byte at a time
  318. MOVQ BX, AX
  319. SHRQ $0x03, AX // whole bytes consumed so far
  320. SUBQ AX, R14 // step the read pointer back by that many bytes
  321. MOVQ (R14), DX // reload a full 8-byte window
  322. SUBQ AX, SI
  323. ANDQ $0x07, BX // keep only the sub-byte bit position
  324. JMP sequenceDecs_decode_56_amd64_fill_end
  325. sequenceDecs_decode_56_amd64_fill_byte_by_byte:
  326. CMPQ SI, $0x00
  327. JLE sequenceDecs_decode_56_amd64_fill_check_overread // no input left to fetch
  328. CMPQ BX, $0x07
  329. JLE sequenceDecs_decode_56_amd64_fill_end // less than one whole byte consumed: done
  330. SHLQ $0x08, DX
  331. SUBQ $0x01, R14
  332. SUBQ $0x01, SI
  333. SUBQ $0x08, BX
  334. MOVBQZX (R14), AX
  335. ORQ AX, DX
  336. JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
  337. sequenceDecs_decode_56_amd64_fill_check_overread:
  338. CMPQ BX, $0x40
  339. JA error_overread // more than 64 bits consumed with nothing left to refill
  340. sequenceDecs_decode_56_amd64_fill_end:
  341. // Update offset
  342. MOVQ R9, AX // AX = offset state entry
  343. MOVQ BX, CX
  344. MOVQ DX, R15
  345. SHLQ CL, R15 // discard the BX bits already consumed
  346. MOVB AH, CL // CL = value bit count (byte 1 of the entry)
  347. SHRQ $0x20, AX // AX = baseline (upper 32 bits of the entry)
  348. TESTQ CX, CX
  349. JZ sequenceDecs_decode_56_amd64_of_update_zero // no extra bits: the baseline is the value
  350. ADDQ CX, BX
  351. CMPQ BX, $0x40
  352. JA sequenceDecs_decode_56_amd64_of_update_zero // would run past the 64-bit buffer: keep baseline only
  353. CMPQ CX, $0x40
  354. JAE sequenceDecs_decode_56_amd64_of_update_zero
  355. NEGQ CX
  356. SHRQ CL, R15 // count is -n mod 64, so the top n bits move to the bottom
  357. ADDQ R15, AX
  358. sequenceDecs_decode_56_amd64_of_update_zero:
  359. MOVQ AX, 16(R10) // record.offset (raw, adjusted further below)
  360. // Update match length
  361. MOVQ R8, AX // same steps as the offset read, using the match-length entry
  362. MOVQ BX, CX
  363. MOVQ DX, R15
  364. SHLQ CL, R15
  365. MOVB AH, CL
  366. SHRQ $0x20, AX
  367. TESTQ CX, CX
  368. JZ sequenceDecs_decode_56_amd64_ml_update_zero
  369. ADDQ CX, BX
  370. CMPQ BX, $0x40
  371. JA sequenceDecs_decode_56_amd64_ml_update_zero
  372. CMPQ CX, $0x40
  373. JAE sequenceDecs_decode_56_amd64_ml_update_zero
  374. NEGQ CX
  375. SHRQ CL, R15
  376. ADDQ R15, AX
  377. sequenceDecs_decode_56_amd64_ml_update_zero:
  378. MOVQ AX, 8(R10) // record.matchLength
  379. // Update literal length
  380. MOVQ DI, AX // same steps again, using the literal-length entry; no refill in between
  381. MOVQ BX, CX
  382. MOVQ DX, R15
  383. SHLQ CL, R15
  384. MOVB AH, CL
  385. SHRQ $0x20, AX
  386. TESTQ CX, CX
  387. JZ sequenceDecs_decode_56_amd64_ll_update_zero
  388. ADDQ CX, BX
  389. CMPQ BX, $0x40
  390. JA sequenceDecs_decode_56_amd64_ll_update_zero
  391. CMPQ CX, $0x40
  392. JAE sequenceDecs_decode_56_amd64_ll_update_zero
  393. NEGQ CX
  394. SHRQ CL, R15
  395. ADDQ R15, AX
  396. sequenceDecs_decode_56_amd64_ll_update_zero:
  397. MOVQ AX, (R10) // record.literalLength
  398. // Fill bitreader for state updates
  399. MOVQ R14, (SP) // save the read pointer; R14 is reused as scratch below
  400. MOVQ R9, AX
  401. SHRQ $0x08, AX
  402. MOVBQZX AL, AX // AX = value bit count of the current offset entry, kept for "Adjust offset"
  403. MOVQ ctx+16(FP), CX
  404. CMPQ 96(CX), $0x00
  405. JZ sequenceDecs_decode_56_amd64_skip_update // counter is zero: no state update on this pass
  406. // Update Literal Length State
  407. MOVBQZX DI, R14 // R14 = state bit count (low byte of the entry)
  408. SHRQ $0x10, DI
  409. MOVWQZX DI, DI // DI = bits 16-31 of the entry (base of the next state index)
  410. LEAQ (BX)(R14*1), CX
  411. MOVQ DX, R15
  412. MOVQ CX, BX // BX += R14 (bits consumed)
  413. ROLQ CL, R15 // rotate the just-consumed bits down to the low end
  414. MOVL $0x00000001, BP
  415. MOVB R14, CL
  416. SHLL CL, BP
  417. DECL BP // BP = (1 << R14) - 1
  418. ANDQ BP, R15
  419. ADDQ R15, DI // next state index
  420. // Load ctx.llTable
  421. MOVQ ctx+16(FP), CX
  422. MOVQ (CX), CX
  423. MOVQ (CX)(DI*8), DI // fetch the 8-byte entry for the new state
  424. // Update Match Length State
  425. MOVBQZX R8, R14 // same steps as the literal-length state update
  426. SHRQ $0x10, R8
  427. MOVWQZX R8, R8
  428. LEAQ (BX)(R14*1), CX
  429. MOVQ DX, R15
  430. MOVQ CX, BX
  431. ROLQ CL, R15
  432. MOVL $0x00000001, BP
  433. MOVB R14, CL
  434. SHLL CL, BP
  435. DECL BP
  436. ANDQ BP, R15
  437. ADDQ R15, R8
  438. // Load ctx.mlTable
  439. MOVQ ctx+16(FP), CX
  440. MOVQ 24(CX), CX
  441. MOVQ (CX)(R8*8), R8
  442. // Update Offset State
  443. MOVBQZX R9, R14 // same steps as the literal-length state update
  444. SHRQ $0x10, R9
  445. MOVWQZX R9, R9
  446. LEAQ (BX)(R14*1), CX
  447. MOVQ DX, R15
  448. MOVQ CX, BX
  449. ROLQ CL, R15
  450. MOVL $0x00000001, BP
  451. MOVB R14, CL
  452. SHLL CL, BP
  453. DECL BP
  454. ANDQ BP, R15
  455. ADDQ R15, R9
  456. // Load ctx.ofTable
  457. MOVQ ctx+16(FP), CX
  458. MOVQ 48(CX), CX
  459. MOVQ (CX)(R9*8), R9
  460. sequenceDecs_decode_56_amd64_skip_update:
  461. // Adjust offset
  462. MOVQ 16(R10), CX // CX = raw offset value stored above
  463. CMPQ AX, $0x01
  464. JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // bit count 0 or 1: go through the recent-offset logic
  465. MOVQ R12, R13 // otherwise CX is used as-is and pushed onto the history
  466. MOVQ R11, R12
  467. MOVQ CX, R11
  468. JMP sequenceDecs_decode_56_amd64_after_adjust
  469. sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
  470. CMPQ (R10), $0x00000000
  471. JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
  472. INCQ CX // literal length is zero: the selector is shifted by one
  473. JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  474. sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
  475. TESTQ CX, CX
  476. JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  477. MOVQ R11, CX // selector 0: reuse the newest offset, history unchanged
  478. JMP sequenceDecs_decode_56_amd64_after_adjust
  479. sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
  480. CMPQ CX, $0x01
  481. JB sequenceDecs_decode_56_amd64_adjust_zero
  482. JEQ sequenceDecs_decode_56_amd64_adjust_one
  483. CMPQ CX, $0x02
  484. JA sequenceDecs_decode_56_amd64_adjust_three
  485. JMP sequenceDecs_decode_56_amd64_adjust_two
  486. sequenceDecs_decode_56_amd64_adjust_zero:
  487. MOVQ R11, AX
  488. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  489. sequenceDecs_decode_56_amd64_adjust_one:
  490. MOVQ R12, AX // selector 1: second most recent offset
  491. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  492. sequenceDecs_decode_56_amd64_adjust_two:
  493. MOVQ R13, AX // selector 2: third most recent offset
  494. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  495. sequenceDecs_decode_56_amd64_adjust_three:
  496. LEAQ -1(R11), AX // selector above 2: newest offset minus one
  497. sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
  498. TESTQ AX, AX
  499. JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
  500. MOVQ $0x00000001, AX // a zero candidate is replaced by 1
  501. sequenceDecs_decode_56_amd64_adjust_temp_valid:
  502. CMPQ CX, $0x01
  503. CMOVQNE R12, R13 // unless the selector is 1, the second offset moves to third place
  504. MOVQ R11, R12
  505. MOVQ AX, R11 // the chosen offset becomes the newest
  506. MOVQ AX, CX
  507. sequenceDecs_decode_56_amd64_after_adjust:
  508. MOVQ CX, 16(R10) // record.offset = final offset
  509. // Check values
  510. MOVQ 8(R10), AX // AX = match length
  511. MOVQ (R10), R14 // R14 = literal length
  512. LEAQ (AX)(R14*1), R15
  513. MOVQ s+0(FP), BP
  514. ADDQ R15, 256(BP) // accumulate literal length + match length into 256(s)
  515. MOVQ ctx+16(FP), R15
  516. SUBQ R14, 128(R15) // charge the literal length against the budget at 128(ctx)
  517. JS error_not_enough_literals
  518. CMPQ AX, $0x00020002
  519. JA sequenceDecs_decode_56_amd64_error_match_len_too_big
  520. TESTQ CX, CX
  521. JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
  522. TESTQ AX, AX
  523. JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // offset 0 is only accepted with match length 0
  524. sequenceDecs_decode_56_amd64_match_len_ofs_ok:
  525. ADDQ $0x18, R10 // advance to the next 24-byte record
  526. MOVQ ctx+16(FP), AX
  527. DECQ 96(AX)
  528. JNS sequenceDecs_decode_56_amd64_main_loop // loop while the counter stays non-negative
  529. MOVQ s+0(FP), AX
  530. MOVQ R11, 144(AX) // write the offset history back to s
  531. MOVQ R12, 152(AX)
  532. MOVQ R13, 160(AX)
  533. MOVQ br+8(FP), AX
  534. MOVQ DX, 24(AX) // write the bit reader state back to br
  535. MOVB BL, 32(AX)
  536. MOVQ SI, 8(AX)
  537. // Return success
  538. MOVQ $0x00000000, ret+24(FP)
  539. RET
  540. // Return with match length error
  541. sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
  542. MOVQ $0x00000001, ret+24(FP)
  543. RET
  544. // Return with match too long error
  545. sequenceDecs_decode_56_amd64_error_match_len_too_big:
  546. MOVQ $0x00000002, ret+24(FP)
  547. RET
  548. // Return with match offset too long error
  549. MOVQ $0x00000003, ret+24(FP) // no label precedes this return; nothing in this function jumps here
  550. RET
  551. // Return with not enough literals error
  552. error_not_enough_literals:
  553. MOVQ $0x00000004, ret+24(FP)
  554. RET
  555. // Return with overread error
  556. error_overread:
  557. MOVQ $0x00000006, ret+24(FP)
  558. RET
  559. // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  560. // Requires: BMI, BMI2, CMOV
  //
  // Decodes sequences in a loop using BEXTR/BZHI/SHRX for the bit extraction.
  // Each iteration writes one 24-byte record through the pointer loaded from
  // 104(ctx): literal length at +0, match length at +8, offset at +16. The loop
  // repeats until the counter at 96(ctx) goes negative.
  //
  // Register roles, as used below:
  //   AX            bit buffer (loaded from and stored back to 24(br))
  //   DX            number of bits already consumed from AX (byte at 32(br))
  //   BX            value of 8(br); decremented for every input byte fetched
  //   (SP) / R13    input read pointer, initialised to (br) + 8(br); it moves downwards
  //   SI, DI, R8    literal-length, match-length and offset state entries (72/80/88(ctx))
  //   R9            output record pointer
  //   R10, R11, R12 three most recent offsets (144/152/160(s)), newest in R10
  //
  // State entry layout, as read by this code: bits 0-7 = bit count for the state
  // update, bits 8-15 = bit count for the value, bits 16-31 = base of the next
  // state index, bits 32-63 = value baseline.
  //
  // Return values: 0 success; 1 zero offset with non-zero match length;
  // 2 match length above 0x20002; 4 the literal budget at 128(ctx) went negative;
  // 6 bit reader overread.
  561. TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
  562. MOVQ br+8(FP), BX
  563. MOVQ 24(BX), AX // AX = bit buffer
  564. MOVBQZX 32(BX), DX // DX = bits already consumed from AX
  565. MOVQ (BX), CX
  566. MOVQ 8(BX), BX
  567. ADDQ BX, CX
  568. MOVQ CX, (SP) // read pointer = (br) + 8(br)
  569. MOVQ ctx+16(FP), CX
  570. MOVQ 72(CX), SI // literal-length state entry
  571. MOVQ 80(CX), DI // match-length state entry
  572. MOVQ 88(CX), R8 // offset state entry
  573. MOVQ 104(CX), R9 // output record pointer
  574. MOVQ s+0(FP), CX
  575. MOVQ 144(CX), R10 // offset history, newest first
  576. MOVQ 152(CX), R11
  577. MOVQ 160(CX), R12
  578. sequenceDecs_decode_bmi2_main_loop:
  579. MOVQ (SP), R13 // R13 = read pointer for this iteration
  580. // Fill bitreader to have enough for the offset and match length.
  581. CMPQ BX, $0x08
  582. JL sequenceDecs_decode_bmi2_fill_byte_by_byte // BX below 8: refill one byte at a time
  583. MOVQ DX, CX
  584. SHRQ $0x03, CX // whole bytes consumed so far
  585. SUBQ CX, R13 // step the read pointer back by that many bytes
  586. MOVQ (R13), AX // reload a full 8-byte window
  587. SUBQ CX, BX
  588. ANDQ $0x07, DX // keep only the sub-byte bit position
  589. JMP sequenceDecs_decode_bmi2_fill_end
  590. sequenceDecs_decode_bmi2_fill_byte_by_byte:
  591. CMPQ BX, $0x00
  592. JLE sequenceDecs_decode_bmi2_fill_check_overread // no input left to fetch
  593. CMPQ DX, $0x07
  594. JLE sequenceDecs_decode_bmi2_fill_end // less than one whole byte consumed: done
  595. SHLQ $0x08, AX
  596. SUBQ $0x01, R13
  597. SUBQ $0x01, BX
  598. SUBQ $0x08, DX
  599. MOVBQZX (R13), CX
  600. ORQ CX, AX
  601. JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
  602. sequenceDecs_decode_bmi2_fill_check_overread:
  603. CMPQ DX, $0x40
  604. JA error_overread // more than 64 bits consumed with nothing left to refill
  605. sequenceDecs_decode_bmi2_fill_end:
  606. // Update offset
  607. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  608. BEXTRQ CX, R8, R14 // R14 = value bit count (bits 8-15 of the entry)
  609. MOVQ AX, R15
  610. LEAQ (DX)(R14*1), CX
  611. ROLQ CL, R15 // rotate by consumed + n so the n new bits sit at the low end
  612. BZHIQ R14, R15, R15 // keep only the low R14 bits
  613. MOVQ CX, DX // DX += R14 (bits consumed)
  614. MOVQ R8, CX
  615. SHRQ $0x20, CX // baseline (upper 32 bits of the entry)
  616. ADDQ R15, CX
  617. MOVQ CX, 16(R9) // record.offset (raw, adjusted further below)
  618. // Update match length
  619. MOVQ $0x00000808, CX // same steps as the offset read, using the match-length entry
  620. BEXTRQ CX, DI, R14
  621. MOVQ AX, R15
  622. LEAQ (DX)(R14*1), CX
  623. ROLQ CL, R15
  624. BZHIQ R14, R15, R15
  625. MOVQ CX, DX
  626. MOVQ DI, CX
  627. SHRQ $0x20, CX
  628. ADDQ R15, CX
  629. MOVQ CX, 8(R9) // record.matchLength
  630. // Fill bitreader to have enough for the remaining
  631. CMPQ BX, $0x08
  632. JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  633. MOVQ DX, CX // same refill procedure as at the top of the loop
  634. SHRQ $0x03, CX
  635. SUBQ CX, R13
  636. MOVQ (R13), AX
  637. SUBQ CX, BX
  638. ANDQ $0x07, DX
  639. JMP sequenceDecs_decode_bmi2_fill_2_end
  640. sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
  641. CMPQ BX, $0x00
  642. JLE sequenceDecs_decode_bmi2_fill_2_check_overread
  643. CMPQ DX, $0x07
  644. JLE sequenceDecs_decode_bmi2_fill_2_end
  645. SHLQ $0x08, AX
  646. SUBQ $0x01, R13
  647. SUBQ $0x01, BX
  648. SUBQ $0x08, DX
  649. MOVBQZX (R13), CX
  650. ORQ CX, AX
  651. JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  652. sequenceDecs_decode_bmi2_fill_2_check_overread:
  653. CMPQ DX, $0x40
  654. JA error_overread
  655. sequenceDecs_decode_bmi2_fill_2_end:
  656. // Update literal length
  657. MOVQ $0x00000808, CX // same steps as the offset read, using the literal-length entry
  658. BEXTRQ CX, SI, R14
  659. MOVQ AX, R15
  660. LEAQ (DX)(R14*1), CX
  661. ROLQ CL, R15
  662. BZHIQ R14, R15, R15
  663. MOVQ CX, DX
  664. MOVQ SI, CX
  665. SHRQ $0x20, CX
  666. ADDQ R15, CX
  667. MOVQ CX, (R9) // record.literalLength
  668. // Fill bitreader for state updates
  669. MOVQ R13, (SP) // save the read pointer; R13 is reused below
  670. MOVQ $0x00000808, CX
  671. BEXTRQ CX, R8, R13 // R13 = value bit count of the current offset entry, kept for "Adjust offset"
  672. MOVQ ctx+16(FP), CX
  673. CMPQ 96(CX), $0x00
  674. JZ sequenceDecs_decode_bmi2_skip_update // counter is zero: no state update on this pass
  675. LEAQ (SI)(DI*1), R14
  676. ADDQ R8, R14
  677. MOVBQZX R14, R14 // R14 = low byte of the summed entries, i.e. the three state bit counts added together
  678. LEAQ (DX)(R14*1), CX
  679. MOVQ AX, R15
  680. MOVQ CX, DX // consume all state bits at once
  681. ROLQ CL, R15
  682. BZHIQ R14, R15, R15 // R15 = the combined state bits, split up below
  683. // Update Offset State
  684. BZHIQ R8, R15, CX // CX = low bits of R15; the count is the low byte of the entry
  685. SHRXQ R8, R15, R15 // drop those bits for the next state
  686. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  687. BEXTRQ R14, R8, R8 // next-state base (bits 16-31 of the entry)
  688. ADDQ CX, R8 // next state index
  689. // Load ctx.ofTable
  690. MOVQ ctx+16(FP), CX
  691. MOVQ 48(CX), CX
  692. MOVQ (CX)(R8*8), R8 // fetch the 8-byte entry for the new state
  693. // Update Match Length State
  694. BZHIQ DI, R15, CX // same steps as the offset state update
  695. SHRXQ DI, R15, R15
  696. MOVQ $0x00001010, R14
  697. BEXTRQ R14, DI, DI
  698. ADDQ CX, DI
  699. // Load ctx.mlTable
  700. MOVQ ctx+16(FP), CX
  701. MOVQ 24(CX), CX
  702. MOVQ (CX)(DI*8), DI
  703. // Update Literal Length State
  704. BZHIQ SI, R15, CX // last consumer: no shift of R15 needed afterwards
  705. MOVQ $0x00001010, R14
  706. BEXTRQ R14, SI, SI
  707. ADDQ CX, SI
  708. // Load ctx.llTable
  709. MOVQ ctx+16(FP), CX
  710. MOVQ (CX), CX
  711. MOVQ (CX)(SI*8), SI
  712. sequenceDecs_decode_bmi2_skip_update:
  713. // Adjust offset
  714. MOVQ 16(R9), CX // CX = raw offset value stored above
  715. CMPQ R13, $0x01
  716. JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // bit count 0 or 1: go through the recent-offset logic
  717. MOVQ R11, R12 // otherwise CX is used as-is and pushed onto the history
  718. MOVQ R10, R11
  719. MOVQ CX, R10
  720. JMP sequenceDecs_decode_bmi2_after_adjust
  721. sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
  722. CMPQ (R9), $0x00000000
  723. JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
  724. INCQ CX // literal length is zero: the selector is shifted by one
  725. JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
  726. sequenceDecs_decode_bmi2_adjust_offset_maybezero:
  727. TESTQ CX, CX
  728. JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
  729. MOVQ R10, CX // selector 0: reuse the newest offset, history unchanged
  730. JMP sequenceDecs_decode_bmi2_after_adjust
  731. sequenceDecs_decode_bmi2_adjust_offset_nonzero:
  732. CMPQ CX, $0x01
  733. JB sequenceDecs_decode_bmi2_adjust_zero
  734. JEQ sequenceDecs_decode_bmi2_adjust_one
  735. CMPQ CX, $0x02
  736. JA sequenceDecs_decode_bmi2_adjust_three
  737. JMP sequenceDecs_decode_bmi2_adjust_two
  738. sequenceDecs_decode_bmi2_adjust_zero:
  739. MOVQ R10, R13
  740. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  741. sequenceDecs_decode_bmi2_adjust_one:
  742. MOVQ R11, R13 // selector 1: second most recent offset
  743. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  744. sequenceDecs_decode_bmi2_adjust_two:
  745. MOVQ R12, R13 // selector 2: third most recent offset
  746. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  747. sequenceDecs_decode_bmi2_adjust_three:
  748. LEAQ -1(R10), R13 // selector above 2: newest offset minus one
  749. sequenceDecs_decode_bmi2_adjust_test_temp_valid:
  750. TESTQ R13, R13
  751. JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
  752. MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
  753. sequenceDecs_decode_bmi2_adjust_temp_valid:
  754. CMPQ CX, $0x01
  755. CMOVQNE R11, R12 // unless the selector is 1, the second offset moves to third place
  756. MOVQ R10, R11
  757. MOVQ R13, R10 // the chosen offset becomes the newest
  758. MOVQ R13, CX
  759. sequenceDecs_decode_bmi2_after_adjust:
  760. MOVQ CX, 16(R9) // record.offset = final offset
  761. // Check values
  762. MOVQ 8(R9), R13 // R13 = match length
  763. MOVQ (R9), R14 // R14 = literal length
  764. LEAQ (R13)(R14*1), R15
  765. MOVQ s+0(FP), BP
  766. ADDQ R15, 256(BP) // accumulate literal length + match length into 256(s)
  767. MOVQ ctx+16(FP), R15
  768. SUBQ R14, 128(R15) // charge the literal length against the budget at 128(ctx)
  769. JS error_not_enough_literals
  770. CMPQ R13, $0x00020002
  771. JA sequenceDecs_decode_bmi2_error_match_len_too_big
  772. TESTQ CX, CX
  773. JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
  774. TESTQ R13, R13
  775. JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // offset 0 is only accepted with match length 0
  776. sequenceDecs_decode_bmi2_match_len_ofs_ok:
  777. ADDQ $0x18, R9 // advance to the next 24-byte record
  778. MOVQ ctx+16(FP), CX
  779. DECQ 96(CX)
  780. JNS sequenceDecs_decode_bmi2_main_loop // loop while the counter stays non-negative
  781. MOVQ s+0(FP), CX
  782. MOVQ R10, 144(CX) // write the offset history back to s
  783. MOVQ R11, 152(CX)
  784. MOVQ R12, 160(CX)
  785. MOVQ br+8(FP), CX
  786. MOVQ AX, 24(CX) // write the bit reader state back to br
  787. MOVB DL, 32(CX)
  788. MOVQ BX, 8(CX)
  789. // Return success
  790. MOVQ $0x00000000, ret+24(FP)
  791. RET
  792. // Return with match length error
  793. sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
  794. MOVQ $0x00000001, ret+24(FP)
  795. RET
  796. // Return with match too long error
  797. sequenceDecs_decode_bmi2_error_match_len_too_big:
  798. MOVQ $0x00000002, ret+24(FP)
  799. RET
  800. // Return with match offset too long error
  801. MOVQ $0x00000003, ret+24(FP) // no label precedes this return; nothing in this function jumps here
  802. RET
  803. // Return with not enough literals error
  804. error_not_enough_literals:
  805. MOVQ $0x00000004, ret+24(FP)
  806. RET
  807. // Return with overread error
  808. error_overread:
  809. MOVQ $0x00000006, ret+24(FP)
  810. RET
  811. // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  812. // Requires: BMI, BMI2, CMOV
  //
  // Decodes sequences in a loop using BEXTR/BZHI/SHRX for the bit extraction,
  // writing one 24-byte record per iteration through the pointer loaded from
  // 104(ctx) (literal length +0, match length +8, offset +16). The bit buffer is
  // refilled once per iteration, before all three values are read.
  // NOTE(review): the "_56" suffix presumably means callers guarantee the three
  // values need at most 56 bits together - confirm against the Go caller.
  //
  // Register roles, as used below:
  //   AX            bit buffer (24(br));  DX  bits consumed (byte at 32(br))
  //   BX            value of 8(br), decremented per input byte fetched
  //   (SP) / R13    input read pointer, initialised to (br) + 8(br); moves downwards
  //   SI, DI, R8    literal-length, match-length and offset state entries
  //   R9            output record pointer
  //   R10, R11, R12 three most recent offsets, newest in R10
  //
  // State entry layout, as read by this code: bits 0-7 state bit count,
  // bits 8-15 value bit count, bits 16-31 next-state base, bits 32-63 baseline.
  //
  // Return values: 0 success; 1 zero offset with non-zero match length;
  // 2 match length above 0x20002; 4 literal budget at 128(ctx) went negative;
  // 6 bit reader overread.
  813. TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
  814. MOVQ br+8(FP), BX
  815. MOVQ 24(BX), AX // AX = bit buffer
  816. MOVBQZX 32(BX), DX // DX = bits already consumed from AX
  817. MOVQ (BX), CX
  818. MOVQ 8(BX), BX
  819. ADDQ BX, CX
  820. MOVQ CX, (SP) // read pointer = (br) + 8(br)
  821. MOVQ ctx+16(FP), CX
  822. MOVQ 72(CX), SI // literal-length state entry
  823. MOVQ 80(CX), DI // match-length state entry
  824. MOVQ 88(CX), R8 // offset state entry
  825. MOVQ 104(CX), R9 // output record pointer
  826. MOVQ s+0(FP), CX
  827. MOVQ 144(CX), R10 // offset history, newest first
  828. MOVQ 152(CX), R11
  829. MOVQ 160(CX), R12
  830. sequenceDecs_decode_56_bmi2_main_loop:
  831. MOVQ (SP), R13 // R13 = read pointer for this iteration
  832. // Fill bitreader to have enough for the offset and match length.
  833. CMPQ BX, $0x08
  834. JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte // BX below 8: refill one byte at a time
  835. MOVQ DX, CX
  836. SHRQ $0x03, CX // whole bytes consumed so far
  837. SUBQ CX, R13 // step the read pointer back by that many bytes
  838. MOVQ (R13), AX // reload a full 8-byte window
  839. SUBQ CX, BX
  840. ANDQ $0x07, DX // keep only the sub-byte bit position
  841. JMP sequenceDecs_decode_56_bmi2_fill_end
  842. sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
  843. CMPQ BX, $0x00
  844. JLE sequenceDecs_decode_56_bmi2_fill_check_overread // no input left to fetch
  845. CMPQ DX, $0x07
  846. JLE sequenceDecs_decode_56_bmi2_fill_end // less than one whole byte consumed: done
  847. SHLQ $0x08, AX
  848. SUBQ $0x01, R13
  849. SUBQ $0x01, BX
  850. SUBQ $0x08, DX
  851. MOVBQZX (R13), CX
  852. ORQ CX, AX
  853. JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  854. sequenceDecs_decode_56_bmi2_fill_check_overread:
  855. CMPQ DX, $0x40
  856. JA error_overread // more than 64 bits consumed with nothing left to refill
  857. sequenceDecs_decode_56_bmi2_fill_end:
  858. // Update offset
  859. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  860. BEXTRQ CX, R8, R14 // R14 = value bit count (bits 8-15 of the entry)
  861. MOVQ AX, R15
  862. LEAQ (DX)(R14*1), CX
  863. ROLQ CL, R15 // rotate by consumed + n so the n new bits sit at the low end
  864. BZHIQ R14, R15, R15 // keep only the low R14 bits
  865. MOVQ CX, DX // DX += R14 (bits consumed)
  866. MOVQ R8, CX
  867. SHRQ $0x20, CX // baseline (upper 32 bits of the entry)
  868. ADDQ R15, CX
  869. MOVQ CX, 16(R9) // record.offset (raw, adjusted further below)
  870. // Update match length
  871. MOVQ $0x00000808, CX // same steps as the offset read, using the match-length entry
  872. BEXTRQ CX, DI, R14
  873. MOVQ AX, R15
  874. LEAQ (DX)(R14*1), CX
  875. ROLQ CL, R15
  876. BZHIQ R14, R15, R15
  877. MOVQ CX, DX
  878. MOVQ DI, CX
  879. SHRQ $0x20, CX
  880. ADDQ R15, CX
  881. MOVQ CX, 8(R9) // record.matchLength
  882. // Update literal length
  883. MOVQ $0x00000808, CX // same steps again, using the literal-length entry; no refill in between
  884. BEXTRQ CX, SI, R14
  885. MOVQ AX, R15
  886. LEAQ (DX)(R14*1), CX
  887. ROLQ CL, R15
  888. BZHIQ R14, R15, R15
  889. MOVQ CX, DX
  890. MOVQ SI, CX
  891. SHRQ $0x20, CX
  892. ADDQ R15, CX
  893. MOVQ CX, (R9) // record.literalLength
  894. // Fill bitreader for state updates
  895. MOVQ R13, (SP) // save the read pointer; R13 is reused below
  896. MOVQ $0x00000808, CX
  897. BEXTRQ CX, R8, R13 // R13 = value bit count of the current offset entry, kept for "Adjust offset"
  898. MOVQ ctx+16(FP), CX
  899. CMPQ 96(CX), $0x00
  900. JZ sequenceDecs_decode_56_bmi2_skip_update // counter is zero: no state update on this pass
  901. LEAQ (SI)(DI*1), R14
  902. ADDQ R8, R14
  903. MOVBQZX R14, R14 // R14 = low byte of the summed entries, i.e. the three state bit counts added together
  904. LEAQ (DX)(R14*1), CX
  905. MOVQ AX, R15
  906. MOVQ CX, DX // consume all state bits at once
  907. ROLQ CL, R15
  908. BZHIQ R14, R15, R15 // R15 = the combined state bits, split up below
  909. // Update Offset State
  910. BZHIQ R8, R15, CX // CX = low bits of R15; the count is the low byte of the entry
  911. SHRXQ R8, R15, R15 // drop those bits for the next state
  912. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  913. BEXTRQ R14, R8, R8 // next-state base (bits 16-31 of the entry)
  914. ADDQ CX, R8 // next state index
  915. // Load ctx.ofTable
  916. MOVQ ctx+16(FP), CX
  917. MOVQ 48(CX), CX
  918. MOVQ (CX)(R8*8), R8 // fetch the 8-byte entry for the new state
  919. // Update Match Length State
  920. BZHIQ DI, R15, CX // same steps as the offset state update
  921. SHRXQ DI, R15, R15
  922. MOVQ $0x00001010, R14
  923. BEXTRQ R14, DI, DI
  924. ADDQ CX, DI
  925. // Load ctx.mlTable
  926. MOVQ ctx+16(FP), CX
  927. MOVQ 24(CX), CX
  928. MOVQ (CX)(DI*8), DI
  929. // Update Literal Length State
  930. BZHIQ SI, R15, CX // last consumer: no shift of R15 needed afterwards
  931. MOVQ $0x00001010, R14
  932. BEXTRQ R14, SI, SI
  933. ADDQ CX, SI
  934. // Load ctx.llTable
  935. MOVQ ctx+16(FP), CX
  936. MOVQ (CX), CX
  937. MOVQ (CX)(SI*8), SI
  938. sequenceDecs_decode_56_bmi2_skip_update:
  939. // Adjust offset
  940. MOVQ 16(R9), CX // CX = raw offset value stored above
  941. CMPQ R13, $0x01
  942. JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // bit count 0 or 1: go through the recent-offset logic
  943. MOVQ R11, R12 // otherwise CX is used as-is and pushed onto the history
  944. MOVQ R10, R11
  945. MOVQ CX, R10
  946. JMP sequenceDecs_decode_56_bmi2_after_adjust
  947. sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  948. CMPQ (R9), $0x00000000
  949. JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  950. INCQ CX // literal length is zero: the selector is shifted by one
  951. JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  952. sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  953. TESTQ CX, CX
  954. JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  955. MOVQ R10, CX // selector 0: reuse the newest offset, history unchanged
  956. JMP sequenceDecs_decode_56_bmi2_after_adjust
  957. sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  958. CMPQ CX, $0x01
  959. JB sequenceDecs_decode_56_bmi2_adjust_zero
  960. JEQ sequenceDecs_decode_56_bmi2_adjust_one
  961. CMPQ CX, $0x02
  962. JA sequenceDecs_decode_56_bmi2_adjust_three
  963. JMP sequenceDecs_decode_56_bmi2_adjust_two
  964. sequenceDecs_decode_56_bmi2_adjust_zero:
  965. MOVQ R10, R13
  966. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  967. sequenceDecs_decode_56_bmi2_adjust_one:
  968. MOVQ R11, R13 // selector 1: second most recent offset
  969. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  970. sequenceDecs_decode_56_bmi2_adjust_two:
  971. MOVQ R12, R13 // selector 2: third most recent offset
  972. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  973. sequenceDecs_decode_56_bmi2_adjust_three:
  974. LEAQ -1(R10), R13 // selector above 2: newest offset minus one
  975. sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  976. TESTQ R13, R13
  977. JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
  978. MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
  979. sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  980. CMPQ CX, $0x01
  981. CMOVQNE R11, R12 // unless the selector is 1, the second offset moves to third place
  982. MOVQ R10, R11
  983. MOVQ R13, R10 // the chosen offset becomes the newest
  984. MOVQ R13, CX
  985. sequenceDecs_decode_56_bmi2_after_adjust:
  986. MOVQ CX, 16(R9) // record.offset = final offset
  987. // Check values
  988. MOVQ 8(R9), R13 // R13 = match length
  989. MOVQ (R9), R14 // R14 = literal length
  990. LEAQ (R13)(R14*1), R15
  991. MOVQ s+0(FP), BP
  992. ADDQ R15, 256(BP) // accumulate literal length + match length into 256(s)
  993. MOVQ ctx+16(FP), R15
  994. SUBQ R14, 128(R15) // charge the literal length against the budget at 128(ctx)
  995. JS error_not_enough_literals
  996. CMPQ R13, $0x00020002
  997. JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
  998. TESTQ CX, CX
  999. JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  1000. TESTQ R13, R13
  1001. JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // offset 0 is only accepted with match length 0
  1002. sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  1003. ADDQ $0x18, R9 // advance to the next 24-byte record
  1004. MOVQ ctx+16(FP), CX
  1005. DECQ 96(CX)
  1006. JNS sequenceDecs_decode_56_bmi2_main_loop // loop while the counter stays non-negative
  1007. MOVQ s+0(FP), CX
  1008. MOVQ R10, 144(CX) // write the offset history back to s
  1009. MOVQ R11, 152(CX)
  1010. MOVQ R12, 160(CX)
  1011. MOVQ br+8(FP), CX
  1012. MOVQ AX, 24(CX) // write the bit reader state back to br
  1013. MOVB DL, 32(CX)
  1014. MOVQ BX, 8(CX)
  1015. // Return success
  1016. MOVQ $0x00000000, ret+24(FP)
  1017. RET
  1018. // Return with match length error
  1019. sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1020. MOVQ $0x00000001, ret+24(FP)
  1021. RET
  1022. // Return with match too long error
  1023. sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1024. MOVQ $0x00000002, ret+24(FP)
  1025. RET
  1026. // Return with match offset too long error
  1027. MOVQ $0x00000003, ret+24(FP) // no label precedes this return; nothing in this function jumps here
  1028. RET
  1029. // Return with not enough literals error
  1030. error_not_enough_literals:
  1031. MOVQ $0x00000004, ret+24(FP)
  1032. RET
  1033. // Return with overread error
  1034. error_overread:
  1035. MOVQ $0x00000006, ret+24(FP)
  1036. RET
  1037. // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1038. // Requires: SSE
        //
        // Executes the sequences seqs[seqIndex:] one by one: for each sequence it
        // copies ll literal bytes to the output and then copies ml match bytes
        // taken from mo bytes back (from the history buffer, from the output
        // written so far, or from both). Returns true on success (also when there
        // are no sequences) and false when a match offset fails validation.
        //
        // This variant copies literals and non-overlapping matches in whole
        // 16-byte chunks, so it can read and write up to 15 bytes past the
        // requested length.
        // NOTE(review): callers presumably guarantee that much slack in the
        // buffers (there is a separate _safe variant) - confirm in the Go caller.
        //
        // ctx fields as used below (offsets taken from the code; names are only
        // as certain as the existing comments make them):
        //     0(ctx)  base pointer of the sequence array (24 bytes per entry)
        //     8(ctx)  number of sequences (loop bound)
        //    24(ctx)  seqIndex (read on entry, written back on exit)
        //    32(ctx)  output base pointer
        //    56(ctx)  history base pointer (presumably; only base+len is used)
        //    64(ctx)  len(hist)
        //    80(ctx)  literals base pointer
        //   104(ctx)  outPosition (read on entry, written back on exit)
        //   112(ctx)  number of literal bytes consumed (written on exit)
        //   120(ctx)  windowSize
        //
        // Register roles inside the loop:
        //   AX = current sequence pointer   DX = sequence index   CX = count
        //   BX = output write pointer       SI = literal read pointer
        //   DI = output position (t)        R8 = windowSize
        //   R9 = hist base + len(hist)      R10 = len(hist)
        //   R11 = ll, R12 = mo, R13 = ml (R11/R12 are reused as scratch later)
  1039. TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1040. MOVQ ctx+0(FP), R10
  1041. MOVQ 8(R10), CX
  1042. TESTQ CX, CX
        // No sequences: report success without touching ctx.
  1043. JZ empty_seqs
  1044. MOVQ (R10), AX
  1045. MOVQ 24(R10), DX
  1046. MOVQ 32(R10), BX
  1047. MOVQ 80(R10), SI
  1048. MOVQ 104(R10), DI
  1049. MOVQ 120(R10), R8
  1050. MOVQ 56(R10), R9
        // R10 is overwritten here: from now on it holds len(hist), not ctx.
  1051. MOVQ 64(R10), R10
        // R9 = history base + len(hist), i.e. one past the end of the history.
  1052. ADDQ R10, R9
  1053. // seqsBase += 24 * seqIndex
  1054. LEAQ (DX)(DX*2), R11
  1055. SHLQ $0x03, R11
  1056. ADDQ R11, AX
  1057. // outBase += outPosition
  1058. ADDQ DI, BX
  1059. main_loop:
        // Load the current sequence: ll at +0, ml at +8, mo at +16.
  1060. MOVQ (AX), R11
  1061. MOVQ 16(AX), R12
  1062. MOVQ 8(AX), R13
  1063. // Copy literals
  1064. TESTQ R11, R11
  1065. JZ check_offset
  1066. XORQ R14, R14
  1067. copy_1:
        // 16 bytes per iteration until at least ll bytes have been copied. The
        // last chunk may run past ll; the pointers below advance by exactly ll.
  1068. MOVUPS (SI)(R14*1), X0
  1069. MOVUPS X0, (BX)(R14*1)
  1070. ADDQ $0x10, R14
  1071. CMPQ R14, R11
  1072. JB copy_1
  1073. ADDQ R11, SI
  1074. ADDQ R11, BX
  1075. ADDQ R11, DI
  1076. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1077. check_offset:
        // R11 = t + len(hist): the furthest distance a match may reach back.
  1078. LEAQ (DI)(R10*1), R11
  1079. CMPQ R12, R11
  1080. JG error_match_off_too_big
  1081. CMPQ R12, R8
  1082. JG error_match_off_too_big
  1083. // Copy match from history
        // R11 = mo - t. JLS branches when mo <= t (unsigned), i.e. the whole
        // match lies in the output written so far. Otherwise R11 is the number
        // of bytes the match reaches back into the history.
  1084. MOVQ R12, R11
  1085. SUBQ DI, R11
  1086. JLS copy_match
        // R14 = end of history - R11: source pointer inside the history.
  1087. MOVQ R9, R14
  1088. SUBQ R11, R14
        // ml exceeds what the history provides: take all R11 bytes from the
        // history and finish from the current output.
  1089. CMPQ R13, R11
  1090. JG copy_all_from_history
        // The entire match comes from the history: exact copy of ml bytes.
        // R11 = ml - 16; a borrow means ml < 16.
  1091. MOVQ R13, R11
  1092. SUBQ $0x10, R11
  1093. JB copy_4_small
  1094. copy_4_loop:
  1095. MOVUPS (R14), X0
  1096. MOVUPS X0, (BX)
  1097. ADDQ $0x10, R14
  1098. ADDQ $0x10, BX
  1099. SUBQ $0x10, R11
  1100. JAE copy_4_loop
        // R11 is now in -16..-1. Move both pointers to the exact end of the
        // copy (start + ml) and copy the final 16 bytes ending there; this
        // overlaps bytes already copied but never writes past the end.
  1101. LEAQ 16(R14)(R11*1), R14
  1102. LEAQ 16(BX)(R11*1), BX
  1103. MOVUPS -16(R14), X0
  1104. MOVUPS X0, -16(BX)
  1105. JMP copy_4_end
  1106. copy_4_small:
        // ml < 16.
        // NOTE(review): there is no 1-or-2 byte case here (unlike copy_5
        // below); a value below 3 would take the 4..7 path. Presumably ml >= 3
        // is guaranteed upstream - confirm.
  1107. CMPQ R13, $0x03
  1108. JE copy_4_move_3
  1109. CMPQ R13, $0x08
  1110. JB copy_4_move_4through7
  1111. JMP copy_4_move_8through16
  1112. copy_4_move_3:
        // Exactly 3 bytes: one 2-byte move plus one 1-byte move.
  1113. MOVW (R14), R11
  1114. MOVB 2(R14), R12
  1115. MOVW R11, (BX)
  1116. MOVB R12, 2(BX)
  1117. ADDQ R13, R14
  1118. ADDQ R13, BX
  1119. JMP copy_4_end
  1120. copy_4_move_4through7:
        // First 4 and last 4 bytes; the two moves overlap when ml < 8.
  1121. MOVL (R14), R11
  1122. MOVL -4(R14)(R13*1), R12
  1123. MOVL R11, (BX)
  1124. MOVL R12, -4(BX)(R13*1)
  1125. ADDQ R13, R14
  1126. ADDQ R13, BX
  1127. JMP copy_4_end
  1128. copy_4_move_8through16:
        // First 8 and last 8 bytes; the two moves overlap when ml < 16.
  1129. MOVQ (R14), R11
  1130. MOVQ -8(R14)(R13*1), R12
  1131. MOVQ R11, (BX)
  1132. MOVQ R12, -8(BX)(R13*1)
  1133. ADDQ R13, R14
  1134. ADDQ R13, BX
  1135. copy_4_end:
        // t += ml, then step to the next 24-byte sequence entry.
  1136. ADDQ R13, DI
  1137. ADDQ $0x18, AX
  1138. INCQ DX
  1139. CMPQ DX, CX
  1140. JB main_loop
  1141. JMP loop_finished
  1142. copy_all_from_history:
        // Exact copy of the R11 bytes that lie in the history (same scheme as
        // copy_4, plus a 1-or-2 byte case); the rest of the match is copied
        // from the current output at copy_match.
  1143. MOVQ R11, R15
  1144. SUBQ $0x10, R15
  1145. JB copy_5_small
  1146. copy_5_loop:
  1147. MOVUPS (R14), X0
  1148. MOVUPS X0, (BX)
  1149. ADDQ $0x10, R14
  1150. ADDQ $0x10, BX
  1151. SUBQ $0x10, R15
  1152. JAE copy_5_loop
        // Tail: final 16 bytes ending exactly at start + R11.
  1153. LEAQ 16(R14)(R15*1), R14
  1154. LEAQ 16(BX)(R15*1), BX
  1155. MOVUPS -16(R14), X0
  1156. MOVUPS X0, -16(BX)
  1157. JMP copy_5_end
  1158. copy_5_small:
        // The flags of the first CMPQ serve both JE (== 3) and JB (< 3).
  1159. CMPQ R11, $0x03
  1160. JE copy_5_move_3
  1161. JB copy_5_move_1or2
  1162. CMPQ R11, $0x08
  1163. JB copy_5_move_4through7
  1164. JMP copy_5_move_8through16
  1165. copy_5_move_1or2:
        // First and last byte; they are the same byte when R11 == 1.
  1166. MOVB (R14), R15
  1167. MOVB -1(R14)(R11*1), BP
  1168. MOVB R15, (BX)
  1169. MOVB BP, -1(BX)(R11*1)
  1170. ADDQ R11, R14
  1171. ADDQ R11, BX
  1172. JMP copy_5_end
  1173. copy_5_move_3:
  1174. MOVW (R14), R15
  1175. MOVB 2(R14), BP
  1176. MOVW R15, (BX)
  1177. MOVB BP, 2(BX)
  1178. ADDQ R11, R14
  1179. ADDQ R11, BX
  1180. JMP copy_5_end
  1181. copy_5_move_4through7:
  1182. MOVL (R14), R15
  1183. MOVL -4(R14)(R11*1), BP
  1184. MOVL R15, (BX)
  1185. MOVL BP, -4(BX)(R11*1)
  1186. ADDQ R11, R14
  1187. ADDQ R11, BX
  1188. JMP copy_5_end
  1189. copy_5_move_8through16:
  1190. MOVQ (R14), R15
  1191. MOVQ -8(R14)(R11*1), BP
  1192. MOVQ R15, (BX)
  1193. MOVQ BP, -8(BX)(R11*1)
  1194. ADDQ R11, R14
  1195. ADDQ R11, BX
  1196. copy_5_end:
        // t += bytes taken from the history; ml -= the same amount.
  1197. ADDQ R11, DI
  1198. SUBQ R11, R13
  1199. // Copy match from the current buffer
  1200. copy_match:
        // R11 = write pointer - mo: source inside the output written so far.
  1201. MOVQ BX, R11
  1202. SUBQ R12, R11
  1203. // ml <= mo
  1204. CMPQ R13, R12
  1205. JA copy_overlapping_match
  1206. // Copy non-overlapping match
        // t and BX are advanced by ml up front; R12 (mo is no longer needed)
        // becomes the destination cursor for the chunk loop.
  1207. ADDQ R13, DI
  1208. MOVQ BX, R12
  1209. ADDQ R13, BX
  1210. copy_2:
        // 16 bytes per iteration while the remaining count stays above zero
        // (JHI), so up to 15 bytes past ml may be read and written.
  1211. MOVUPS (R11), X0
  1212. MOVUPS X0, (R12)
  1213. ADDQ $0x10, R11
  1214. ADDQ $0x10, R12
  1215. SUBQ $0x10, R13
  1216. JHI copy_2
  1217. JMP handle_loop
  1218. // Copy overlapping match
  1219. copy_overlapping_match:
        // Reached only when ml > mo (unsigned), so ml >= 1 and the DECQ/JNZ
        // loop below terminates. Source and destination ranges overlap, hence
        // the byte-by-byte forward copy.
  1220. ADDQ R13, DI
  1221. copy_slow_3:
  1222. MOVB (R11), R12
  1223. MOVB R12, (BX)
  1224. INCQ R11
  1225. INCQ BX
  1226. DECQ R13
  1227. JNZ copy_slow_3
  1228. handle_loop:
        // Step to the next 24-byte sequence entry; loop while index < count.
  1229. ADDQ $0x18, AX
  1230. INCQ DX
  1231. CMPQ DX, CX
  1232. JB main_loop
  1233. loop_finished:
  1234. // Return value
  1235. MOVB $0x01, ret+8(FP)
  1236. // Update the context
        // Write back the sequence index, the output position and the number of
        // literal bytes consumed (SI minus the literals base pointer).
  1237. MOVQ ctx+0(FP), AX
  1238. MOVQ DX, 24(AX)
  1239. MOVQ DI, 104(AX)
  1240. SUBQ 80(AX), SI
  1241. MOVQ SI, 112(AX)
  1242. RET
  1243. error_match_off_too_big:
        // The failing sequence's literals have already been copied; DX still
        // indexes that sequence when the context is written back.
  1244. // Return value
  1245. MOVB $0x00, ret+8(FP)
  1246. // Update the context
  1247. MOVQ ctx+0(FP), AX
  1248. MOVQ DX, 24(AX)
  1249. MOVQ DI, 104(AX)
  1250. SUBQ 80(AX), SI
  1251. MOVQ SI, 112(AX)
  1252. RET
  1253. empty_seqs:
  1254. // Return value
  1255. MOVB $0x01, ret+8(FP)
  1256. RET
  1257. // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1258. // Requires: SSE
        //
        // Executes the sequences seqs[seqIndex:] one by one: for each sequence it
        // copies ll literal bytes to the output and then copies ml match bytes
        // taken from mo bytes back (from the history buffer, from the output
        // written so far, or from both). Returns true on success (also when there
        // are no sequences) and false when a match offset fails validation.
        //
        // Every bulk copy in this variant (copy_1, copy_2, copy_4, copy_5) uses
        // the exact-length scheme: 16-byte chunks while at least 16 bytes remain,
        // then a final move that ends exactly at the last byte, or a size-specific
        // pair of moves for lengths below 16. No copy touches bytes past the
        // requested length.
        //
        // ctx fields as used below (offsets taken from the code; names are only
        // as certain as the existing comments make them):
        //     0(ctx)  base pointer of the sequence array (24 bytes per entry)
        //     8(ctx)  number of sequences (loop bound)
        //    24(ctx)  seqIndex (read on entry, written back on exit)
        //    32(ctx)  output base pointer
        //    56(ctx)  history base pointer (presumably; only base+len is used)
        //    64(ctx)  len(hist)
        //    80(ctx)  literals base pointer
        //   104(ctx)  outPosition (read on entry, written back on exit)
        //   112(ctx)  number of literal bytes consumed (written on exit)
        //   120(ctx)  windowSize
        //
        // Register roles inside the loop:
        //   AX = current sequence pointer   DX = sequence index   CX = count
        //   BX = output write pointer       SI = literal read pointer
        //   DI = output position (t)        R8 = windowSize
        //   R9 = hist base + len(hist)      R10 = len(hist)
        //   R11 = ll, R12 = mo, R13 = ml (R11/R12 are reused as scratch later)
  1259. TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1260. MOVQ ctx+0(FP), R10
  1261. MOVQ 8(R10), CX
  1262. TESTQ CX, CX
        // No sequences: report success without touching ctx.
  1263. JZ empty_seqs
  1264. MOVQ (R10), AX
  1265. MOVQ 24(R10), DX
  1266. MOVQ 32(R10), BX
  1267. MOVQ 80(R10), SI
  1268. MOVQ 104(R10), DI
  1269. MOVQ 120(R10), R8
  1270. MOVQ 56(R10), R9
        // R10 is overwritten here: from now on it holds len(hist), not ctx.
  1271. MOVQ 64(R10), R10
        // R9 = history base + len(hist), i.e. one past the end of the history.
  1272. ADDQ R10, R9
  1273. // seqsBase += 24 * seqIndex
  1274. LEAQ (DX)(DX*2), R11
  1275. SHLQ $0x03, R11
  1276. ADDQ R11, AX
  1277. // outBase += outPosition
  1278. ADDQ DI, BX
  1279. main_loop:
        // Load the current sequence: ll at +0, ml at +8, mo at +16.
  1280. MOVQ (AX), R11
  1281. MOVQ 16(AX), R12
  1282. MOVQ 8(AX), R13
  1283. // Copy literals
  1284. TESTQ R11, R11
  1285. JZ check_offset
        // Exact copy of ll bytes. R14 = ll - 16; a borrow means ll < 16.
  1286. MOVQ R11, R14
  1287. SUBQ $0x10, R14
  1288. JB copy_1_small
  1289. copy_1_loop:
  1290. MOVUPS (SI), X0
  1291. MOVUPS X0, (BX)
  1292. ADDQ $0x10, SI
  1293. ADDQ $0x10, BX
  1294. SUBQ $0x10, R14
  1295. JAE copy_1_loop
        // R14 is now in -16..-1. Move both pointers to the exact end of the
        // copy (start + ll) and copy the final 16 bytes ending there; this
        // overlaps bytes already copied but never writes past the end.
  1296. LEAQ 16(SI)(R14*1), SI
  1297. LEAQ 16(BX)(R14*1), BX
  1298. MOVUPS -16(SI), X0
  1299. MOVUPS X0, -16(BX)
  1300. JMP copy_1_end
  1301. copy_1_small:
        // ll < 16 (and non-zero, see the TESTQ above). The flags of the first
        // CMPQ serve both JE (== 3) and JB (< 3).
  1302. CMPQ R11, $0x03
  1303. JE copy_1_move_3
  1304. JB copy_1_move_1or2
  1305. CMPQ R11, $0x08
  1306. JB copy_1_move_4through7
  1307. JMP copy_1_move_8through16
  1308. copy_1_move_1or2:
        // First and last byte; they are the same byte when ll == 1.
  1309. MOVB (SI), R14
  1310. MOVB -1(SI)(R11*1), R15
  1311. MOVB R14, (BX)
  1312. MOVB R15, -1(BX)(R11*1)
  1313. ADDQ R11, SI
  1314. ADDQ R11, BX
  1315. JMP copy_1_end
  1316. copy_1_move_3:
        // Exactly 3 bytes: one 2-byte move plus one 1-byte move.
  1317. MOVW (SI), R14
  1318. MOVB 2(SI), R15
  1319. MOVW R14, (BX)
  1320. MOVB R15, 2(BX)
  1321. ADDQ R11, SI
  1322. ADDQ R11, BX
  1323. JMP copy_1_end
  1324. copy_1_move_4through7:
        // First 4 and last 4 bytes; the two moves overlap when ll < 8.
  1325. MOVL (SI), R14
  1326. MOVL -4(SI)(R11*1), R15
  1327. MOVL R14, (BX)
  1328. MOVL R15, -4(BX)(R11*1)
  1329. ADDQ R11, SI
  1330. ADDQ R11, BX
  1331. JMP copy_1_end
  1332. copy_1_move_8through16:
        // First 8 and last 8 bytes; the two moves overlap when ll < 16.
  1333. MOVQ (SI), R14
  1334. MOVQ -8(SI)(R11*1), R15
  1335. MOVQ R14, (BX)
  1336. MOVQ R15, -8(BX)(R11*1)
  1337. ADDQ R11, SI
  1338. ADDQ R11, BX
  1339. copy_1_end:
        // SI and BX were already advanced by the copy; only t is updated here.
  1340. ADDQ R11, DI
  1341. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1342. check_offset:
        // R11 = t + len(hist): the furthest distance a match may reach back.
  1343. LEAQ (DI)(R10*1), R11
  1344. CMPQ R12, R11
  1345. JG error_match_off_too_big
  1346. CMPQ R12, R8
  1347. JG error_match_off_too_big
  1348. // Copy match from history
        // R11 = mo - t. JLS branches when mo <= t (unsigned), i.e. the whole
        // match lies in the output written so far. Otherwise R11 is the number
        // of bytes the match reaches back into the history.
  1349. MOVQ R12, R11
  1350. SUBQ DI, R11
  1351. JLS copy_match
        // R14 = end of history - R11: source pointer inside the history.
  1352. MOVQ R9, R14
  1353. SUBQ R11, R14
        // ml exceeds what the history provides: take all R11 bytes from the
        // history and finish from the current output.
  1354. CMPQ R13, R11
  1355. JG copy_all_from_history
        // The entire match comes from the history: exact copy of ml bytes.
  1356. MOVQ R13, R11
  1357. SUBQ $0x10, R11
  1358. JB copy_4_small
  1359. copy_4_loop:
  1360. MOVUPS (R14), X0
  1361. MOVUPS X0, (BX)
  1362. ADDQ $0x10, R14
  1363. ADDQ $0x10, BX
  1364. SUBQ $0x10, R11
  1365. JAE copy_4_loop
        // Tail: final 16 bytes ending exactly at start + ml.
  1366. LEAQ 16(R14)(R11*1), R14
  1367. LEAQ 16(BX)(R11*1), BX
  1368. MOVUPS -16(R14), X0
  1369. MOVUPS X0, -16(BX)
  1370. JMP copy_4_end
  1371. copy_4_small:
        // ml < 16.
        // NOTE(review): there is no 1-or-2 byte case here (unlike copy_1,
        // copy_2 and copy_5); a value below 3 would take the 4..7 path.
        // Presumably ml >= 3 is guaranteed upstream - confirm.
  1372. CMPQ R13, $0x03
  1373. JE copy_4_move_3
  1374. CMPQ R13, $0x08
  1375. JB copy_4_move_4through7
  1376. JMP copy_4_move_8through16
  1377. copy_4_move_3:
  1378. MOVW (R14), R11
  1379. MOVB 2(R14), R12
  1380. MOVW R11, (BX)
  1381. MOVB R12, 2(BX)
  1382. ADDQ R13, R14
  1383. ADDQ R13, BX
  1384. JMP copy_4_end
  1385. copy_4_move_4through7:
  1386. MOVL (R14), R11
  1387. MOVL -4(R14)(R13*1), R12
  1388. MOVL R11, (BX)
  1389. MOVL R12, -4(BX)(R13*1)
  1390. ADDQ R13, R14
  1391. ADDQ R13, BX
  1392. JMP copy_4_end
  1393. copy_4_move_8through16:
  1394. MOVQ (R14), R11
  1395. MOVQ -8(R14)(R13*1), R12
  1396. MOVQ R11, (BX)
  1397. MOVQ R12, -8(BX)(R13*1)
  1398. ADDQ R13, R14
  1399. ADDQ R13, BX
  1400. copy_4_end:
        // t += ml, then step to the next 24-byte sequence entry.
  1401. ADDQ R13, DI
  1402. ADDQ $0x18, AX
  1403. INCQ DX
  1404. CMPQ DX, CX
  1405. JB main_loop
  1406. JMP loop_finished
  1407. copy_all_from_history:
        // Exact copy of the R11 bytes that lie in the history; the rest of the
        // match is copied from the current output at copy_match.
  1408. MOVQ R11, R15
  1409. SUBQ $0x10, R15
  1410. JB copy_5_small
  1411. copy_5_loop:
  1412. MOVUPS (R14), X0
  1413. MOVUPS X0, (BX)
  1414. ADDQ $0x10, R14
  1415. ADDQ $0x10, BX
  1416. SUBQ $0x10, R15
  1417. JAE copy_5_loop
        // Tail: final 16 bytes ending exactly at start + R11.
  1418. LEAQ 16(R14)(R15*1), R14
  1419. LEAQ 16(BX)(R15*1), BX
  1420. MOVUPS -16(R14), X0
  1421. MOVUPS X0, -16(BX)
  1422. JMP copy_5_end
  1423. copy_5_small:
  1424. CMPQ R11, $0x03
  1425. JE copy_5_move_3
  1426. JB copy_5_move_1or2
  1427. CMPQ R11, $0x08
  1428. JB copy_5_move_4through7
  1429. JMP copy_5_move_8through16
  1430. copy_5_move_1or2:
  1431. MOVB (R14), R15
  1432. MOVB -1(R14)(R11*1), BP
  1433. MOVB R15, (BX)
  1434. MOVB BP, -1(BX)(R11*1)
  1435. ADDQ R11, R14
  1436. ADDQ R11, BX
  1437. JMP copy_5_end
  1438. copy_5_move_3:
  1439. MOVW (R14), R15
  1440. MOVB 2(R14), BP
  1441. MOVW R15, (BX)
  1442. MOVB BP, 2(BX)
  1443. ADDQ R11, R14
  1444. ADDQ R11, BX
  1445. JMP copy_5_end
  1446. copy_5_move_4through7:
  1447. MOVL (R14), R15
  1448. MOVL -4(R14)(R11*1), BP
  1449. MOVL R15, (BX)
  1450. MOVL BP, -4(BX)(R11*1)
  1451. ADDQ R11, R14
  1452. ADDQ R11, BX
  1453. JMP copy_5_end
  1454. copy_5_move_8through16:
  1455. MOVQ (R14), R15
  1456. MOVQ -8(R14)(R11*1), BP
  1457. MOVQ R15, (BX)
  1458. MOVQ BP, -8(BX)(R11*1)
  1459. ADDQ R11, R14
  1460. ADDQ R11, BX
  1461. copy_5_end:
        // t += bytes taken from the history; ml -= the same amount.
  1462. ADDQ R11, DI
  1463. SUBQ R11, R13
  1464. // Copy match from the current buffer
  1465. copy_match:
        // R11 = write pointer - mo: source inside the output written so far.
  1466. MOVQ BX, R11
  1467. SUBQ R12, R11
  1468. // ml <= mo
  1469. CMPQ R13, R12
  1470. JA copy_overlapping_match
  1471. // Copy non-overlapping match
        // Exact copy of ml bytes; R12 (mo is no longer needed) holds ml - 16
        // and a borrow means ml < 16.
  1472. ADDQ R13, DI
  1473. MOVQ R13, R12
  1474. SUBQ $0x10, R12
  1475. JB copy_2_small
  1476. copy_2_loop:
  1477. MOVUPS (R11), X0
  1478. MOVUPS X0, (BX)
  1479. ADDQ $0x10, R11
  1480. ADDQ $0x10, BX
  1481. SUBQ $0x10, R12
  1482. JAE copy_2_loop
        // Tail: final 16 bytes ending exactly at start + ml.
  1483. LEAQ 16(R11)(R12*1), R11
  1484. LEAQ 16(BX)(R12*1), BX
  1485. MOVUPS -16(R11), X0
  1486. MOVUPS X0, -16(BX)
  1487. JMP copy_2_end
  1488. copy_2_small:
  1489. CMPQ R13, $0x03
  1490. JE copy_2_move_3
  1491. JB copy_2_move_1or2
  1492. CMPQ R13, $0x08
  1493. JB copy_2_move_4through7
  1494. JMP copy_2_move_8through16
  1495. copy_2_move_1or2:
  1496. MOVB (R11), R12
  1497. MOVB -1(R11)(R13*1), R14
  1498. MOVB R12, (BX)
  1499. MOVB R14, -1(BX)(R13*1)
  1500. ADDQ R13, R11
  1501. ADDQ R13, BX
  1502. JMP copy_2_end
  1503. copy_2_move_3:
  1504. MOVW (R11), R12
  1505. MOVB 2(R11), R14
  1506. MOVW R12, (BX)
  1507. MOVB R14, 2(BX)
  1508. ADDQ R13, R11
  1509. ADDQ R13, BX
  1510. JMP copy_2_end
  1511. copy_2_move_4through7:
  1512. MOVL (R11), R12
  1513. MOVL -4(R11)(R13*1), R14
  1514. MOVL R12, (BX)
  1515. MOVL R14, -4(BX)(R13*1)
  1516. ADDQ R13, R11
  1517. ADDQ R13, BX
  1518. JMP copy_2_end
  1519. copy_2_move_8through16:
  1520. MOVQ (R11), R12
  1521. MOVQ -8(R11)(R13*1), R14
  1522. MOVQ R12, (BX)
  1523. MOVQ R14, -8(BX)(R13*1)
  1524. ADDQ R13, R11
  1525. ADDQ R13, BX
  1526. copy_2_end:
  1527. JMP handle_loop
  1528. // Copy overlapping match
  1529. copy_overlapping_match:
        // Reached only when ml > mo (unsigned), so ml >= 1 and the DECQ/JNZ
        // loop below terminates. Source and destination ranges overlap, hence
        // the byte-by-byte forward copy.
  1530. ADDQ R13, DI
  1531. copy_slow_3:
  1532. MOVB (R11), R12
  1533. MOVB R12, (BX)
  1534. INCQ R11
  1535. INCQ BX
  1536. DECQ R13
  1537. JNZ copy_slow_3
  1538. handle_loop:
        // Step to the next 24-byte sequence entry; loop while index < count.
  1539. ADDQ $0x18, AX
  1540. INCQ DX
  1541. CMPQ DX, CX
  1542. JB main_loop
  1543. loop_finished:
  1544. // Return value
  1545. MOVB $0x01, ret+8(FP)
  1546. // Update the context
        // Write back the sequence index, the output position and the number of
        // literal bytes consumed (SI minus the literals base pointer).
  1547. MOVQ ctx+0(FP), AX
  1548. MOVQ DX, 24(AX)
  1549. MOVQ DI, 104(AX)
  1550. SUBQ 80(AX), SI
  1551. MOVQ SI, 112(AX)
  1552. RET
  1553. error_match_off_too_big:
        // The failing sequence's literals have already been copied; DX still
        // indexes that sequence when the context is written back.
  1554. // Return value
  1555. MOVB $0x00, ret+8(FP)
  1556. // Update the context
  1557. MOVQ ctx+0(FP), AX
  1558. MOVQ DX, 24(AX)
  1559. MOVQ DI, 104(AX)
  1560. SUBQ 80(AX), SI
  1561. MOVQ SI, 112(AX)
  1562. RET
  1563. empty_seqs:
  1564. // Return value
  1565. MOVB $0x01, ret+8(FP)
  1566. RET
  1567. // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1568. // Requires: CMOV, SSE
        //
        // Decodes sequences from the bit reader and executes each one right away:
        // per iteration it reads the offset, match length and literal length,
        // advances the three decoder states, resolves the offset against the
        // recent-offset slots in s, validates the values, copies ll literal bytes
        // and then ml match bytes to the output.
        //
        // Return codes (from the exit paths below):
        //   0 success                       1 match length without an offset
        //   2 match length too big          3 match offset too big
        //   4 not enough literals           5 not enough output space
        //   6 bit reader overread
        //
        // Literals and non-overlapping matches taken from the current output are
        // copied in whole 16-byte chunks, so up to 15 bytes past the requested
        // length may be read and written.
        // NOTE(review): callers presumably guarantee that slack - confirm.
        //
        // Stack frame:
        //    0(SP) input pointer (0(br) + 8(br)); the fills read downwards from it
        //    8(SP) mo    16(SP) ml    24(SP) ll    (current sequence)
        //   32(SP) past-end pointer of the output buffer (112(ctx) + 128(ctx))
        //   40(SP) len(hist)            (184(ctx))
        //   48(SP) end of history       (176(ctx) + 184(ctx))
        //   56(SP) window size          (200(ctx))
        //
        // Long-lived registers:
        //   DX = 64-bit bit buffer    BX = bits consumed    SI = 8(br), unread bytes
        //   DI / R8 / R9 = literal-length / match-length / offset state words
        //   R10 = output write pointer   R11 = literal read pointer
        //   R12 = output position
        //
        // State word layout, as used by the code below:
        //   bits 0-7   bit count read for the next state
        //   bits 8-15  number of extra bits added to the value
        //   bits 16-31 base of the next state index
        //   bits 32-63 value baseline
        //
        // Change versus the generated original: an unreachable
        // "JMP loop_finished" that directly followed the unconditional
        // "JMP handle_loop" at copy_4_end has been dropped. This file is
        // generated (see the header line of the file), so the generator should
        // be changed as well or the instruction will come back on regeneration.
  1569. TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1570. MOVQ br+8(FP), CX
  1571. MOVQ 24(CX), DX
  1572. MOVBQZX 32(CX), BX
  1573. MOVQ (CX), AX
  1574. MOVQ 8(CX), SI
  1575. ADDQ SI, AX
  1576. MOVQ AX, (SP)
  1577. MOVQ ctx+16(FP), AX
  1578. MOVQ 72(AX), DI
  1579. MOVQ 80(AX), R8
  1580. MOVQ 88(AX), R9
        // Clear the mo/ml/ll slots.
  1581. XORQ CX, CX
  1582. MOVQ CX, 8(SP)
  1583. MOVQ CX, 16(SP)
  1584. MOVQ CX, 24(SP)
  1585. MOVQ 112(AX), R10
  1586. MOVQ 128(AX), CX
  1587. MOVQ CX, 32(SP)
  1588. MOVQ 144(AX), R11
  1589. MOVQ 136(AX), R12
  1590. MOVQ 200(AX), CX
  1591. MOVQ CX, 56(SP)
  1592. MOVQ 176(AX), CX
  1593. MOVQ CX, 48(SP)
        // AX stops being ctx here: it becomes len(hist).
  1594. MOVQ 184(AX), AX
  1595. MOVQ AX, 40(SP)
  1596. MOVQ 40(SP), AX
        // 48(SP) = history base + len(hist).
  1597. ADDQ AX, 48(SP)
  1598. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1599. ADDQ R10, 32(SP)
  1600. // outBase += outPosition
  1601. ADDQ R12, R10
  1602. sequenceDecs_decodeSync_amd64_main_loop:
  1603. MOVQ (SP), R13
  1604. // Fill bitreader to have enough for the offset and match length.
        // Fast path (at least 8 unread bytes): step the pointer back by the
        // number of fully consumed bytes (BX/8), reload 8 bytes and keep only
        // the consumed-bit count modulo 8.
  1605. CMPQ SI, $0x08
  1606. JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1607. MOVQ BX, AX
  1608. SHRQ $0x03, AX
  1609. SUBQ AX, R13
  1610. MOVQ (R13), DX
  1611. SUBQ AX, SI
  1612. ANDQ $0x07, BX
  1613. JMP sequenceDecs_decodeSync_amd64_fill_end
  1614. sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
        // Slow path: while unread bytes remain and more than 7 bits have been
        // consumed, shift one more byte into the low end of the buffer.
  1615. CMPQ SI, $0x00
  1616. JLE sequenceDecs_decodeSync_amd64_fill_check_overread
  1617. CMPQ BX, $0x07
  1618. JLE sequenceDecs_decodeSync_amd64_fill_end
  1619. SHLQ $0x08, DX
  1620. SUBQ $0x01, R13
  1621. SUBQ $0x01, SI
  1622. SUBQ $0x08, BX
  1623. MOVBQZX (R13), AX
  1624. ORQ AX, DX
  1625. JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1626. sequenceDecs_decodeSync_amd64_fill_check_overread:
        // No unread bytes left: more than 64 consumed bits is an overread.
  1627. CMPQ BX, $0x40
  1628. JA error_overread
  1629. sequenceDecs_decodeSync_amd64_fill_end:
  1630. // Update offset
        // R14 = bit buffer with the consumed bits shifted out, CL = extra-bit
        // count (AH of the state word), AX = baseline (upper 32 bits). When
        // the count is non-zero and the read stays within 64 bits, the top
        // `count` bits of R14 are added to the baseline; otherwise the value is
        // just the baseline. BX is advanced by the count whenever it is non-zero.
  1631. MOVQ R9, AX
  1632. MOVQ BX, CX
  1633. MOVQ DX, R14
  1634. SHLQ CL, R14
  1635. MOVB AH, CL
  1636. SHRQ $0x20, AX
  1637. TESTQ CX, CX
  1638. JZ sequenceDecs_decodeSync_amd64_of_update_zero
  1639. ADDQ CX, BX
  1640. CMPQ BX, $0x40
  1641. JA sequenceDecs_decodeSync_amd64_of_update_zero
  1642. CMPQ CX, $0x40
  1643. JAE sequenceDecs_decodeSync_amd64_of_update_zero
        // Shift right by (64 - count): SHRQ uses CL modulo 64.
  1644. NEGQ CX
  1645. SHRQ CL, R14
  1646. ADDQ R14, AX
  1647. sequenceDecs_decodeSync_amd64_of_update_zero:
  1648. MOVQ AX, 8(SP)
  1649. // Update match length
        // Same scheme as for the offset, using the match-length state in R8.
  1650. MOVQ R8, AX
  1651. MOVQ BX, CX
  1652. MOVQ DX, R14
  1653. SHLQ CL, R14
  1654. MOVB AH, CL
  1655. SHRQ $0x20, AX
  1656. TESTQ CX, CX
  1657. JZ sequenceDecs_decodeSync_amd64_ml_update_zero
  1658. ADDQ CX, BX
  1659. CMPQ BX, $0x40
  1660. JA sequenceDecs_decodeSync_amd64_ml_update_zero
  1661. CMPQ CX, $0x40
  1662. JAE sequenceDecs_decodeSync_amd64_ml_update_zero
  1663. NEGQ CX
  1664. SHRQ CL, R14
  1665. ADDQ R14, AX
  1666. sequenceDecs_decodeSync_amd64_ml_update_zero:
  1667. MOVQ AX, 16(SP)
  1668. // Fill bitreader to have enough for the remaining
  1669. CMPQ SI, $0x08
  1670. JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1671. MOVQ BX, AX
  1672. SHRQ $0x03, AX
  1673. SUBQ AX, R13
  1674. MOVQ (R13), DX
  1675. SUBQ AX, SI
  1676. ANDQ $0x07, BX
  1677. JMP sequenceDecs_decodeSync_amd64_fill_2_end
  1678. sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1679. CMPQ SI, $0x00
  1680. JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
  1681. CMPQ BX, $0x07
  1682. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1683. SHLQ $0x08, DX
  1684. SUBQ $0x01, R13
  1685. SUBQ $0x01, SI
  1686. SUBQ $0x08, BX
  1687. MOVBQZX (R13), AX
  1688. ORQ AX, DX
  1689. JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1690. sequenceDecs_decodeSync_amd64_fill_2_check_overread:
  1691. CMPQ BX, $0x40
  1692. JA error_overread
  1693. sequenceDecs_decodeSync_amd64_fill_2_end:
  1694. // Update literal length
        // Same scheme again, using the literal-length state in DI.
  1695. MOVQ DI, AX
  1696. MOVQ BX, CX
  1697. MOVQ DX, R14
  1698. SHLQ CL, R14
  1699. MOVB AH, CL
  1700. SHRQ $0x20, AX
  1701. TESTQ CX, CX
  1702. JZ sequenceDecs_decodeSync_amd64_ll_update_zero
  1703. ADDQ CX, BX
  1704. CMPQ BX, $0x40
  1705. JA sequenceDecs_decodeSync_amd64_ll_update_zero
  1706. CMPQ CX, $0x40
  1707. JAE sequenceDecs_decodeSync_amd64_ll_update_zero
  1708. NEGQ CX
  1709. SHRQ CL, R14
  1710. ADDQ R14, AX
  1711. sequenceDecs_decodeSync_amd64_ll_update_zero:
  1712. MOVQ AX, 24(SP)
  1713. // Fill bitreader for state updates
        // Save the input pointer; R13 is reused as scratch from here on.
  1714. MOVQ R13, (SP)
        // AX = extra-bit count of the offset state (bits 8-15), taken before
        // the state is replaced; "Adjust offset" below branches on it.
  1715. MOVQ R9, AX
  1716. SHRQ $0x08, AX
  1717. MOVBQZX AL, AX
        // 96(ctx) is the countdown decremented at handle_loop; when it is zero
        // this is the last sequence and the states are not advanced.
  1718. MOVQ ctx+16(FP), CX
  1719. CMPQ 96(CX), $0x00
  1720. JZ sequenceDecs_decodeSync_amd64_skip_update
  1721. // Update Literal Length State
        // R13 = bit count (low byte), DI = next-state base (bits 16-31).
        // The bits are taken by rotating the buffer left by the new consumed
        // count and masking with (1 << count) - 1; they are added to the base
        // and the sum indexes the 8-byte-entry table.
  1722. MOVBQZX DI, R13
  1723. SHRQ $0x10, DI
  1724. MOVWQZX DI, DI
  1725. LEAQ (BX)(R13*1), CX
  1726. MOVQ DX, R14
  1727. MOVQ CX, BX
  1728. ROLQ CL, R14
  1729. MOVL $0x00000001, R15
  1730. MOVB R13, CL
  1731. SHLL CL, R15
  1732. DECL R15
  1733. ANDQ R15, R14
  1734. ADDQ R14, DI
  1735. // Load ctx.llTable
  1736. MOVQ ctx+16(FP), CX
  1737. MOVQ (CX), CX
  1738. MOVQ (CX)(DI*8), DI
  1739. // Update Match Length State
  1740. MOVBQZX R8, R13
  1741. SHRQ $0x10, R8
  1742. MOVWQZX R8, R8
  1743. LEAQ (BX)(R13*1), CX
  1744. MOVQ DX, R14
  1745. MOVQ CX, BX
  1746. ROLQ CL, R14
  1747. MOVL $0x00000001, R15
  1748. MOVB R13, CL
  1749. SHLL CL, R15
  1750. DECL R15
  1751. ANDQ R15, R14
  1752. ADDQ R14, R8
  1753. // Load ctx.mlTable
  1754. MOVQ ctx+16(FP), CX
  1755. MOVQ 24(CX), CX
  1756. MOVQ (CX)(R8*8), R8
  1757. // Update Offset State
  1758. MOVBQZX R9, R13
  1759. SHRQ $0x10, R9
  1760. MOVWQZX R9, R9
  1761. LEAQ (BX)(R13*1), CX
  1762. MOVQ DX, R14
  1763. MOVQ CX, BX
  1764. ROLQ CL, R14
  1765. MOVL $0x00000001, R15
  1766. MOVB R13, CL
  1767. SHLL CL, R15
  1768. DECL R15
  1769. ANDQ R15, R14
  1770. ADDQ R14, R9
  1771. // Load ctx.ofTable
  1772. MOVQ ctx+16(FP), CX
  1773. MOVQ 48(CX), CX
  1774. MOVQ (CX)(R9*8), R9
  1775. sequenceDecs_decodeSync_amd64_skip_update:
  1776. // Adjust offset
        // 144(s), 152(s) and 160(s) are three consecutive 8-byte offset slots
        // (presumably s.prevOffset[0..2] - confirm). R13 = decoded mo.
  1777. MOVQ s+0(FP), CX
  1778. MOVQ 8(SP), R13
  1779. CMPQ AX, $0x01
  1780. JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
        // More than one extra offset bit: mo is used as decoded. The 16-byte
        // load/store moves slots 0..1 into slots 1..2, then slot 0 = mo.
  1781. MOVUPS 144(CX), X0
  1782. MOVQ R13, 144(CX)
  1783. MOVUPS X0, 152(CX)
  1784. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1785. sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
        // Zero or one extra bit: mo selects one of the stored slots.
        // With ll == 0 the selector is bumped by one.
  1786. CMPQ 24(SP), $0x00000000
  1787. JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  1788. INCQ R13
  1789. JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1790. sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  1791. TESTQ R13, R13
  1792. JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
        // Selector 0: reuse slot 0 and leave the slots untouched.
  1793. MOVQ 144(CX), R13
  1794. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1795. sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
        // R14 = slot[selector], except selector 3 which yields slot[0] - 1.
        // A zero result is replaced by 1.
  1796. MOVQ R13, AX
  1797. XORQ R14, R14
  1798. MOVQ $-1, R15
  1799. CMPQ R13, $0x03
  1800. CMOVQEQ R14, AX
  1801. CMOVQEQ R15, R14
  1802. ADDQ 144(CX)(AX*8), R14
  1803. JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
  1804. MOVQ $0x00000001, R14
  1805. sequenceDecs_decodeSync_amd64_adjust_temp_valid:
        // Rotate the slots: slot 2 = slot 1 (skipped for selector 1),
        // slot 1 = slot 0, slot 0 = the new offset.
  1806. CMPQ R13, $0x01
  1807. JZ sequenceDecs_decodeSync_amd64_adjust_skip
  1808. MOVQ 152(CX), AX
  1809. MOVQ AX, 160(CX)
  1810. sequenceDecs_decodeSync_amd64_adjust_skip:
  1811. MOVQ 144(CX), AX
  1812. MOVQ AX, 152(CX)
  1813. MOVQ R14, 144(CX)
  1814. MOVQ R14, R13
  1815. sequenceDecs_decodeSync_amd64_after_adjust:
  1816. MOVQ R13, 8(SP)
  1817. // Check values
        // AX = ml, CX = ll. 256(s) accumulates ml + ll; 104(ctx) is reduced by
        // ll and a negative result means the literals have run out.
  1818. MOVQ 16(SP), AX
  1819. MOVQ 24(SP), CX
  1820. LEAQ (AX)(CX*1), R14
  1821. MOVQ s+0(FP), R15
  1822. ADDQ R14, 256(R15)
  1823. MOVQ ctx+16(FP), R14
  1824. SUBQ CX, 104(R14)
  1825. JS error_not_enough_literals
        // ml above 0x20002 is rejected.
  1826. CMPQ AX, $0x00020002
  1827. JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
        // mo == 0 is only acceptable together with ml == 0.
  1828. TESTQ R13, R13
  1829. JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  1830. TESTQ AX, AX
  1831. JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  1832. sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
        // Execution phase register roles: AX = ll, CX = mo, R13 = ml.
  1833. MOVQ 24(SP), AX
  1834. MOVQ 8(SP), CX
  1835. MOVQ 16(SP), R13
  1836. // Check if we have enough space in s.out
        // write pointer + ll + ml must not exceed the past-end pointer.
  1837. LEAQ (AX)(R13*1), R14
  1838. ADDQ R10, R14
  1839. CMPQ R14, 32(SP)
  1840. JA error_not_enough_space
  1841. // Copy literals
  1842. TESTQ AX, AX
  1843. JZ check_offset
  1844. XORQ R14, R14
  1845. copy_1:
        // 16 bytes per iteration until at least ll bytes have been copied. The
        // last chunk may run past ll; the pointers below advance by exactly ll.
  1846. MOVUPS (R11)(R14*1), X0
  1847. MOVUPS X0, (R10)(R14*1)
  1848. ADDQ $0x10, R14
  1849. CMPQ R14, AX
  1850. JB copy_1
  1851. ADDQ AX, R11
  1852. ADDQ AX, R10
  1853. ADDQ AX, R12
  1854. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1855. check_offset:
  1856. MOVQ R12, AX
  1857. ADDQ 40(SP), AX
  1858. CMPQ CX, AX
  1859. JG error_match_off_too_big
  1860. CMPQ CX, 56(SP)
  1861. JG error_match_off_too_big
  1862. // Copy match from history
        // AX = mo - t. JLS branches when mo <= t (unsigned), i.e. the whole
        // match lies in the output written so far. Otherwise AX is the number
        // of bytes the match reaches back into the history.
  1863. MOVQ CX, AX
  1864. SUBQ R12, AX
  1865. JLS copy_match
        // R14 = end of history - AX: source pointer inside the history.
  1866. MOVQ 48(SP), R14
  1867. SUBQ AX, R14
        // ml exceeds what the history provides: take all AX bytes from the
        // history and finish from the current output.
  1868. CMPQ R13, AX
  1869. JG copy_all_from_history
        // The entire match comes from the history: exact copy of ml bytes.
        // AX = ml - 16; a borrow means ml < 16.
  1870. MOVQ R13, AX
  1871. SUBQ $0x10, AX
  1872. JB copy_4_small
  1873. copy_4_loop:
  1874. MOVUPS (R14), X0
  1875. MOVUPS X0, (R10)
  1876. ADDQ $0x10, R14
  1877. ADDQ $0x10, R10
  1878. SUBQ $0x10, AX
  1879. JAE copy_4_loop
        // AX is now in -16..-1. Move both pointers to the exact end of the
        // copy (start + ml) and copy the final 16 bytes ending there.
  1880. LEAQ 16(R14)(AX*1), R14
  1881. LEAQ 16(R10)(AX*1), R10
  1882. MOVUPS -16(R14), X0
  1883. MOVUPS X0, -16(R10)
  1884. JMP copy_4_end
  1885. copy_4_small:
        // ml < 16. The moves below use AX and CX as scratch; mo (CX) is not
        // needed again on this path.
        // NOTE(review): there is no 1-or-2 byte case here (unlike copy_5);
        // presumably ml >= 3 is guaranteed at this point - confirm.
  1886. CMPQ R13, $0x03
  1887. JE copy_4_move_3
  1888. CMPQ R13, $0x08
  1889. JB copy_4_move_4through7
  1890. JMP copy_4_move_8through16
  1891. copy_4_move_3:
  1892. MOVW (R14), AX
  1893. MOVB 2(R14), CL
  1894. MOVW AX, (R10)
  1895. MOVB CL, 2(R10)
  1896. ADDQ R13, R14
  1897. ADDQ R13, R10
  1898. JMP copy_4_end
  1899. copy_4_move_4through7:
  1900. MOVL (R14), AX
  1901. MOVL -4(R14)(R13*1), CX
  1902. MOVL AX, (R10)
  1903. MOVL CX, -4(R10)(R13*1)
  1904. ADDQ R13, R14
  1905. ADDQ R13, R10
  1906. JMP copy_4_end
  1907. copy_4_move_8through16:
  1908. MOVQ (R14), AX
  1909. MOVQ -8(R14)(R13*1), CX
  1910. MOVQ AX, (R10)
  1911. MOVQ CX, -8(R10)(R13*1)
  1912. ADDQ R13, R14
  1913. ADDQ R13, R10
  1914. copy_4_end:
        // t += ml, then continue with the next sequence.
  1915. ADDQ R13, R12
  1916. JMP handle_loop
  1918. copy_all_from_history:
        // Exact copy of the AX bytes that lie in the history; the rest of the
        // match is copied from the current output at copy_match.
  1919. MOVQ AX, R15
  1920. SUBQ $0x10, R15
  1921. JB copy_5_small
  1922. copy_5_loop:
  1923. MOVUPS (R14), X0
  1924. MOVUPS X0, (R10)
  1925. ADDQ $0x10, R14
  1926. ADDQ $0x10, R10
  1927. SUBQ $0x10, R15
  1928. JAE copy_5_loop
        // Tail: final 16 bytes ending exactly at start + AX.
  1929. LEAQ 16(R14)(R15*1), R14
  1930. LEAQ 16(R10)(R15*1), R10
  1931. MOVUPS -16(R14), X0
  1932. MOVUPS X0, -16(R10)
  1933. JMP copy_5_end
  1934. copy_5_small:
        // The flags of the first CMPQ serve both JE (== 3) and JB (< 3).
  1935. CMPQ AX, $0x03
  1936. JE copy_5_move_3
  1937. JB copy_5_move_1or2
  1938. CMPQ AX, $0x08
  1939. JB copy_5_move_4through7
  1940. JMP copy_5_move_8through16
  1941. copy_5_move_1or2:
  1942. MOVB (R14), R15
  1943. MOVB -1(R14)(AX*1), BP
  1944. MOVB R15, (R10)
  1945. MOVB BP, -1(R10)(AX*1)
  1946. ADDQ AX, R14
  1947. ADDQ AX, R10
  1948. JMP copy_5_end
  1949. copy_5_move_3:
  1950. MOVW (R14), R15
  1951. MOVB 2(R14), BP
  1952. MOVW R15, (R10)
  1953. MOVB BP, 2(R10)
  1954. ADDQ AX, R14
  1955. ADDQ AX, R10
  1956. JMP copy_5_end
  1957. copy_5_move_4through7:
  1958. MOVL (R14), R15
  1959. MOVL -4(R14)(AX*1), BP
  1960. MOVL R15, (R10)
  1961. MOVL BP, -4(R10)(AX*1)
  1962. ADDQ AX, R14
  1963. ADDQ AX, R10
  1964. JMP copy_5_end
  1965. copy_5_move_8through16:
  1966. MOVQ (R14), R15
  1967. MOVQ -8(R14)(AX*1), BP
  1968. MOVQ R15, (R10)
  1969. MOVQ BP, -8(R10)(AX*1)
  1970. ADDQ AX, R14
  1971. ADDQ AX, R10
  1972. copy_5_end:
        // t += bytes taken from the history; ml -= the same amount.
  1973. ADDQ AX, R12
  1974. SUBQ AX, R13
  1975. // Copy match from the current buffer
  1976. copy_match:
        // AX = write pointer - mo: source inside the output written so far.
  1977. MOVQ R10, AX
  1978. SUBQ CX, AX
  1979. // ml <= mo
  1980. CMPQ R13, CX
  1981. JA copy_overlapping_match
  1982. // Copy non-overlapping match
        // t and R10 are advanced by ml up front; CX (mo is no longer needed)
        // becomes the destination cursor for the chunk loop.
  1983. ADDQ R13, R12
  1984. MOVQ R10, CX
  1985. ADDQ R13, R10
  1986. copy_2:
        // 16 bytes per iteration while the remaining count stays above zero
        // (JHI), so up to 15 bytes past ml may be read and written.
  1987. MOVUPS (AX), X0
  1988. MOVUPS X0, (CX)
  1989. ADDQ $0x10, AX
  1990. ADDQ $0x10, CX
  1991. SUBQ $0x10, R13
  1992. JHI copy_2
  1993. JMP handle_loop
  1994. // Copy overlapping match
  1995. copy_overlapping_match:
        // Reached only when ml > mo (unsigned), so ml >= 1 and the DECQ/JNZ
        // loop below terminates. Source and destination ranges overlap, hence
        // the byte-by-byte forward copy.
  1996. ADDQ R13, R12
  1997. copy_slow_3:
  1998. MOVB (AX), CL
  1999. MOVB CL, (R10)
  2000. INCQ AX
  2001. INCQ R10
  2002. DECQ R13
  2003. JNZ copy_slow_3
  2004. handle_loop:
        // Count down 96(ctx); keep looping while it has not gone negative.
  2005. MOVQ ctx+16(FP), AX
  2006. DECQ 96(AX)
  2007. JNS sequenceDecs_decodeSync_amd64_main_loop
  2008. loop_finished:
        // Write the bit reader state back: buffer, consumed bits, unread bytes.
  2009. MOVQ br+8(FP), AX
  2010. MOVQ DX, 24(AX)
  2011. MOVB BL, 32(AX)
  2012. MOVQ SI, 8(AX)
  2013. // Update the context
        // 136(ctx) = output position; 168(ctx) = literal bytes consumed
        // (literal read pointer minus the literals base at 144(ctx)).
  2014. MOVQ ctx+16(FP), AX
  2015. MOVQ R12, 136(AX)
  2016. MOVQ 144(AX), CX
  2017. SUBQ CX, R11
  2018. MOVQ R11, 168(AX)
  2019. // Return success
  2020. MOVQ $0x00000000, ret+24(FP)
  2021. RET
        // The error exits below record the offending values in 208(ctx) (ll),
        // 216(ctx) (ml) and 224(ctx) (mo); none of them writes the bit reader
        // state back.
  2022. // Return with match length error
  2023. sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  2024. MOVQ 16(SP), AX
  2025. MOVQ ctx+16(FP), CX
  2026. MOVQ AX, 216(CX)
  2027. MOVQ $0x00000001, ret+24(FP)
  2028. RET
  2029. // Return with match too long error
  2030. sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  2031. MOVQ ctx+16(FP), AX
  2032. MOVQ 16(SP), CX
  2033. MOVQ CX, 216(AX)
  2034. MOVQ $0x00000002, ret+24(FP)
  2035. RET
  2036. // Return with match offset too long error
  2037. error_match_off_too_big:
  2038. MOVQ ctx+16(FP), AX
  2039. MOVQ 8(SP), CX
  2040. MOVQ CX, 224(AX)
  2041. MOVQ R12, 136(AX)
  2042. MOVQ $0x00000003, ret+24(FP)
  2043. RET
  2044. // Return with not enough literals error
  2045. error_not_enough_literals:
  2046. MOVQ ctx+16(FP), AX
  2047. MOVQ 24(SP), CX
  2048. MOVQ CX, 208(AX)
  2049. MOVQ $0x00000004, ret+24(FP)
  2050. RET
  2051. // Return with overread error
  2052. error_overread:
  2053. MOVQ $0x00000006, ret+24(FP)
  2054. RET
  2055. // Return with not enough output space error
  2056. error_not_enough_space:
  2057. MOVQ ctx+16(FP), AX
  2058. MOVQ 24(SP), CX
  2059. MOVQ CX, 208(AX)
  2060. MOVQ 16(SP), CX
  2061. MOVQ CX, 216(AX)
  2062. MOVQ R12, 136(AX)
  2063. MOVQ $0x00000005, ret+24(FP)
  2064. RET
  2065. // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2066. // Requires: BMI, BMI2, CMOV, SSE
  2067. TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
  2068. MOVQ br+8(FP), BX
  2069. MOVQ 24(BX), AX
  2070. MOVBQZX 32(BX), DX
  2071. MOVQ (BX), CX
  2072. MOVQ 8(BX), BX
  2073. ADDQ BX, CX
  2074. MOVQ CX, (SP)
  2075. MOVQ ctx+16(FP), CX
  2076. MOVQ 72(CX), SI
  2077. MOVQ 80(CX), DI
  2078. MOVQ 88(CX), R8
  2079. XORQ R9, R9
  2080. MOVQ R9, 8(SP)
  2081. MOVQ R9, 16(SP)
  2082. MOVQ R9, 24(SP)
  2083. MOVQ 112(CX), R9
  2084. MOVQ 128(CX), R10
  2085. MOVQ R10, 32(SP)
  2086. MOVQ 144(CX), R10
  2087. MOVQ 136(CX), R11
  2088. MOVQ 200(CX), R12
  2089. MOVQ R12, 56(SP)
  2090. MOVQ 176(CX), R12
  2091. MOVQ R12, 48(SP)
  2092. MOVQ 184(CX), CX
  2093. MOVQ CX, 40(SP)
  2094. MOVQ 40(SP), CX
  2095. ADDQ CX, 48(SP)
  2096. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2097. ADDQ R9, 32(SP)
  2098. // outBase += outPosition
  2099. ADDQ R11, R9
  2100. sequenceDecs_decodeSync_bmi2_main_loop:
  2101. MOVQ (SP), R12
  2102. // Fill bitreader to have enough for the offset and match length.
  2103. CMPQ BX, $0x08
  2104. JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2105. MOVQ DX, CX
  2106. SHRQ $0x03, CX
  2107. SUBQ CX, R12
  2108. MOVQ (R12), AX
  2109. SUBQ CX, BX
  2110. ANDQ $0x07, DX
  2111. JMP sequenceDecs_decodeSync_bmi2_fill_end
  2112. sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2113. CMPQ BX, $0x00
  2114. JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
  2115. CMPQ DX, $0x07
  2116. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2117. SHLQ $0x08, AX
  2118. SUBQ $0x01, R12
  2119. SUBQ $0x01, BX
  2120. SUBQ $0x08, DX
  2121. MOVBQZX (R12), CX
  2122. ORQ CX, AX
  2123. JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2124. sequenceDecs_decodeSync_bmi2_fill_check_overread:
  2125. CMPQ DX, $0x40
  2126. JA error_overread
  2127. sequenceDecs_decodeSync_bmi2_fill_end:
  2128. // Update offset
  2129. MOVQ $0x00000808, CX
  2130. BEXTRQ CX, R8, R13
  2131. MOVQ AX, R14
  2132. LEAQ (DX)(R13*1), CX
  2133. ROLQ CL, R14
  2134. BZHIQ R13, R14, R14
  2135. MOVQ CX, DX
  2136. MOVQ R8, CX
  2137. SHRQ $0x20, CX
  2138. ADDQ R14, CX
  2139. MOVQ CX, 8(SP)
  2140. // Update match length
  2141. MOVQ $0x00000808, CX
  2142. BEXTRQ CX, DI, R13
  2143. MOVQ AX, R14
  2144. LEAQ (DX)(R13*1), CX
  2145. ROLQ CL, R14
  2146. BZHIQ R13, R14, R14
  2147. MOVQ CX, DX
  2148. MOVQ DI, CX
  2149. SHRQ $0x20, CX
  2150. ADDQ R14, CX
  2151. MOVQ CX, 16(SP)
  2152. // Fill bitreader to have enough for the remaining
  2153. CMPQ BX, $0x08
  2154. JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2155. MOVQ DX, CX
  2156. SHRQ $0x03, CX
  2157. SUBQ CX, R12
  2158. MOVQ (R12), AX
  2159. SUBQ CX, BX
  2160. ANDQ $0x07, DX
  2161. JMP sequenceDecs_decodeSync_bmi2_fill_2_end
  2162. sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2163. CMPQ BX, $0x00
  2164. JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
  2165. CMPQ DX, $0x07
  2166. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2167. SHLQ $0x08, AX
  2168. SUBQ $0x01, R12
  2169. SUBQ $0x01, BX
  2170. SUBQ $0x08, DX
  2171. MOVBQZX (R12), CX
  2172. ORQ CX, AX
  2173. JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2174. sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
  2175. CMPQ DX, $0x40
  2176. JA error_overread
  2177. sequenceDecs_decodeSync_bmi2_fill_2_end:
  2178. // Update literal length
  2179. MOVQ $0x00000808, CX
  2180. BEXTRQ CX, SI, R13
  2181. MOVQ AX, R14
  2182. LEAQ (DX)(R13*1), CX
  2183. ROLQ CL, R14
  2184. BZHIQ R13, R14, R14
  2185. MOVQ CX, DX
  2186. MOVQ SI, CX
  2187. SHRQ $0x20, CX
  2188. ADDQ R14, CX
  2189. MOVQ CX, 24(SP)
  2190. // Fill bitreader for state updates
  2191. MOVQ R12, (SP)
  2192. MOVQ $0x00000808, CX
  2193. BEXTRQ CX, R8, R12
  2194. MOVQ ctx+16(FP), CX
  2195. CMPQ 96(CX), $0x00
  2196. JZ sequenceDecs_decodeSync_bmi2_skip_update
  2197. LEAQ (SI)(DI*1), R13
  2198. ADDQ R8, R13
  2199. MOVBQZX R13, R13
  2200. LEAQ (DX)(R13*1), CX
  2201. MOVQ AX, R14
  2202. MOVQ CX, DX
  2203. ROLQ CL, R14
  2204. BZHIQ R13, R14, R14
  2205. // Update Offset State
  2206. BZHIQ R8, R14, CX
  2207. SHRXQ R8, R14, R14
  2208. MOVQ $0x00001010, R13
  2209. BEXTRQ R13, R8, R8
  2210. ADDQ CX, R8
  2211. // Load ctx.ofTable
  2212. MOVQ ctx+16(FP), CX
  2213. MOVQ 48(CX), CX
  2214. MOVQ (CX)(R8*8), R8
  2215. // Update Match Length State
  2216. BZHIQ DI, R14, CX
  2217. SHRXQ DI, R14, R14
  2218. MOVQ $0x00001010, R13
  2219. BEXTRQ R13, DI, DI
  2220. ADDQ CX, DI
  2221. // Load ctx.mlTable
  2222. MOVQ ctx+16(FP), CX
  2223. MOVQ 24(CX), CX
  2224. MOVQ (CX)(DI*8), DI
  2225. // Update Literal Length State
  2226. BZHIQ SI, R14, CX
  2227. MOVQ $0x00001010, R13
  2228. BEXTRQ R13, SI, SI
  2229. ADDQ CX, SI
  2230. // Load ctx.llTable
  2231. MOVQ ctx+16(FP), CX
  2232. MOVQ (CX), CX
  2233. MOVQ (CX)(SI*8), SI
  2234. sequenceDecs_decodeSync_bmi2_skip_update:
  2235. // Adjust offset
  2236. MOVQ s+0(FP), CX
  2237. MOVQ 8(SP), R13
  2238. CMPQ R12, $0x01
  2239. JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2240. MOVUPS 144(CX), X0
  2241. MOVQ R13, 144(CX)
  2242. MOVUPS X0, 152(CX)
  2243. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2244. sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2245. CMPQ 24(SP), $0x00000000
  2246. JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2247. INCQ R13
  2248. JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2249. sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2250. TESTQ R13, R13
  2251. JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2252. MOVQ 144(CX), R13
  2253. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2254. sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2255. MOVQ R13, R12
  2256. XORQ R14, R14
  2257. MOVQ $-1, R15
  2258. CMPQ R13, $0x03
  2259. CMOVQEQ R14, R12
  2260. CMOVQEQ R15, R14
  2261. ADDQ 144(CX)(R12*8), R14
  2262. JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2263. MOVQ $0x00000001, R14
  2264. sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2265. CMPQ R13, $0x01
  2266. JZ sequenceDecs_decodeSync_bmi2_adjust_skip
  2267. MOVQ 152(CX), R12
  2268. MOVQ R12, 160(CX)
  2269. sequenceDecs_decodeSync_bmi2_adjust_skip:
  2270. MOVQ 144(CX), R12
  2271. MOVQ R12, 152(CX)
  2272. MOVQ R14, 144(CX)
  2273. MOVQ R14, R13
  2274. sequenceDecs_decodeSync_bmi2_after_adjust:
  2275. MOVQ R13, 8(SP)
  2276. // Check values
  2277. MOVQ 16(SP), CX
  2278. MOVQ 24(SP), R12
  2279. LEAQ (CX)(R12*1), R14
  2280. MOVQ s+0(FP), R15
  2281. ADDQ R14, 256(R15)
  2282. MOVQ ctx+16(FP), R14
  2283. SUBQ R12, 104(R14)
  2284. JS error_not_enough_literals
  2285. CMPQ CX, $0x00020002
  2286. JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2287. TESTQ R13, R13
  2288. JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2289. TESTQ CX, CX
  2290. JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
  2291. sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2292. MOVQ 24(SP), CX
  2293. MOVQ 8(SP), R12
  2294. MOVQ 16(SP), R13
  2295. // Check if we have enough space in s.out
  2296. LEAQ (CX)(R13*1), R14
  2297. ADDQ R9, R14
  2298. CMPQ R14, 32(SP)
  2299. JA error_not_enough_space
  2300. // Copy literals
  2301. TESTQ CX, CX
  2302. JZ check_offset
  2303. XORQ R14, R14
  2304. copy_1:
  2305. MOVUPS (R10)(R14*1), X0
  2306. MOVUPS X0, (R9)(R14*1)
  2307. ADDQ $0x10, R14
  2308. CMPQ R14, CX
  2309. JB copy_1
  2310. ADDQ CX, R10
  2311. ADDQ CX, R9
  2312. ADDQ CX, R11
  2313. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2314. check_offset:
  2315. MOVQ R11, CX
  2316. ADDQ 40(SP), CX
  2317. CMPQ R12, CX
  2318. JG error_match_off_too_big
  2319. CMPQ R12, 56(SP)
  2320. JG error_match_off_too_big
  2321. // Copy match from history
  2322. MOVQ R12, CX
  2323. SUBQ R11, CX
  2324. JLS copy_match
  2325. MOVQ 48(SP), R14
  2326. SUBQ CX, R14
  2327. CMPQ R13, CX
  2328. JG copy_all_from_history
  2329. MOVQ R13, CX
  2330. SUBQ $0x10, CX
  2331. JB copy_4_small
  2332. copy_4_loop:
  2333. MOVUPS (R14), X0
  2334. MOVUPS X0, (R9)
  2335. ADDQ $0x10, R14
  2336. ADDQ $0x10, R9
  2337. SUBQ $0x10, CX
  2338. JAE copy_4_loop
  2339. LEAQ 16(R14)(CX*1), R14
  2340. LEAQ 16(R9)(CX*1), R9
  2341. MOVUPS -16(R14), X0
  2342. MOVUPS X0, -16(R9)
  2343. JMP copy_4_end
  2344. copy_4_small:
  2345. CMPQ R13, $0x03
  2346. JE copy_4_move_3
  2347. CMPQ R13, $0x08
  2348. JB copy_4_move_4through7
  2349. JMP copy_4_move_8through16
  2350. copy_4_move_3:
  2351. MOVW (R14), CX
  2352. MOVB 2(R14), R12
  2353. MOVW CX, (R9)
  2354. MOVB R12, 2(R9)
  2355. ADDQ R13, R14
  2356. ADDQ R13, R9
  2357. JMP copy_4_end
  2358. copy_4_move_4through7:
  2359. MOVL (R14), CX
  2360. MOVL -4(R14)(R13*1), R12
  2361. MOVL CX, (R9)
  2362. MOVL R12, -4(R9)(R13*1)
  2363. ADDQ R13, R14
  2364. ADDQ R13, R9
  2365. JMP copy_4_end
  2366. copy_4_move_8through16:
  2367. MOVQ (R14), CX
  2368. MOVQ -8(R14)(R13*1), R12
  2369. MOVQ CX, (R9)
  2370. MOVQ R12, -8(R9)(R13*1)
  2371. ADDQ R13, R14
  2372. ADDQ R13, R9
  2373. copy_4_end:
  2374. ADDQ R13, R11
  2375. JMP handle_loop
  2376. JMP loop_finished
  2377. copy_all_from_history:
  2378. MOVQ CX, R15
  2379. SUBQ $0x10, R15
  2380. JB copy_5_small
  2381. copy_5_loop:
  2382. MOVUPS (R14), X0
  2383. MOVUPS X0, (R9)
  2384. ADDQ $0x10, R14
  2385. ADDQ $0x10, R9
  2386. SUBQ $0x10, R15
  2387. JAE copy_5_loop
  2388. LEAQ 16(R14)(R15*1), R14
  2389. LEAQ 16(R9)(R15*1), R9
  2390. MOVUPS -16(R14), X0
  2391. MOVUPS X0, -16(R9)
  2392. JMP copy_5_end
  2393. copy_5_small:
  2394. CMPQ CX, $0x03
  2395. JE copy_5_move_3
  2396. JB copy_5_move_1or2
  2397. CMPQ CX, $0x08
  2398. JB copy_5_move_4through7
  2399. JMP copy_5_move_8through16
  2400. copy_5_move_1or2:
  2401. MOVB (R14), R15
  2402. MOVB -1(R14)(CX*1), BP
  2403. MOVB R15, (R9)
  2404. MOVB BP, -1(R9)(CX*1)
  2405. ADDQ CX, R14
  2406. ADDQ CX, R9
  2407. JMP copy_5_end
  2408. copy_5_move_3:
  2409. MOVW (R14), R15
  2410. MOVB 2(R14), BP
  2411. MOVW R15, (R9)
  2412. MOVB BP, 2(R9)
  2413. ADDQ CX, R14
  2414. ADDQ CX, R9
  2415. JMP copy_5_end
  2416. copy_5_move_4through7:
  2417. MOVL (R14), R15
  2418. MOVL -4(R14)(CX*1), BP
  2419. MOVL R15, (R9)
  2420. MOVL BP, -4(R9)(CX*1)
  2421. ADDQ CX, R14
  2422. ADDQ CX, R9
  2423. JMP copy_5_end
  2424. copy_5_move_8through16:
  2425. MOVQ (R14), R15
  2426. MOVQ -8(R14)(CX*1), BP
  2427. MOVQ R15, (R9)
  2428. MOVQ BP, -8(R9)(CX*1)
  2429. ADDQ CX, R14
  2430. ADDQ CX, R9
  2431. copy_5_end:
  2432. ADDQ CX, R11
  2433. SUBQ CX, R13
  2434. // Copy match from the current buffer
  2435. copy_match:
  2436. MOVQ R9, CX
  2437. SUBQ R12, CX
  2438. // ml <= mo
  2439. CMPQ R13, R12
  2440. JA copy_overlapping_match
  2441. // Copy non-overlapping match
  2442. ADDQ R13, R11
  2443. MOVQ R9, R12
  2444. ADDQ R13, R9
  2445. copy_2:
  2446. MOVUPS (CX), X0
  2447. MOVUPS X0, (R12)
  2448. ADDQ $0x10, CX
  2449. ADDQ $0x10, R12
  2450. SUBQ $0x10, R13
  2451. JHI copy_2
  2452. JMP handle_loop
  2453. // Copy overlapping match
  2454. copy_overlapping_match:
  2455. ADDQ R13, R11
  2456. copy_slow_3:
  2457. MOVB (CX), R12
  2458. MOVB R12, (R9)
  2459. INCQ CX
  2460. INCQ R9
  2461. DECQ R13
  2462. JNZ copy_slow_3
  2463. handle_loop:
  2464. MOVQ ctx+16(FP), CX
  2465. DECQ 96(CX)
  2466. JNS sequenceDecs_decodeSync_bmi2_main_loop
  2467. loop_finished:
  2468. MOVQ br+8(FP), CX
  2469. MOVQ AX, 24(CX)
  2470. MOVB DL, 32(CX)
  2471. MOVQ BX, 8(CX)
  2472. // Update the context
  2473. MOVQ ctx+16(FP), AX
  2474. MOVQ R11, 136(AX)
  2475. MOVQ 144(AX), CX
  2476. SUBQ CX, R10
  2477. MOVQ R10, 168(AX)
  2478. // Return success
  2479. MOVQ $0x00000000, ret+24(FP)
  2480. RET
  2481. // Return with match length error
  2482. sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2483. MOVQ 16(SP), AX
  2484. MOVQ ctx+16(FP), CX
  2485. MOVQ AX, 216(CX)
  2486. MOVQ $0x00000001, ret+24(FP)
  2487. RET
  2488. // Return with match too long error
  2489. sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2490. MOVQ ctx+16(FP), AX
  2491. MOVQ 16(SP), CX
  2492. MOVQ CX, 216(AX)
  2493. MOVQ $0x00000002, ret+24(FP)
  2494. RET
  2495. // Return with match offset too long error
  2496. error_match_off_too_big:
  2497. MOVQ ctx+16(FP), AX
  2498. MOVQ 8(SP), CX
  2499. MOVQ CX, 224(AX)
  2500. MOVQ R11, 136(AX)
  2501. MOVQ $0x00000003, ret+24(FP)
  2502. RET
  2503. // Return with not enough literals error
  2504. error_not_enough_literals:
  2505. MOVQ ctx+16(FP), AX
  2506. MOVQ 24(SP), CX
  2507. MOVQ CX, 208(AX)
  2508. MOVQ $0x00000004, ret+24(FP)
  2509. RET
  2510. // Return with overread error
  2511. error_overread:
  2512. MOVQ $0x00000006, ret+24(FP)
  2513. RET
  2514. // Return with not enough output space error
  2515. error_not_enough_space:
  2516. MOVQ ctx+16(FP), AX
  2517. MOVQ 24(SP), CX
  2518. MOVQ CX, 208(AX)
  2519. MOVQ 16(SP), CX
  2520. MOVQ CX, 216(AX)
  2521. MOVQ R11, 136(AX)
  2522. MOVQ $0x00000005, ret+24(FP)
  2523. RET
  2524. // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2525. // Requires: CMOV, SSE
        //
        // Decodes sequences from the bit reader and executes each one right away:
        // the literals are copied to the output, then the match is copied from the
        // history buffer and/or from the output produced so far. The loop runs
        // until the counter at 96(ctx) goes negative or an error is detected.
        //
        // Return value (ret+24(FP)):
        //   0  success
        //   1  match length is non-zero while the match offset is zero
        //   2  match length larger than 0x20002
        //   3  match offset too big
        //   4  not enough literals
        //   5  not enough output space
        //   6  bit reader overread
        //
        // Register roles as used throughout the body:
        //   DX   bit buffer (loaded from 24(br))
        //   BX   number of bits already consumed from DX (loaded from 32(br))
        //   SI   value of 8(br); decremented by one for every input byte consumed
        //   DI   literal-length state word   (72(ctx))
        //   R8   match-length state word     (80(ctx))
        //   R9   offset state word           (88(ctx))
        //   R10  output write pointer        (112(ctx) + 136(ctx))
        //   R11  literals read pointer       (144(ctx))
        //   R12  output position             (136(ctx))
        //
        // State word layout, as read by the code below:
        //   bits  0..7   number of bits to read for the next state
        //   bits  8..15  number of extra bits to read for the value
        //   bits 16..31  base of the next state index
        //   bits 32..63  baseline added to the extra bits
        //
        // Stack slots:
        //    0(SP)  input read pointer, kept across iterations
        //    8(SP)  match offset of the current sequence
        //   16(SP)  match length of the current sequence
        //   24(SP)  literal length of the current sequence
        //   32(SP)  past-end pointer of the output buffer (112(ctx) + 128(ctx))
        //   40(SP)  184(ctx), used as len(hist) in the offset check
        //   48(SP)  176(ctx) + 184(ctx), i.e. pointer just past the history
        //   56(SP)  200(ctx), used as the window size in the offset check
        //
        // NOTE(review): field names behind the numeric struct offsets are not
        // visible in this file; confirm them against the Go struct definitions.
  2526. TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
        // Load the bit reader state.
  2527. MOVQ br+8(FP), CX
  2528. MOVQ 24(CX), DX
  2529. MOVBQZX 32(CX), BX
  2530. MOVQ (CX), AX
  2531. MOVQ 8(CX), SI
        // 0(SP) = (br) + 8(br): the address the bit reader refills from.
  2532. ADDQ SI, AX
  2533. MOVQ AX, (SP)
        // Load the three FSE state words from the context.
  2534. MOVQ ctx+16(FP), AX
  2535. MOVQ 72(AX), DI
  2536. MOVQ 80(AX), R8
  2537. MOVQ 88(AX), R9
        // Clear the per-sequence slots (offset, match length, literal length).
  2538. XORQ CX, CX
  2539. MOVQ CX, 8(SP)
  2540. MOVQ CX, 16(SP)
  2541. MOVQ CX, 24(SP)
        // Output base, output capacity, literals pointer and output position.
  2542. MOVQ 112(AX), R10
  2543. MOVQ 128(AX), CX
  2544. MOVQ CX, 32(SP)
  2545. MOVQ 144(AX), R11
  2546. MOVQ 136(AX), R12
        // Window size, history base and history length.
  2547. MOVQ 200(AX), CX
  2548. MOVQ CX, 56(SP)
  2549. MOVQ 176(AX), CX
  2550. MOVQ CX, 48(SP)
  2551. MOVQ 184(AX), AX
  2552. MOVQ AX, 40(SP)
        // Reloads the value that was just stored (redundant, emitted by the generator).
  2553. MOVQ 40(SP), AX
        // 48(SP) = history base + history length = pointer just past the history.
  2554. ADDQ AX, 48(SP)
  2555. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2556. ADDQ R10, 32(SP)
  2557. // outBase += outPosition
  2558. ADDQ R12, R10
  2559. sequenceDecs_decodeSync_safe_amd64_main_loop:
        // R13 = input read pointer for this iteration.
  2560. MOVQ (SP), R13
  2561. // Fill bitreader to have enough for the offset and match length.
        // Fast path when SI >= 8 (signed compare): drop the fully consumed bytes
        // (BX / 8), reload 8 bytes at the new pointer and keep BX % 8 as the
        // consumed bit count.
  2562. CMPQ SI, $0x08
  2563. JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2564. MOVQ BX, AX
  2565. SHRQ $0x03, AX
  2566. SUBQ AX, R13
  2567. MOVQ (R13), DX
  2568. SUBQ AX, SI
  2569. ANDQ $0x07, BX
  2570. JMP sequenceDecs_decodeSync_safe_amd64_fill_end
        // Slow path: shift in one byte at a time while SI > 0 and more than
        // 7 bits have been consumed.
  2571. sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2572. CMPQ SI, $0x00
  2573. JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
  2574. CMPQ BX, $0x07
  2575. JLE sequenceDecs_decodeSync_safe_amd64_fill_end
  2576. SHLQ $0x08, DX
  2577. SUBQ $0x01, R13
  2578. SUBQ $0x01, SI
  2579. SUBQ $0x08, BX
  2580. MOVBQZX (R13), AX
  2581. ORQ AX, DX
  2582. JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
        // No input bytes left: more than 64 consumed bits means the stream was overread.
  2583. sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
  2584. CMPQ BX, $0x40
  2585. JA error_overread
  2586. sequenceDecs_decodeSync_safe_amd64_fill_end:
  2587. // Update offset
        // R14 = bit buffer with the consumed bits shifted out at the top.
        // CL  = extra-bit count (bits 8..15 of the state word, read through AH).
        // AX  = baseline (upper 32 bits of the state word).
  2588. MOVQ R9, AX
  2589. MOVQ BX, CX
  2590. MOVQ DX, R14
  2591. SHLQ CL, R14
  2592. MOVB AH, CL
  2593. SHRQ $0x20, AX
        // With zero extra bits the value is just the baseline.
  2594. TESTQ CX, CX
  2595. JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
        // BX is advanced before the range checks below; when either check fails
        // only the baseline is used.
  2596. ADDQ CX, BX
  2597. CMPQ BX, $0x40
  2598. JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2599. CMPQ CX, $0x40
  2600. JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
        // Shift right by -n (the CPU uses the count modulo 64, i.e. 64-n) so the
        // top n bits of R14 end up in the low bits, then add them to the baseline.
  2601. NEGQ CX
  2602. SHRQ CL, R14
  2603. ADDQ R14, AX
  2604. sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2605. MOVQ AX, 8(SP)
  2606. // Update match length
        // Same extraction as for the offset, using the match-length state in R8.
  2607. MOVQ R8, AX
  2608. MOVQ BX, CX
  2609. MOVQ DX, R14
  2610. SHLQ CL, R14
  2611. MOVB AH, CL
  2612. SHRQ $0x20, AX
  2613. TESTQ CX, CX
  2614. JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2615. ADDQ CX, BX
  2616. CMPQ BX, $0x40
  2617. JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2618. CMPQ CX, $0x40
  2619. JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2620. NEGQ CX
  2621. SHRQ CL, R14
  2622. ADDQ R14, AX
  2623. sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2624. MOVQ AX, 16(SP)
  2625. // Fill bitreader to have enough for the remaining
        // Second refill; identical in structure to the first one above.
  2626. CMPQ SI, $0x08
  2627. JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2628. MOVQ BX, AX
  2629. SHRQ $0x03, AX
  2630. SUBQ AX, R13
  2631. MOVQ (R13), DX
  2632. SUBQ AX, SI
  2633. ANDQ $0x07, BX
  2634. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2635. sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2636. CMPQ SI, $0x00
  2637. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
  2638. CMPQ BX, $0x07
  2639. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2640. SHLQ $0x08, DX
  2641. SUBQ $0x01, R13
  2642. SUBQ $0x01, SI
  2643. SUBQ $0x08, BX
  2644. MOVBQZX (R13), AX
  2645. ORQ AX, DX
  2646. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2647. sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
  2648. CMPQ BX, $0x40
  2649. JA error_overread
  2650. sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  2651. // Update literal length
        // Same extraction as for the offset, using the literal-length state in DI.
  2652. MOVQ DI, AX
  2653. MOVQ BX, CX
  2654. MOVQ DX, R14
  2655. SHLQ CL, R14
  2656. MOVB AH, CL
  2657. SHRQ $0x20, AX
  2658. TESTQ CX, CX
  2659. JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2660. ADDQ CX, BX
  2661. CMPQ BX, $0x40
  2662. JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2663. CMPQ CX, $0x40
  2664. JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2665. NEGQ CX
  2666. SHRQ CL, R14
  2667. ADDQ R14, AX
  2668. sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  2669. MOVQ AX, 24(SP)
  2670. // Fill bitreader for state updates
        // Save the input pointer; R13 is reused as scratch from here on.
  2671. MOVQ R13, (SP)
        // AX = extra-bit count of the offset state (bits 8..15 of R9). It is taken
        // before R9 is replaced below and is consumed by the offset adjustment.
  2672. MOVQ R9, AX
  2673. SHRQ $0x08, AX
  2674. MOVBQZX AL, AX
        // Skip the state updates when the counter at 96(ctx) is already zero
        // (the loop at handle_loop exits once it goes negative).
  2675. MOVQ ctx+16(FP), CX
  2676. CMPQ 96(CX), $0x00
  2677. JZ sequenceDecs_decodeSync_safe_amd64_skip_update
  2678. // Update Literal Length State
        // R13 = number of state bits (low byte), DI = next-state base (bits 16..31).
  2679. MOVBQZX DI, R13
  2680. SHRQ $0x10, DI
  2681. MOVWQZX DI, DI
        // Rotate the bit buffer left by (consumed + n) so the next n unread bits
        // land in the low bits, mask them with (1<<n)-1 and add them to the base.
  2682. LEAQ (BX)(R13*1), CX
  2683. MOVQ DX, R14
  2684. MOVQ CX, BX
  2685. ROLQ CL, R14
  2686. MOVL $0x00000001, R15
  2687. MOVB R13, CL
  2688. SHLL CL, R15
  2689. DECL R15
  2690. ANDQ R15, R14
  2691. ADDQ R14, DI
  2692. // Load ctx.llTable
        // DI = 8-byte table entry at the new state index.
  2693. MOVQ ctx+16(FP), CX
  2694. MOVQ (CX), CX
  2695. MOVQ (CX)(DI*8), DI
  2696. // Update Match Length State
  2697. MOVBQZX R8, R13
  2698. SHRQ $0x10, R8
  2699. MOVWQZX R8, R8
  2700. LEAQ (BX)(R13*1), CX
  2701. MOVQ DX, R14
  2702. MOVQ CX, BX
  2703. ROLQ CL, R14
  2704. MOVL $0x00000001, R15
  2705. MOVB R13, CL
  2706. SHLL CL, R15
  2707. DECL R15
  2708. ANDQ R15, R14
  2709. ADDQ R14, R8
  2710. // Load ctx.mlTable
  2711. MOVQ ctx+16(FP), CX
  2712. MOVQ 24(CX), CX
  2713. MOVQ (CX)(R8*8), R8
  2714. // Update Offset State
  2715. MOVBQZX R9, R13
  2716. SHRQ $0x10, R9
  2717. MOVWQZX R9, R9
  2718. LEAQ (BX)(R13*1), CX
  2719. MOVQ DX, R14
  2720. MOVQ CX, BX
  2721. ROLQ CL, R14
  2722. MOVL $0x00000001, R15
  2723. MOVB R13, CL
  2724. SHLL CL, R15
  2725. DECL R15
  2726. ANDQ R15, R14
  2727. ADDQ R14, R9
  2728. // Load ctx.ofTable
  2729. MOVQ ctx+16(FP), CX
  2730. MOVQ 48(CX), CX
  2731. MOVQ (CX)(R9*8), R9
  2732. sequenceDecs_decodeSync_safe_amd64_skip_update:
  2733. // Adjust offset
        // CX = s, R13 = decoded offset value. The three 8-byte slots at 144(s),
        // 152(s) and 160(s) hold the previous offsets (most recent first).
  2734. MOVQ s+0(FP), CX
  2735. MOVQ 8(SP), R13
        // More than one extra offset bit: R13 is used as the offset as-is and is
        // pushed to the front of the previous-offset slots.
  2736. CMPQ AX, $0x01
  2737. JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
        // 16-byte load/store shifts slots 0 and 1 into slots 1 and 2.
  2738. MOVUPS 144(CX), X0
  2739. MOVQ R13, 144(CX)
  2740. MOVUPS X0, 152(CX)
  2741. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
        // Zero or one extra bit: R13 selects one of the previous offsets.
  2742. sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
        // When the literal length is zero the selector is incremented by one.
  2743. CMPQ 24(SP), $0x00000000
  2744. JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  2745. INCQ R13
  2746. JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2747. sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
        // Selector 0 with a non-zero literal length: reuse slot 0, slots unchanged.
  2748. TESTQ R13, R13
  2749. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2750. MOVQ 144(CX), R13
  2751. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2752. sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
        // R14 = slot[R13], except that selector 3 yields slot[0] - 1
        // (index forced to 0 and -1 pre-loaded into R14 by the two CMOVs).
  2753. MOVQ R13, AX
  2754. XORQ R14, R14
  2755. MOVQ $-1, R15
  2756. CMPQ R13, $0x03
  2757. CMOVQEQ R14, AX
  2758. CMOVQEQ R15, R14
  2759. ADDQ 144(CX)(AX*8), R14
        // A resulting offset of zero is replaced by 1 (JNZ tests the ADDQ result).
  2760. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  2761. MOVQ $0x00000001, R14
  2762. sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
        // Unless the selector is 1, slot 1 moves to slot 2; then slot 0 moves to
        // slot 1 and the new offset becomes slot 0.
  2763. CMPQ R13, $0x01
  2764. JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
  2765. MOVQ 152(CX), AX
  2766. MOVQ AX, 160(CX)
  2767. sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  2768. MOVQ 144(CX), AX
  2769. MOVQ AX, 152(CX)
  2770. MOVQ R14, 144(CX)
  2771. MOVQ R14, R13
  2772. sequenceDecs_decodeSync_safe_amd64_after_adjust:
        // Store the final match offset.
  2773. MOVQ R13, 8(SP)
  2774. // Check values
        // AX = match length, CX = literal length.
  2775. MOVQ 16(SP), AX
  2776. MOVQ 24(SP), CX
        // Add ml + ll to the counter at 256(s).
  2777. LEAQ (AX)(CX*1), R14
  2778. MOVQ s+0(FP), R15
  2779. ADDQ R14, 256(R15)
        // Subtract ll from the counter at 104(ctx); a negative result means the
        // sequence asks for more literals than are available.
  2780. MOVQ ctx+16(FP), R14
  2781. SUBQ CX, 104(R14)
  2782. JS error_not_enough_literals
        // Match length must not exceed 0x20002.
  2783. CMPQ AX, $0x00020002
  2784. JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
        // A zero offset is only accepted together with a zero match length.
  2785. TESTQ R13, R13
  2786. JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  2787. TESTQ AX, AX
  2788. JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
  2789. sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
        // AX = literal length, CX = match offset, R13 = match length.
  2790. MOVQ 24(SP), AX
  2791. MOVQ 8(SP), CX
  2792. MOVQ 16(SP), R13
  2793. // Check if we have enough space in s.out
        // write pointer + ll + ml must not pass the past-end pointer in 32(SP).
  2794. LEAQ (AX)(R13*1), R14
  2795. ADDQ R10, R14
  2796. CMPQ R14, 32(SP)
  2797. JA error_not_enough_space
  2798. // Copy literals
        // Copies exactly AX bytes from R11 to R10 and advances both pointers by AX.
        // Lengths >= 16 use 16-byte chunks followed by a final 16-byte copy that
        // ends exactly at the last byte (it may overlap the previous chunk).
  2799. TESTQ AX, AX
  2800. JZ check_offset
  2801. MOVQ AX, R14
  2802. SUBQ $0x10, R14
  2803. JB copy_1_small
  2804. copy_1_loop:
  2805. MOVUPS (R11), X0
  2806. MOVUPS X0, (R10)
  2807. ADDQ $0x10, R11
  2808. ADDQ $0x10, R10
  2809. SUBQ $0x10, R14
  2810. JAE copy_1_loop
        // R14 is now negative: step both pointers to the end of the data and copy
        // the last 16 bytes.
  2811. LEAQ 16(R11)(R14*1), R11
  2812. LEAQ 16(R10)(R14*1), R10
  2813. MOVUPS -16(R11), X0
  2814. MOVUPS X0, -16(R10)
  2815. JMP copy_1_end
        // Lengths below 16: pick a size class and copy its first and last element,
        // which overlap when the length is not the full size of the class.
  2816. copy_1_small:
  2817. CMPQ AX, $0x03
  2818. JE copy_1_move_3
        // JB still uses the flags of the compare against 3 above.
  2819. JB copy_1_move_1or2
  2820. CMPQ AX, $0x08
  2821. JB copy_1_move_4through7
  2822. JMP copy_1_move_8through16
  2823. copy_1_move_1or2:
  2824. MOVB (R11), R14
  2825. MOVB -1(R11)(AX*1), R15
  2826. MOVB R14, (R10)
  2827. MOVB R15, -1(R10)(AX*1)
  2828. ADDQ AX, R11
  2829. ADDQ AX, R10
  2830. JMP copy_1_end
  2831. copy_1_move_3:
  2832. MOVW (R11), R14
  2833. MOVB 2(R11), R15
  2834. MOVW R14, (R10)
  2835. MOVB R15, 2(R10)
  2836. ADDQ AX, R11
  2837. ADDQ AX, R10
  2838. JMP copy_1_end
  2839. copy_1_move_4through7:
  2840. MOVL (R11), R14
  2841. MOVL -4(R11)(AX*1), R15
  2842. MOVL R14, (R10)
  2843. MOVL R15, -4(R10)(AX*1)
  2844. ADDQ AX, R11
  2845. ADDQ AX, R10
  2846. JMP copy_1_end
  2847. copy_1_move_8through16:
  2848. MOVQ (R11), R14
  2849. MOVQ -8(R11)(AX*1), R15
  2850. MOVQ R14, (R10)
  2851. MOVQ R15, -8(R10)(AX*1)
  2852. ADDQ AX, R11
  2853. ADDQ AX, R10
  2854. copy_1_end:
        // outPosition += literal length.
  2855. ADDQ AX, R12
  2856. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2857. check_offset:
        // Both comparisons are signed (JG): offset vs. outPosition + 40(SP), then
        // offset vs. 56(SP).
  2858. MOVQ R12, AX
  2859. ADDQ 40(SP), AX
  2860. CMPQ CX, AX
  2861. JG error_match_off_too_big
  2862. CMPQ CX, 56(SP)
  2863. JG error_match_off_too_big
  2864. // Copy match from history
        // AX = offset - outPosition = number of bytes the match reaches back into
        // the history. If the offset is not above outPosition (unsigned) the match
        // lies entirely in the current output.
  2865. MOVQ CX, AX
  2866. SUBQ R12, AX
  2867. JLS copy_match
        // R14 = source pointer, AX bytes before the end of the history.
  2868. MOVQ 48(SP), R14
  2869. SUBQ AX, R14
        // If the match is longer than what the history provides, copy the history
        // part first and finish from the current output.
  2870. CMPQ R13, AX
  2871. JG copy_all_from_history
        // Whole match comes from the history: copy R13 bytes from R14 to R10.
  2872. MOVQ R13, AX
  2873. SUBQ $0x10, AX
  2874. JB copy_4_small
  2875. copy_4_loop:
  2876. MOVUPS (R14), X0
  2877. MOVUPS X0, (R10)
  2878. ADDQ $0x10, R14
  2879. ADDQ $0x10, R10
  2880. SUBQ $0x10, AX
  2881. JAE copy_4_loop
  2882. LEAQ 16(R14)(AX*1), R14
  2883. LEAQ 16(R10)(AX*1), R10
  2884. MOVUPS -16(R14), X0
  2885. MOVUPS X0, -16(R10)
  2886. JMP copy_4_end
        // NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-or-2 byte case
        // here, so lengths below 3 fall into the 4..7 path; presumably this relies
        // on the match length being at least 3 at this point - confirm.
  2887. copy_4_small:
  2888. CMPQ R13, $0x03
  2889. JE copy_4_move_3
  2890. CMPQ R13, $0x08
  2891. JB copy_4_move_4through7
  2892. JMP copy_4_move_8through16
  2893. copy_4_move_3:
  2894. MOVW (R14), AX
  2895. MOVB 2(R14), CL
  2896. MOVW AX, (R10)
  2897. MOVB CL, 2(R10)
  2898. ADDQ R13, R14
  2899. ADDQ R13, R10
  2900. JMP copy_4_end
  2901. copy_4_move_4through7:
  2902. MOVL (R14), AX
  2903. MOVL -4(R14)(R13*1), CX
  2904. MOVL AX, (R10)
  2905. MOVL CX, -4(R10)(R13*1)
  2906. ADDQ R13, R14
  2907. ADDQ R13, R10
  2908. JMP copy_4_end
  2909. copy_4_move_8through16:
  2910. MOVQ (R14), AX
  2911. MOVQ -8(R14)(R13*1), CX
  2912. MOVQ AX, (R10)
  2913. MOVQ CX, -8(R10)(R13*1)
  2914. ADDQ R13, R14
  2915. ADDQ R13, R10
  2916. copy_4_end:
        // outPosition += match length; the sequence is complete.
  2917. ADDQ R13, R12
  2918. JMP handle_loop
        // Unreachable: directly preceded by an unconditional JMP.
  2919. JMP loop_finished
        // Copy the AX bytes that remain in the history (R14 -> R10); the rest of
        // the match is then taken from the current output. R15 and BP are scratch.
  2920. copy_all_from_history:
  2921. MOVQ AX, R15
  2922. SUBQ $0x10, R15
  2923. JB copy_5_small
  2924. copy_5_loop:
  2925. MOVUPS (R14), X0
  2926. MOVUPS X0, (R10)
  2927. ADDQ $0x10, R14
  2928. ADDQ $0x10, R10
  2929. SUBQ $0x10, R15
  2930. JAE copy_5_loop
  2931. LEAQ 16(R14)(R15*1), R14
  2932. LEAQ 16(R10)(R15*1), R10
  2933. MOVUPS -16(R14), X0
  2934. MOVUPS X0, -16(R10)
  2935. JMP copy_5_end
  2936. copy_5_small:
  2937. CMPQ AX, $0x03
  2938. JE copy_5_move_3
  2939. JB copy_5_move_1or2
  2940. CMPQ AX, $0x08
  2941. JB copy_5_move_4through7
  2942. JMP copy_5_move_8through16
  2943. copy_5_move_1or2:
  2944. MOVB (R14), R15
  2945. MOVB -1(R14)(AX*1), BP
  2946. MOVB R15, (R10)
  2947. MOVB BP, -1(R10)(AX*1)
  2948. ADDQ AX, R14
  2949. ADDQ AX, R10
  2950. JMP copy_5_end
  2951. copy_5_move_3:
  2952. MOVW (R14), R15
  2953. MOVB 2(R14), BP
  2954. MOVW R15, (R10)
  2955. MOVB BP, 2(R10)
  2956. ADDQ AX, R14
  2957. ADDQ AX, R10
  2958. JMP copy_5_end
  2959. copy_5_move_4through7:
  2960. MOVL (R14), R15
  2961. MOVL -4(R14)(AX*1), BP
  2962. MOVL R15, (R10)
  2963. MOVL BP, -4(R10)(AX*1)
  2964. ADDQ AX, R14
  2965. ADDQ AX, R10
  2966. JMP copy_5_end
  2967. copy_5_move_8through16:
  2968. MOVQ (R14), R15
  2969. MOVQ -8(R14)(AX*1), BP
  2970. MOVQ R15, (R10)
  2971. MOVQ BP, -8(R10)(AX*1)
  2972. ADDQ AX, R14
  2973. ADDQ AX, R10
  2974. copy_5_end:
        // outPosition += bytes copied from history; R13 = match bytes still to copy.
  2975. ADDQ AX, R12
  2976. SUBQ AX, R13
  2977. // Copy match from the current buffer
  2978. copy_match:
        // AX = source pointer = write pointer - match offset.
  2979. MOVQ R10, AX
  2980. SUBQ CX, AX
  2981. // ml <= mo
        // When the remaining length exceeds the offset, source and destination
        // overlap and the bytes are copied one at a time.
  2982. CMPQ R13, CX
  2983. JA copy_overlapping_match
  2984. // Copy non-overlapping match
        // Copies R13 bytes from AX to R10 with the same chunked scheme as the
        // literal copy; CX and R14 are scratch.
  2985. ADDQ R13, R12
  2986. MOVQ R13, CX
  2987. SUBQ $0x10, CX
  2988. JB copy_2_small
  2989. copy_2_loop:
  2990. MOVUPS (AX), X0
  2991. MOVUPS X0, (R10)
  2992. ADDQ $0x10, AX
  2993. ADDQ $0x10, R10
  2994. SUBQ $0x10, CX
  2995. JAE copy_2_loop
  2996. LEAQ 16(AX)(CX*1), AX
  2997. LEAQ 16(R10)(CX*1), R10
  2998. MOVUPS -16(AX), X0
  2999. MOVUPS X0, -16(R10)
  3000. JMP copy_2_end
  3001. copy_2_small:
  3002. CMPQ R13, $0x03
  3003. JE copy_2_move_3
  3004. JB copy_2_move_1or2
  3005. CMPQ R13, $0x08
  3006. JB copy_2_move_4through7
  3007. JMP copy_2_move_8through16
  3008. copy_2_move_1or2:
  3009. MOVB (AX), CL
  3010. MOVB -1(AX)(R13*1), R14
  3011. MOVB CL, (R10)
  3012. MOVB R14, -1(R10)(R13*1)
  3013. ADDQ R13, AX
  3014. ADDQ R13, R10
  3015. JMP copy_2_end
  3016. copy_2_move_3:
  3017. MOVW (AX), CX
  3018. MOVB 2(AX), R14
  3019. MOVW CX, (R10)
  3020. MOVB R14, 2(R10)
  3021. ADDQ R13, AX
  3022. ADDQ R13, R10
  3023. JMP copy_2_end
  3024. copy_2_move_4through7:
  3025. MOVL (AX), CX
  3026. MOVL -4(AX)(R13*1), R14
  3027. MOVL CX, (R10)
  3028. MOVL R14, -4(R10)(R13*1)
  3029. ADDQ R13, AX
  3030. ADDQ R13, R10
  3031. JMP copy_2_end
  3032. copy_2_move_8through16:
  3033. MOVQ (AX), CX
  3034. MOVQ -8(AX)(R13*1), R14
  3035. MOVQ CX, (R10)
  3036. MOVQ R14, -8(R10)(R13*1)
  3037. ADDQ R13, AX
  3038. ADDQ R13, R10
  3039. copy_2_end:
  3040. JMP handle_loop
  3041. // Copy overlapping match
  3042. copy_overlapping_match:
        // outPosition is advanced up front; the loop then copies R13 bytes one by
        // one so that bytes written earlier in the loop can be read again.
  3043. ADDQ R13, R12
  3044. copy_slow_3:
  3045. MOVB (AX), CL
  3046. MOVB CL, (R10)
  3047. INCQ AX
  3048. INCQ R10
  3049. DECQ R13
  3050. JNZ copy_slow_3
  3051. handle_loop:
        // Decrement the counter at 96(ctx) and continue while it is not negative.
  3052. MOVQ ctx+16(FP), AX
  3053. DECQ 96(AX)
  3054. JNS sequenceDecs_decodeSync_safe_amd64_main_loop
  3055. loop_finished:
        // Write the bit reader state back: bit buffer, consumed-bit count and 8(br).
  3056. MOVQ br+8(FP), AX
  3057. MOVQ DX, 24(AX)
  3058. MOVB BL, 32(AX)
  3059. MOVQ SI, 8(AX)
  3060. // Update the context
        // 136(ctx) = output position; 168(ctx) = literals pointer minus 144(ctx),
        // i.e. the number of literal bytes consumed.
  3061. MOVQ ctx+16(FP), AX
  3062. MOVQ R12, 136(AX)
  3063. MOVQ 144(AX), CX
  3064. SUBQ CX, R11
  3065. MOVQ R11, 168(AX)
  3066. // Return success
  3067. MOVQ $0x00000000, ret+24(FP)
  3068. RET
  3069. // Return with match length error
        // Stores the match length at 216(ctx); returns 1.
  3070. sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3071. MOVQ 16(SP), AX
  3072. MOVQ ctx+16(FP), CX
  3073. MOVQ AX, 216(CX)
  3074. MOVQ $0x00000001, ret+24(FP)
  3075. RET
  3076. // Return with match too long error
        // Stores the match length at 216(ctx); returns 2.
  3077. sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3078. MOVQ ctx+16(FP), AX
  3079. MOVQ 16(SP), CX
  3080. MOVQ CX, 216(AX)
  3081. MOVQ $0x00000002, ret+24(FP)
  3082. RET
  3083. // Return with match offset too long error
        // Stores the match offset at 224(ctx) and the output position at 136(ctx); returns 3.
  3084. error_match_off_too_big:
  3085. MOVQ ctx+16(FP), AX
  3086. MOVQ 8(SP), CX
  3087. MOVQ CX, 224(AX)
  3088. MOVQ R12, 136(AX)
  3089. MOVQ $0x00000003, ret+24(FP)
  3090. RET
  3091. // Return with not enough literals error
        // Stores the literal length at 208(ctx); returns 4.
  3092. error_not_enough_literals:
  3093. MOVQ ctx+16(FP), AX
  3094. MOVQ 24(SP), CX
  3095. MOVQ CX, 208(AX)
  3096. MOVQ $0x00000004, ret+24(FP)
  3097. RET
  3098. // Return with overread error
        // Returns 6; nothing is written back to br or ctx on this path.
  3099. error_overread:
  3100. MOVQ $0x00000006, ret+24(FP)
  3101. RET
  3102. // Return with not enough output space error
        // Stores the literal length at 208(ctx), the match length at 216(ctx) and
        // the output position at 136(ctx); returns 5.
  3103. error_not_enough_space:
  3104. MOVQ ctx+16(FP), AX
  3105. MOVQ 24(SP), CX
  3106. MOVQ CX, 208(AX)
  3107. MOVQ 16(SP), CX
  3108. MOVQ CX, 216(AX)
  3109. MOVQ R12, 136(AX)
  3110. MOVQ $0x00000005, ret+24(FP)
  3111. RET
  3112. // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3113. // Requires: BMI, BMI2, CMOV, SSE
  3114. TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  3115. MOVQ br+8(FP), BX
  3116. MOVQ 24(BX), AX
  3117. MOVBQZX 32(BX), DX
  3118. MOVQ (BX), CX
  3119. MOVQ 8(BX), BX
  3120. ADDQ BX, CX
  3121. MOVQ CX, (SP)
  3122. MOVQ ctx+16(FP), CX
  3123. MOVQ 72(CX), SI
  3124. MOVQ 80(CX), DI
  3125. MOVQ 88(CX), R8
  3126. XORQ R9, R9
  3127. MOVQ R9, 8(SP)
  3128. MOVQ R9, 16(SP)
  3129. MOVQ R9, 24(SP)
  3130. MOVQ 112(CX), R9
  3131. MOVQ 128(CX), R10
  3132. MOVQ R10, 32(SP)
  3133. MOVQ 144(CX), R10
  3134. MOVQ 136(CX), R11
  3135. MOVQ 200(CX), R12
  3136. MOVQ R12, 56(SP)
  3137. MOVQ 176(CX), R12
  3138. MOVQ R12, 48(SP)
  3139. MOVQ 184(CX), CX
  3140. MOVQ CX, 40(SP)
  3141. MOVQ 40(SP), CX
  3142. ADDQ CX, 48(SP)
  3143. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3144. ADDQ R9, 32(SP)
  3145. // outBase += outPosition
  3146. ADDQ R11, R9
  3147. sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3148. MOVQ (SP), R12
  3149. // Fill bitreader to have enough for the offset and match length.
  3150. CMPQ BX, $0x08
  3151. JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3152. MOVQ DX, CX
  3153. SHRQ $0x03, CX
  3154. SUBQ CX, R12
  3155. MOVQ (R12), AX
  3156. SUBQ CX, BX
  3157. ANDQ $0x07, DX
  3158. JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
  3159. sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3160. CMPQ BX, $0x00
  3161. JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
  3162. CMPQ DX, $0x07
  3163. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3164. SHLQ $0x08, AX
  3165. SUBQ $0x01, R12
  3166. SUBQ $0x01, BX
  3167. SUBQ $0x08, DX
  3168. MOVBQZX (R12), CX
  3169. ORQ CX, AX
  3170. JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3171. sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
  3172. CMPQ DX, $0x40
  3173. JA error_overread
  3174. sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3175. // Update offset
  3176. MOVQ $0x00000808, CX
  3177. BEXTRQ CX, R8, R13
  3178. MOVQ AX, R14
  3179. LEAQ (DX)(R13*1), CX
  3180. ROLQ CL, R14
  3181. BZHIQ R13, R14, R14
  3182. MOVQ CX, DX
  3183. MOVQ R8, CX
  3184. SHRQ $0x20, CX
  3185. ADDQ R14, CX
  3186. MOVQ CX, 8(SP)
  3187. // Update match length
  3188. MOVQ $0x00000808, CX
  3189. BEXTRQ CX, DI, R13
  3190. MOVQ AX, R14
  3191. LEAQ (DX)(R13*1), CX
  3192. ROLQ CL, R14
  3193. BZHIQ R13, R14, R14
  3194. MOVQ CX, DX
  3195. MOVQ DI, CX
  3196. SHRQ $0x20, CX
  3197. ADDQ R14, CX
  3198. MOVQ CX, 16(SP)
  3199. // Fill bitreader to have enough for the remaining
  3200. CMPQ BX, $0x08
  3201. JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3202. MOVQ DX, CX
  3203. SHRQ $0x03, CX
  3204. SUBQ CX, R12
  3205. MOVQ (R12), AX
  3206. SUBQ CX, BX
  3207. ANDQ $0x07, DX
  3208. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3209. sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3210. CMPQ BX, $0x00
  3211. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
  3212. CMPQ DX, $0x07
  3213. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3214. SHLQ $0x08, AX
  3215. SUBQ $0x01, R12
  3216. SUBQ $0x01, BX
  3217. SUBQ $0x08, DX
  3218. MOVBQZX (R12), CX
  3219. ORQ CX, AX
  3220. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3221. sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
  3222. CMPQ DX, $0x40
  3223. JA error_overread
  3224. sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3225. // Update literal length
  3226. MOVQ $0x00000808, CX
  3227. BEXTRQ CX, SI, R13
  3228. MOVQ AX, R14
  3229. LEAQ (DX)(R13*1), CX
  3230. ROLQ CL, R14
  3231. BZHIQ R13, R14, R14
  3232. MOVQ CX, DX
  3233. MOVQ SI, CX
  3234. SHRQ $0x20, CX
  3235. ADDQ R14, CX
  3236. MOVQ CX, 24(SP)
  3237. // Fill bitreader for state updates
  3238. MOVQ R12, (SP)
  3239. MOVQ $0x00000808, CX
  3240. BEXTRQ CX, R8, R12
  3241. MOVQ ctx+16(FP), CX
  3242. CMPQ 96(CX), $0x00
  3243. JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
  3244. LEAQ (SI)(DI*1), R13
  3245. ADDQ R8, R13
  3246. MOVBQZX R13, R13
  3247. LEAQ (DX)(R13*1), CX
  3248. MOVQ AX, R14
  3249. MOVQ CX, DX
  3250. ROLQ CL, R14
  3251. BZHIQ R13, R14, R14
  3252. // Update Offset State
  3253. BZHIQ R8, R14, CX
  3254. SHRXQ R8, R14, R14
  3255. MOVQ $0x00001010, R13
  3256. BEXTRQ R13, R8, R8
  3257. ADDQ CX, R8
  3258. // Load ctx.ofTable
  3259. MOVQ ctx+16(FP), CX
  3260. MOVQ 48(CX), CX
  3261. MOVQ (CX)(R8*8), R8
  3262. // Update Match Length State
  3263. BZHIQ DI, R14, CX
  3264. SHRXQ DI, R14, R14
  3265. MOVQ $0x00001010, R13
  3266. BEXTRQ R13, DI, DI
  3267. ADDQ CX, DI
  3268. // Load ctx.mlTable
  3269. MOVQ ctx+16(FP), CX
  3270. MOVQ 24(CX), CX
  3271. MOVQ (CX)(DI*8), DI
  3272. // Update Literal Length State
  3273. BZHIQ SI, R14, CX
  3274. MOVQ $0x00001010, R13
  3275. BEXTRQ R13, SI, SI
  3276. ADDQ CX, SI
  3277. // Load ctx.llTable
  3278. MOVQ ctx+16(FP), CX
  3279. MOVQ (CX), CX
  3280. MOVQ (CX)(SI*8), SI
  3281. sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3282. // Adjust offset
  3283. MOVQ s+0(FP), CX
  3284. MOVQ 8(SP), R13
  3285. CMPQ R12, $0x01
  3286. JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3287. MOVUPS 144(CX), X0
  3288. MOVQ R13, 144(CX)
  3289. MOVUPS X0, 152(CX)
  3290. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3291. sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3292. CMPQ 24(SP), $0x00000000
  3293. JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3294. INCQ R13
  3295. JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3296. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3297. TESTQ R13, R13
  3298. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3299. MOVQ 144(CX), R13
  3300. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3301. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3302. MOVQ R13, R12
  3303. XORQ R14, R14
  3304. MOVQ $-1, R15
  3305. CMPQ R13, $0x03
  3306. CMOVQEQ R14, R12
  3307. CMOVQEQ R15, R14
  3308. ADDQ 144(CX)(R12*8), R14
  3309. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3310. MOVQ $0x00000001, R14
  3311. sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3312. CMPQ R13, $0x01
  3313. JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3314. MOVQ 152(CX), R12
  3315. MOVQ R12, 160(CX)
  3316. sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3317. MOVQ 144(CX), R12
  3318. MOVQ R12, 152(CX)
  3319. MOVQ R14, 144(CX)
  3320. MOVQ R14, R13
  3321. sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3322. MOVQ R13, 8(SP)
  3323. // Check values
  3324. MOVQ 16(SP), CX
  3325. MOVQ 24(SP), R12
  3326. LEAQ (CX)(R12*1), R14
  3327. MOVQ s+0(FP), R15
  3328. ADDQ R14, 256(R15)
  3329. MOVQ ctx+16(FP), R14
  3330. SUBQ R12, 104(R14)
  3331. JS error_not_enough_literals
  3332. CMPQ CX, $0x00020002
  3333. JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3334. TESTQ R13, R13
  3335. JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3336. TESTQ CX, CX
  3337. JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3338. sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3339. MOVQ 24(SP), CX
  3340. MOVQ 8(SP), R12
  3341. MOVQ 16(SP), R13
  3342. // Check if we have enough space in s.out
  3343. LEAQ (CX)(R13*1), R14
  3344. ADDQ R9, R14
  3345. CMPQ R14, 32(SP)
  3346. JA error_not_enough_space
  3347. // Copy literals
  3348. TESTQ CX, CX
  3349. JZ check_offset
  3350. MOVQ CX, R14
  3351. SUBQ $0x10, R14
  3352. JB copy_1_small
  3353. copy_1_loop:
  3354. MOVUPS (R10), X0
  3355. MOVUPS X0, (R9)
  3356. ADDQ $0x10, R10
  3357. ADDQ $0x10, R9
  3358. SUBQ $0x10, R14
  3359. JAE copy_1_loop
  3360. LEAQ 16(R10)(R14*1), R10
  3361. LEAQ 16(R9)(R14*1), R9
  3362. MOVUPS -16(R10), X0
  3363. MOVUPS X0, -16(R9)
  3364. JMP copy_1_end
  3365. copy_1_small:
  3366. CMPQ CX, $0x03
  3367. JE copy_1_move_3
  3368. JB copy_1_move_1or2
  3369. CMPQ CX, $0x08
  3370. JB copy_1_move_4through7
  3371. JMP copy_1_move_8through16
  3372. copy_1_move_1or2:
  3373. MOVB (R10), R14
  3374. MOVB -1(R10)(CX*1), R15
  3375. MOVB R14, (R9)
  3376. MOVB R15, -1(R9)(CX*1)
  3377. ADDQ CX, R10
  3378. ADDQ CX, R9
  3379. JMP copy_1_end
  3380. copy_1_move_3:
  3381. MOVW (R10), R14
  3382. MOVB 2(R10), R15
  3383. MOVW R14, (R9)
  3384. MOVB R15, 2(R9)
  3385. ADDQ CX, R10
  3386. ADDQ CX, R9
  3387. JMP copy_1_end
  3388. copy_1_move_4through7:
  3389. MOVL (R10), R14
  3390. MOVL -4(R10)(CX*1), R15
  3391. MOVL R14, (R9)
  3392. MOVL R15, -4(R9)(CX*1)
  3393. ADDQ CX, R10
  3394. ADDQ CX, R9
  3395. JMP copy_1_end
  3396. copy_1_move_8through16:
  3397. MOVQ (R10), R14
  3398. MOVQ -8(R10)(CX*1), R15
  3399. MOVQ R14, (R9)
  3400. MOVQ R15, -8(R9)(CX*1)
  3401. ADDQ CX, R10
  3402. ADDQ CX, R9
  3403. copy_1_end:
  3404. ADDQ CX, R11
  3405. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  3406. check_offset:
  3407. MOVQ R11, CX
  3408. ADDQ 40(SP), CX
  3409. CMPQ R12, CX
  3410. JG error_match_off_too_big
  3411. CMPQ R12, 56(SP)
  3412. JG error_match_off_too_big
  3413. // Copy match from history
  3414. MOVQ R12, CX
  3415. SUBQ R11, CX
  3416. JLS copy_match
  3417. MOVQ 48(SP), R14
  3418. SUBQ CX, R14
  3419. CMPQ R13, CX
  3420. JG copy_all_from_history
  3421. MOVQ R13, CX
  3422. SUBQ $0x10, CX
  3423. JB copy_4_small
  3424. copy_4_loop:
  3425. MOVUPS (R14), X0
  3426. MOVUPS X0, (R9)
  3427. ADDQ $0x10, R14
  3428. ADDQ $0x10, R9
  3429. SUBQ $0x10, CX
  3430. JAE copy_4_loop
  3431. LEAQ 16(R14)(CX*1), R14
  3432. LEAQ 16(R9)(CX*1), R9
  3433. MOVUPS -16(R14), X0
  3434. MOVUPS X0, -16(R9)
  3435. JMP copy_4_end
  3436. copy_4_small:
  3437. CMPQ R13, $0x03
  3438. JE copy_4_move_3
  3439. CMPQ R13, $0x08
  3440. JB copy_4_move_4through7
  3441. JMP copy_4_move_8through16
  3442. copy_4_move_3:
  3443. MOVW (R14), CX
  3444. MOVB 2(R14), R12
  3445. MOVW CX, (R9)
  3446. MOVB R12, 2(R9)
  3447. ADDQ R13, R14
  3448. ADDQ R13, R9
  3449. JMP copy_4_end
  3450. copy_4_move_4through7:
  3451. MOVL (R14), CX
  3452. MOVL -4(R14)(R13*1), R12
  3453. MOVL CX, (R9)
  3454. MOVL R12, -4(R9)(R13*1)
  3455. ADDQ R13, R14
  3456. ADDQ R13, R9
  3457. JMP copy_4_end
  3458. copy_4_move_8through16:
  3459. MOVQ (R14), CX
  3460. MOVQ -8(R14)(R13*1), R12
  3461. MOVQ CX, (R9)
  3462. MOVQ R12, -8(R9)(R13*1)
  3463. ADDQ R13, R14
  3464. ADDQ R13, R9
  3465. copy_4_end:
  3466. ADDQ R13, R11
  3467. JMP handle_loop
  3468. JMP loop_finished
  3469. copy_all_from_history:
  3470. MOVQ CX, R15
  3471. SUBQ $0x10, R15
  3472. JB copy_5_small
  3473. copy_5_loop:
  3474. MOVUPS (R14), X0
  3475. MOVUPS X0, (R9)
  3476. ADDQ $0x10, R14
  3477. ADDQ $0x10, R9
  3478. SUBQ $0x10, R15
  3479. JAE copy_5_loop
  3480. LEAQ 16(R14)(R15*1), R14
  3481. LEAQ 16(R9)(R15*1), R9
  3482. MOVUPS -16(R14), X0
  3483. MOVUPS X0, -16(R9)
  3484. JMP copy_5_end
  3485. copy_5_small:
  3486. CMPQ CX, $0x03
  3487. JE copy_5_move_3
  3488. JB copy_5_move_1or2
  3489. CMPQ CX, $0x08
  3490. JB copy_5_move_4through7
  3491. JMP copy_5_move_8through16
  3492. copy_5_move_1or2:
  3493. MOVB (R14), R15
  3494. MOVB -1(R14)(CX*1), BP
  3495. MOVB R15, (R9)
  3496. MOVB BP, -1(R9)(CX*1)
  3497. ADDQ CX, R14
  3498. ADDQ CX, R9
  3499. JMP copy_5_end
  3500. copy_5_move_3:
  3501. MOVW (R14), R15
  3502. MOVB 2(R14), BP
  3503. MOVW R15, (R9)
  3504. MOVB BP, 2(R9)
  3505. ADDQ CX, R14
  3506. ADDQ CX, R9
  3507. JMP copy_5_end
  3508. copy_5_move_4through7:
  3509. MOVL (R14), R15
  3510. MOVL -4(R14)(CX*1), BP
  3511. MOVL R15, (R9)
  3512. MOVL BP, -4(R9)(CX*1)
  3513. ADDQ CX, R14
  3514. ADDQ CX, R9
  3515. JMP copy_5_end
  3516. copy_5_move_8through16:
  3517. MOVQ (R14), R15
  3518. MOVQ -8(R14)(CX*1), BP
  3519. MOVQ R15, (R9)
  3520. MOVQ BP, -8(R9)(CX*1)
  3521. ADDQ CX, R14
  3522. ADDQ CX, R9
  3523. copy_5_end:
  3524. ADDQ CX, R11
  3525. SUBQ CX, R13
  3526. // Copy match from the current buffer
  3527. copy_match:
  3528. MOVQ R9, CX
  3529. SUBQ R12, CX
  3530. // ml <= mo
  3531. CMPQ R13, R12
  3532. JA copy_overlapping_match
  3533. // Copy non-overlapping match
  3534. ADDQ R13, R11
  3535. MOVQ R13, R12
  3536. SUBQ $0x10, R12
  3537. JB copy_2_small
  3538. copy_2_loop:
  3539. MOVUPS (CX), X0
  3540. MOVUPS X0, (R9)
  3541. ADDQ $0x10, CX
  3542. ADDQ $0x10, R9
  3543. SUBQ $0x10, R12
  3544. JAE copy_2_loop
  3545. LEAQ 16(CX)(R12*1), CX
  3546. LEAQ 16(R9)(R12*1), R9
  3547. MOVUPS -16(CX), X0
  3548. MOVUPS X0, -16(R9)
  3549. JMP copy_2_end
  3550. copy_2_small:
  3551. CMPQ R13, $0x03
  3552. JE copy_2_move_3
  3553. JB copy_2_move_1or2
  3554. CMPQ R13, $0x08
  3555. JB copy_2_move_4through7
  3556. JMP copy_2_move_8through16
  3557. copy_2_move_1or2:
  3558. MOVB (CX), R12
  3559. MOVB -1(CX)(R13*1), R14
  3560. MOVB R12, (R9)
  3561. MOVB R14, -1(R9)(R13*1)
  3562. ADDQ R13, CX
  3563. ADDQ R13, R9
  3564. JMP copy_2_end
  3565. copy_2_move_3:
  3566. MOVW (CX), R12
  3567. MOVB 2(CX), R14
  3568. MOVW R12, (R9)
  3569. MOVB R14, 2(R9)
  3570. ADDQ R13, CX
  3571. ADDQ R13, R9
  3572. JMP copy_2_end
  3573. copy_2_move_4through7:
  3574. MOVL (CX), R12
  3575. MOVL -4(CX)(R13*1), R14
  3576. MOVL R12, (R9)
  3577. MOVL R14, -4(R9)(R13*1)
  3578. ADDQ R13, CX
  3579. ADDQ R13, R9
  3580. JMP copy_2_end
  3581. copy_2_move_8through16:
  3582. MOVQ (CX), R12
  3583. MOVQ -8(CX)(R13*1), R14
  3584. MOVQ R12, (R9)
  3585. MOVQ R14, -8(R9)(R13*1)
  3586. ADDQ R13, CX
  3587. ADDQ R13, R9
  3588. copy_2_end:
  3589. JMP handle_loop
  3590. // Copy overlapping match
  3591. copy_overlapping_match:
  3592. ADDQ R13, R11
  3593. copy_slow_3:
  3594. MOVB (CX), R12
  3595. MOVB R12, (R9)
  3596. INCQ CX
  3597. INCQ R9
  3598. DECQ R13
  3599. JNZ copy_slow_3
  3600. handle_loop:
  3601. MOVQ ctx+16(FP), CX
  3602. DECQ 96(CX)
  3603. JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
  3604. loop_finished:
  3605. MOVQ br+8(FP), CX
  3606. MOVQ AX, 24(CX)
  3607. MOVB DL, 32(CX)
  3608. MOVQ BX, 8(CX)
  3609. // Update the context
  3610. MOVQ ctx+16(FP), AX
  3611. MOVQ R11, 136(AX)
  3612. MOVQ 144(AX), CX
  3613. SUBQ CX, R10
  3614. MOVQ R10, 168(AX)
  3615. // Return success
  3616. MOVQ $0x00000000, ret+24(FP)
  3617. RET
  3618. // Return with match length error
  3619. sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  3620. MOVQ 16(SP), AX
  3621. MOVQ ctx+16(FP), CX
  3622. MOVQ AX, 216(CX)
  3623. MOVQ $0x00000001, ret+24(FP)
  3624. RET
  3625. // Return with match too long error
  3626. sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  3627. MOVQ ctx+16(FP), AX
  3628. MOVQ 16(SP), CX
  3629. MOVQ CX, 216(AX)
  3630. MOVQ $0x00000002, ret+24(FP)
  3631. RET
  3632. // Return with match offset too long error
  3633. error_match_off_too_big:
  3634. MOVQ ctx+16(FP), AX
  3635. MOVQ 8(SP), CX
  3636. MOVQ CX, 224(AX)
  3637. MOVQ R11, 136(AX)
  3638. MOVQ $0x00000003, ret+24(FP)
  3639. RET
  3640. // Return with not enough literals error
  3641. error_not_enough_literals:
  3642. MOVQ ctx+16(FP), AX
  3643. MOVQ 24(SP), CX
  3644. MOVQ CX, 208(AX)
  3645. MOVQ $0x00000004, ret+24(FP)
  3646. RET
  3647. // Return with overread error
  3648. error_overread:
  3649. MOVQ $0x00000006, ret+24(FP)
  3650. RET
  3651. // Return with not enough output space error
  3652. error_not_enough_space:
  3653. MOVQ ctx+16(FP), AX
  3654. MOVQ 24(SP), CX
  3655. MOVQ CX, 208(AX)
  3656. MOVQ 16(SP), CX
  3657. MOVQ CX, 216(AX)
  3658. MOVQ R11, 136(AX)
  3659. MOVQ $0x00000005, ret+24(FP)
  3660. RET