encodeblock_amd64.s 521 KB


  1. // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. #include "textflag.h"
  4. // func _dummy_()
  // _dummy_ is declared with a zero-byte frame ($0) and its only instruction is RET.
  // The preprocessor block inside its body makes GOAMD64_v4 imply GOAMD64_v3:
  // when GOAMD64_v4 is defined and GOAMD64_v3 is not, GOAMD64_v3 gets defined.
  // Later code in this file tests only GOAMD64_v3 (for example to select TZCNTQ
  // instead of BSFQ in the matchLen sections), so a v4 build takes the same paths.
  // NOTE(review): presumably this function exists only to host these directives
  // in the generated output -- confirm against gen.go.
  5. TEXT ·_dummy_(SB), $0
  6. #ifdef GOAMD64_v4
  7. #ifndef GOAMD64_v3
  8. #define GOAMD64_v3
  9. #endif
  10. #endif
  11. RET
  12. // func encodeBlockAsm(dst []byte, src []byte) int
  13. // Requires: BMI, SSE2
  14. TEXT ·encodeBlockAsm(SB), $65560-56
  15. MOVQ dst_base+0(FP), AX
  16. MOVQ $0x00000200, CX
  17. LEAQ 24(SP), DX
  18. PXOR X0, X0
  19. zero_loop_encodeBlockAsm:
  20. MOVOU X0, (DX)
  21. MOVOU X0, 16(DX)
  22. MOVOU X0, 32(DX)
  23. MOVOU X0, 48(DX)
  24. MOVOU X0, 64(DX)
  25. MOVOU X0, 80(DX)
  26. MOVOU X0, 96(DX)
  27. MOVOU X0, 112(DX)
  28. ADDQ $0x80, DX
  29. DECQ CX
  30. JNZ zero_loop_encodeBlockAsm
  31. MOVL $0x00000000, 12(SP)
  32. MOVQ src_len+32(FP), CX
  33. LEAQ -9(CX), DX
  34. LEAQ -8(CX), BX
  35. MOVL BX, 8(SP)
  36. SHRQ $0x05, CX
  37. SUBL CX, DX
  38. LEAQ (AX)(DX*1), DX
  39. MOVQ DX, (SP)
  40. MOVL $0x00000001, CX
  41. MOVL CX, 16(SP)
  42. MOVQ src_base+24(FP), DX
  43. search_loop_encodeBlockAsm:
  44. MOVL CX, BX
  45. SUBL 12(SP), BX
  46. SHRL $0x06, BX
  47. LEAL 4(CX)(BX*1), BX
  48. CMPL BX, 8(SP)
  49. JAE emit_remainder_encodeBlockAsm
  50. MOVQ (DX)(CX*1), SI
  51. MOVL BX, 20(SP)
  52. MOVQ $0x0000cf1bbcdcbf9b, R8
  53. MOVQ SI, R9
  54. MOVQ SI, R10
  55. SHRQ $0x08, R10
  56. SHLQ $0x10, R9
  57. IMULQ R8, R9
  58. SHRQ $0x32, R9
  59. SHLQ $0x10, R10
  60. IMULQ R8, R10
  61. SHRQ $0x32, R10
  62. MOVL 24(SP)(R9*4), BX
  63. MOVL 24(SP)(R10*4), DI
  64. MOVL CX, 24(SP)(R9*4)
  65. LEAL 1(CX), R9
  66. MOVL R9, 24(SP)(R10*4)
  67. MOVQ SI, R9
  68. SHRQ $0x10, R9
  69. SHLQ $0x10, R9
  70. IMULQ R8, R9
  71. SHRQ $0x32, R9
  72. MOVL CX, R8
  73. SUBL 16(SP), R8
  74. MOVL 1(DX)(R8*1), R10
  75. MOVQ SI, R8
  76. SHRQ $0x08, R8
  77. CMPL R8, R10
  78. JNE no_repeat_found_encodeBlockAsm
  79. LEAL 1(CX), SI
  80. MOVL 12(SP), DI
  81. MOVL SI, BX
  82. SUBL 16(SP), BX
  83. JZ repeat_extend_back_end_encodeBlockAsm
  84. repeat_extend_back_loop_encodeBlockAsm:
  85. CMPL SI, DI
  86. JBE repeat_extend_back_end_encodeBlockAsm
  87. MOVB -1(DX)(BX*1), R8
  88. MOVB -1(DX)(SI*1), R9
  89. CMPB R8, R9
  90. JNE repeat_extend_back_end_encodeBlockAsm
  91. LEAL -1(SI), SI
  92. DECL BX
  93. JNZ repeat_extend_back_loop_encodeBlockAsm
  94. repeat_extend_back_end_encodeBlockAsm:
  95. MOVL 12(SP), BX
  96. CMPL BX, SI
  97. JEQ emit_literal_done_repeat_emit_encodeBlockAsm
  98. MOVL SI, R8
  99. MOVL SI, 12(SP)
  100. LEAQ (DX)(BX*1), R9
  101. SUBL BX, R8
  102. LEAL -1(R8), BX
  103. CMPL BX, $0x3c
  104. JB one_byte_repeat_emit_encodeBlockAsm
  105. CMPL BX, $0x00000100
  106. JB two_bytes_repeat_emit_encodeBlockAsm
  107. CMPL BX, $0x00010000
  108. JB three_bytes_repeat_emit_encodeBlockAsm
  109. CMPL BX, $0x01000000
  110. JB four_bytes_repeat_emit_encodeBlockAsm
  111. MOVB $0xfc, (AX)
  112. MOVL BX, 1(AX)
  113. ADDQ $0x05, AX
  114. JMP memmove_long_repeat_emit_encodeBlockAsm
  115. four_bytes_repeat_emit_encodeBlockAsm:
  116. MOVL BX, R10
  117. SHRL $0x10, R10
  118. MOVB $0xf8, (AX)
  119. MOVW BX, 1(AX)
  120. MOVB R10, 3(AX)
  121. ADDQ $0x04, AX
  122. JMP memmove_long_repeat_emit_encodeBlockAsm
  123. three_bytes_repeat_emit_encodeBlockAsm:
  124. MOVB $0xf4, (AX)
  125. MOVW BX, 1(AX)
  126. ADDQ $0x03, AX
  127. JMP memmove_long_repeat_emit_encodeBlockAsm
  128. two_bytes_repeat_emit_encodeBlockAsm:
  129. MOVB $0xf0, (AX)
  130. MOVB BL, 1(AX)
  131. ADDQ $0x02, AX
  132. CMPL BX, $0x40
  133. JB memmove_repeat_emit_encodeBlockAsm
  134. JMP memmove_long_repeat_emit_encodeBlockAsm
  135. one_byte_repeat_emit_encodeBlockAsm:
  136. SHLB $0x02, BL
  137. MOVB BL, (AX)
  138. ADDQ $0x01, AX
  139. memmove_repeat_emit_encodeBlockAsm:
  140. LEAQ (AX)(R8*1), BX
  141. // genMemMoveShort
  142. CMPQ R8, $0x08
  143. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
  144. CMPQ R8, $0x10
  145. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
  146. CMPQ R8, $0x20
  147. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
  148. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
  149. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
  150. MOVQ (R9), R10
  151. MOVQ R10, (AX)
  152. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  153. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
  154. MOVQ (R9), R10
  155. MOVQ -8(R9)(R8*1), R9
  156. MOVQ R10, (AX)
  157. MOVQ R9, -8(AX)(R8*1)
  158. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  159. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
  160. MOVOU (R9), X0
  161. MOVOU -16(R9)(R8*1), X1
  162. MOVOU X0, (AX)
  163. MOVOU X1, -16(AX)(R8*1)
  164. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  165. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
  166. MOVOU (R9), X0
  167. MOVOU 16(R9), X1
  168. MOVOU -32(R9)(R8*1), X2
  169. MOVOU -16(R9)(R8*1), X3
  170. MOVOU X0, (AX)
  171. MOVOU X1, 16(AX)
  172. MOVOU X2, -32(AX)(R8*1)
  173. MOVOU X3, -16(AX)(R8*1)
  174. memmove_end_copy_repeat_emit_encodeBlockAsm:
  175. MOVQ BX, AX
  176. JMP emit_literal_done_repeat_emit_encodeBlockAsm
  177. memmove_long_repeat_emit_encodeBlockAsm:
  178. LEAQ (AX)(R8*1), BX
  179. // genMemMoveLong
  180. MOVOU (R9), X0
  181. MOVOU 16(R9), X1
  182. MOVOU -32(R9)(R8*1), X2
  183. MOVOU -16(R9)(R8*1), X3
  184. MOVQ R8, R11
  185. SHRQ $0x05, R11
  186. MOVQ AX, R10
  187. ANDL $0x0000001f, R10
  188. MOVQ $0x00000040, R12
  189. SUBQ R10, R12
  190. DECQ R11
  191. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
  192. LEAQ -32(R9)(R12*1), R10
  193. LEAQ -32(AX)(R12*1), R13
  194. emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
  195. MOVOU (R10), X4
  196. MOVOU 16(R10), X5
  197. MOVOA X4, (R13)
  198. MOVOA X5, 16(R13)
  199. ADDQ $0x20, R13
  200. ADDQ $0x20, R10
  201. ADDQ $0x20, R12
  202. DECQ R11
  203. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
  204. emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
  205. MOVOU -32(R9)(R12*1), X4
  206. MOVOU -16(R9)(R12*1), X5
  207. MOVOA X4, -32(AX)(R12*1)
  208. MOVOA X5, -16(AX)(R12*1)
  209. ADDQ $0x20, R12
  210. CMPQ R8, R12
  211. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
  212. MOVOU X0, (AX)
  213. MOVOU X1, 16(AX)
  214. MOVOU X2, -32(AX)(R8*1)
  215. MOVOU X3, -16(AX)(R8*1)
  216. MOVQ BX, AX
  217. emit_literal_done_repeat_emit_encodeBlockAsm:
  218. ADDL $0x05, CX
  219. MOVL CX, BX
  220. SUBL 16(SP), BX
  221. MOVQ src_len+32(FP), R8
  222. SUBL CX, R8
  223. LEAQ (DX)(CX*1), R9
  224. LEAQ (DX)(BX*1), BX
  225. // matchLen
  226. XORL R11, R11
  227. matchlen_loopback_16_repeat_extend_encodeBlockAsm:
  228. CMPL R8, $0x10
  229. JB matchlen_match8_repeat_extend_encodeBlockAsm
  230. MOVQ (R9)(R11*1), R10
  231. MOVQ 8(R9)(R11*1), R12
  232. XORQ (BX)(R11*1), R10
  233. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
  234. XORQ 8(BX)(R11*1), R12
  235. JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
  236. LEAL -16(R8), R8
  237. LEAL 16(R11), R11
  238. JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm
  239. matchlen_bsf_16repeat_extend_encodeBlockAsm:
  240. #ifdef GOAMD64_v3
  241. TZCNTQ R12, R12
  242. #else
  243. BSFQ R12, R12
  244. #endif
  245. SARQ $0x03, R12
  246. LEAL 8(R11)(R12*1), R11
  247. JMP repeat_extend_forward_end_encodeBlockAsm
  248. matchlen_match8_repeat_extend_encodeBlockAsm:
  249. CMPL R8, $0x08
  250. JB matchlen_match4_repeat_extend_encodeBlockAsm
  251. MOVQ (R9)(R11*1), R10
  252. XORQ (BX)(R11*1), R10
  253. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
  254. LEAL -8(R8), R8
  255. LEAL 8(R11), R11
  256. JMP matchlen_match4_repeat_extend_encodeBlockAsm
  257. matchlen_bsf_8_repeat_extend_encodeBlockAsm:
  258. #ifdef GOAMD64_v3
  259. TZCNTQ R10, R10
  260. #else
  261. BSFQ R10, R10
  262. #endif
  263. SARQ $0x03, R10
  264. LEAL (R11)(R10*1), R11
  265. JMP repeat_extend_forward_end_encodeBlockAsm
  266. matchlen_match4_repeat_extend_encodeBlockAsm:
  267. CMPL R8, $0x04
  268. JB matchlen_match2_repeat_extend_encodeBlockAsm
  269. MOVL (R9)(R11*1), R10
  270. CMPL (BX)(R11*1), R10
  271. JNE matchlen_match2_repeat_extend_encodeBlockAsm
  272. LEAL -4(R8), R8
  273. LEAL 4(R11), R11
  274. matchlen_match2_repeat_extend_encodeBlockAsm:
  275. CMPL R8, $0x01
  276. JE matchlen_match1_repeat_extend_encodeBlockAsm
  277. JB repeat_extend_forward_end_encodeBlockAsm
  278. MOVW (R9)(R11*1), R10
  279. CMPW (BX)(R11*1), R10
  280. JNE matchlen_match1_repeat_extend_encodeBlockAsm
  281. LEAL 2(R11), R11
  282. SUBL $0x02, R8
  283. JZ repeat_extend_forward_end_encodeBlockAsm
  284. matchlen_match1_repeat_extend_encodeBlockAsm:
  285. MOVB (R9)(R11*1), R10
  286. CMPB (BX)(R11*1), R10
  287. JNE repeat_extend_forward_end_encodeBlockAsm
  288. LEAL 1(R11), R11
  289. repeat_extend_forward_end_encodeBlockAsm:
  290. ADDL R11, CX
  291. MOVL CX, BX
  292. SUBL SI, BX
  293. MOVL 16(SP), SI
  294. TESTL DI, DI
  295. JZ repeat_as_copy_encodeBlockAsm
  296. // emitRepeat
  297. emit_repeat_again_match_repeat_encodeBlockAsm:
  298. MOVL BX, DI
  299. LEAL -4(BX), BX
  300. CMPL DI, $0x08
  301. JBE repeat_two_match_repeat_encodeBlockAsm
  302. CMPL DI, $0x0c
  303. JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm
  304. CMPL SI, $0x00000800
  305. JB repeat_two_offset_match_repeat_encodeBlockAsm
  306. cant_repeat_two_offset_match_repeat_encodeBlockAsm:
  307. CMPL BX, $0x00000104
  308. JB repeat_three_match_repeat_encodeBlockAsm
  309. CMPL BX, $0x00010100
  310. JB repeat_four_match_repeat_encodeBlockAsm
  311. CMPL BX, $0x0100ffff
  312. JB repeat_five_match_repeat_encodeBlockAsm
  313. LEAL -16842747(BX), BX
  314. MOVL $0xfffb001d, (AX)
  315. MOVB $0xff, 4(AX)
  316. ADDQ $0x05, AX
  317. JMP emit_repeat_again_match_repeat_encodeBlockAsm
  318. repeat_five_match_repeat_encodeBlockAsm:
  319. LEAL -65536(BX), BX
  320. MOVL BX, SI
  321. MOVW $0x001d, (AX)
  322. MOVW BX, 2(AX)
  323. SARL $0x10, SI
  324. MOVB SI, 4(AX)
  325. ADDQ $0x05, AX
  326. JMP repeat_end_emit_encodeBlockAsm
  327. repeat_four_match_repeat_encodeBlockAsm:
  328. LEAL -256(BX), BX
  329. MOVW $0x0019, (AX)
  330. MOVW BX, 2(AX)
  331. ADDQ $0x04, AX
  332. JMP repeat_end_emit_encodeBlockAsm
  333. repeat_three_match_repeat_encodeBlockAsm:
  334. LEAL -4(BX), BX
  335. MOVW $0x0015, (AX)
  336. MOVB BL, 2(AX)
  337. ADDQ $0x03, AX
  338. JMP repeat_end_emit_encodeBlockAsm
  339. repeat_two_match_repeat_encodeBlockAsm:
  340. SHLL $0x02, BX
  341. ORL $0x01, BX
  342. MOVW BX, (AX)
  343. ADDQ $0x02, AX
  344. JMP repeat_end_emit_encodeBlockAsm
  345. repeat_two_offset_match_repeat_encodeBlockAsm:
  346. XORQ DI, DI
  347. LEAL 1(DI)(BX*4), BX
  348. MOVB SI, 1(AX)
  349. SARL $0x08, SI
  350. SHLL $0x05, SI
  351. ORL SI, BX
  352. MOVB BL, (AX)
  353. ADDQ $0x02, AX
  354. JMP repeat_end_emit_encodeBlockAsm
  355. repeat_as_copy_encodeBlockAsm:
  356. // emitCopy
  357. CMPL SI, $0x00010000
  358. JB two_byte_offset_repeat_as_copy_encodeBlockAsm
  359. CMPL BX, $0x40
  360. JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm
  361. MOVB $0xff, (AX)
  362. MOVL SI, 1(AX)
  363. LEAL -64(BX), BX
  364. ADDQ $0x05, AX
  365. CMPL BX, $0x04
  366. JB four_bytes_remain_repeat_as_copy_encodeBlockAsm
  367. // emitRepeat
  368. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
  369. MOVL BX, DI
  370. LEAL -4(BX), BX
  371. CMPL DI, $0x08
  372. JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
  373. CMPL DI, $0x0c
  374. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
  375. CMPL SI, $0x00000800
  376. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
  377. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
  378. CMPL BX, $0x00000104
  379. JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
  380. CMPL BX, $0x00010100
  381. JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
  382. CMPL BX, $0x0100ffff
  383. JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
  384. LEAL -16842747(BX), BX
  385. MOVL $0xfffb001d, (AX)
  386. MOVB $0xff, 4(AX)
  387. ADDQ $0x05, AX
  388. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
  389. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
  390. LEAL -65536(BX), BX
  391. MOVL BX, SI
  392. MOVW $0x001d, (AX)
  393. MOVW BX, 2(AX)
  394. SARL $0x10, SI
  395. MOVB SI, 4(AX)
  396. ADDQ $0x05, AX
  397. JMP repeat_end_emit_encodeBlockAsm
  398. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
  399. LEAL -256(BX), BX
  400. MOVW $0x0019, (AX)
  401. MOVW BX, 2(AX)
  402. ADDQ $0x04, AX
  403. JMP repeat_end_emit_encodeBlockAsm
  404. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
  405. LEAL -4(BX), BX
  406. MOVW $0x0015, (AX)
  407. MOVB BL, 2(AX)
  408. ADDQ $0x03, AX
  409. JMP repeat_end_emit_encodeBlockAsm
  410. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
  411. SHLL $0x02, BX
  412. ORL $0x01, BX
  413. MOVW BX, (AX)
  414. ADDQ $0x02, AX
  415. JMP repeat_end_emit_encodeBlockAsm
  416. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
  417. XORQ DI, DI
  418. LEAL 1(DI)(BX*4), BX
  419. MOVB SI, 1(AX)
  420. SARL $0x08, SI
  421. SHLL $0x05, SI
  422. ORL SI, BX
  423. MOVB BL, (AX)
  424. ADDQ $0x02, AX
  425. JMP repeat_end_emit_encodeBlockAsm
  426. four_bytes_remain_repeat_as_copy_encodeBlockAsm:
  427. TESTL BX, BX
  428. JZ repeat_end_emit_encodeBlockAsm
  429. XORL DI, DI
  430. LEAL -1(DI)(BX*4), BX
  431. MOVB BL, (AX)
  432. MOVL SI, 1(AX)
  433. ADDQ $0x05, AX
  434. JMP repeat_end_emit_encodeBlockAsm
  435. two_byte_offset_repeat_as_copy_encodeBlockAsm:
  436. CMPL BX, $0x40
  437. JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
  438. CMPL SI, $0x00000800
  439. JAE long_offset_short_repeat_as_copy_encodeBlockAsm
  440. MOVL $0x00000001, DI
  441. LEAL 16(DI), DI
  442. MOVB SI, 1(AX)
  443. MOVL SI, R8
  444. SHRL $0x08, R8
  445. SHLL $0x05, R8
  446. ORL R8, DI
  447. MOVB DI, (AX)
  448. ADDQ $0x02, AX
  449. SUBL $0x08, BX
  450. // emitRepeat
  451. LEAL -4(BX), BX
  452. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  453. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  454. MOVL BX, DI
  455. LEAL -4(BX), BX
  456. CMPL DI, $0x08
  457. JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  458. CMPL DI, $0x0c
  459. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  460. CMPL SI, $0x00000800
  461. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  462. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  463. CMPL BX, $0x00000104
  464. JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  465. CMPL BX, $0x00010100
  466. JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  467. CMPL BX, $0x0100ffff
  468. JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  469. LEAL -16842747(BX), BX
  470. MOVL $0xfffb001d, (AX)
  471. MOVB $0xff, 4(AX)
  472. ADDQ $0x05, AX
  473. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  474. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  475. LEAL -65536(BX), BX
  476. MOVL BX, SI
  477. MOVW $0x001d, (AX)
  478. MOVW BX, 2(AX)
  479. SARL $0x10, SI
  480. MOVB SI, 4(AX)
  481. ADDQ $0x05, AX
  482. JMP repeat_end_emit_encodeBlockAsm
  483. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  484. LEAL -256(BX), BX
  485. MOVW $0x0019, (AX)
  486. MOVW BX, 2(AX)
  487. ADDQ $0x04, AX
  488. JMP repeat_end_emit_encodeBlockAsm
  489. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  490. LEAL -4(BX), BX
  491. MOVW $0x0015, (AX)
  492. MOVB BL, 2(AX)
  493. ADDQ $0x03, AX
  494. JMP repeat_end_emit_encodeBlockAsm
  495. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  496. SHLL $0x02, BX
  497. ORL $0x01, BX
  498. MOVW BX, (AX)
  499. ADDQ $0x02, AX
  500. JMP repeat_end_emit_encodeBlockAsm
  501. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  502. XORQ DI, DI
  503. LEAL 1(DI)(BX*4), BX
  504. MOVB SI, 1(AX)
  505. SARL $0x08, SI
  506. SHLL $0x05, SI
  507. ORL SI, BX
  508. MOVB BL, (AX)
  509. ADDQ $0x02, AX
  510. JMP repeat_end_emit_encodeBlockAsm
  511. long_offset_short_repeat_as_copy_encodeBlockAsm:
  512. MOVB $0xee, (AX)
  513. MOVW SI, 1(AX)
  514. LEAL -60(BX), BX
  515. ADDQ $0x03, AX
  516. // emitRepeat
  517. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  518. MOVL BX, DI
  519. LEAL -4(BX), BX
  520. CMPL DI, $0x08
  521. JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
  522. CMPL DI, $0x0c
  523. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
  524. CMPL SI, $0x00000800
  525. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
  526. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  527. CMPL BX, $0x00000104
  528. JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
  529. CMPL BX, $0x00010100
  530. JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
  531. CMPL BX, $0x0100ffff
  532. JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
  533. LEAL -16842747(BX), BX
  534. MOVL $0xfffb001d, (AX)
  535. MOVB $0xff, 4(AX)
  536. ADDQ $0x05, AX
  537. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
  538. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  539. LEAL -65536(BX), BX
  540. MOVL BX, SI
  541. MOVW $0x001d, (AX)
  542. MOVW BX, 2(AX)
  543. SARL $0x10, SI
  544. MOVB SI, 4(AX)
  545. ADDQ $0x05, AX
  546. JMP repeat_end_emit_encodeBlockAsm
  547. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  548. LEAL -256(BX), BX
  549. MOVW $0x0019, (AX)
  550. MOVW BX, 2(AX)
  551. ADDQ $0x04, AX
  552. JMP repeat_end_emit_encodeBlockAsm
  553. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  554. LEAL -4(BX), BX
  555. MOVW $0x0015, (AX)
  556. MOVB BL, 2(AX)
  557. ADDQ $0x03, AX
  558. JMP repeat_end_emit_encodeBlockAsm
  559. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  560. SHLL $0x02, BX
  561. ORL $0x01, BX
  562. MOVW BX, (AX)
  563. ADDQ $0x02, AX
  564. JMP repeat_end_emit_encodeBlockAsm
  565. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  566. XORQ DI, DI
  567. LEAL 1(DI)(BX*4), BX
  568. MOVB SI, 1(AX)
  569. SARL $0x08, SI
  570. SHLL $0x05, SI
  571. ORL SI, BX
  572. MOVB BL, (AX)
  573. ADDQ $0x02, AX
  574. JMP repeat_end_emit_encodeBlockAsm
  575. two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
  576. MOVL BX, DI
  577. SHLL $0x02, DI
  578. CMPL BX, $0x0c
  579. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
  580. CMPL SI, $0x00000800
  581. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
  582. LEAL -15(DI), DI
  583. MOVB SI, 1(AX)
  584. SHRL $0x08, SI
  585. SHLL $0x05, SI
  586. ORL SI, DI
  587. MOVB DI, (AX)
  588. ADDQ $0x02, AX
  589. JMP repeat_end_emit_encodeBlockAsm
  590. emit_copy_three_repeat_as_copy_encodeBlockAsm:
  591. LEAL -2(DI), DI
  592. MOVB DI, (AX)
  593. MOVW SI, 1(AX)
  594. ADDQ $0x03, AX
  595. repeat_end_emit_encodeBlockAsm:
  596. MOVL CX, 12(SP)
  597. JMP search_loop_encodeBlockAsm
  598. no_repeat_found_encodeBlockAsm:
  599. CMPL (DX)(BX*1), SI
  600. JEQ candidate_match_encodeBlockAsm
  601. SHRQ $0x08, SI
  602. MOVL 24(SP)(R9*4), BX
  603. LEAL 2(CX), R8
  604. CMPL (DX)(DI*1), SI
  605. JEQ candidate2_match_encodeBlockAsm
  606. MOVL R8, 24(SP)(R9*4)
  607. SHRQ $0x08, SI
  608. CMPL (DX)(BX*1), SI
  609. JEQ candidate3_match_encodeBlockAsm
  610. MOVL 20(SP), CX
  611. JMP search_loop_encodeBlockAsm
  612. candidate3_match_encodeBlockAsm:
  613. ADDL $0x02, CX
  614. JMP candidate_match_encodeBlockAsm
  615. candidate2_match_encodeBlockAsm:
  616. MOVL R8, 24(SP)(R9*4)
  617. INCL CX
  618. MOVL DI, BX
  619. candidate_match_encodeBlockAsm:
  620. MOVL 12(SP), SI
  621. TESTL BX, BX
  622. JZ match_extend_back_end_encodeBlockAsm
  623. match_extend_back_loop_encodeBlockAsm:
  624. CMPL CX, SI
  625. JBE match_extend_back_end_encodeBlockAsm
  626. MOVB -1(DX)(BX*1), DI
  627. MOVB -1(DX)(CX*1), R8
  628. CMPB DI, R8
  629. JNE match_extend_back_end_encodeBlockAsm
  630. LEAL -1(CX), CX
  631. DECL BX
  632. JZ match_extend_back_end_encodeBlockAsm
  633. JMP match_extend_back_loop_encodeBlockAsm
  634. match_extend_back_end_encodeBlockAsm:
  635. MOVL CX, SI
  636. SUBL 12(SP), SI
  637. LEAQ 5(AX)(SI*1), SI
  638. CMPQ SI, (SP)
  639. JB match_dst_size_check_encodeBlockAsm
  640. MOVQ $0x00000000, ret+48(FP)
  641. RET
  642. match_dst_size_check_encodeBlockAsm:
  643. MOVL CX, SI
  644. MOVL 12(SP), DI
  645. CMPL DI, SI
  646. JEQ emit_literal_done_match_emit_encodeBlockAsm
  647. MOVL SI, R8
  648. MOVL SI, 12(SP)
  649. LEAQ (DX)(DI*1), SI
  650. SUBL DI, R8
  651. LEAL -1(R8), DI
  652. CMPL DI, $0x3c
  653. JB one_byte_match_emit_encodeBlockAsm
  654. CMPL DI, $0x00000100
  655. JB two_bytes_match_emit_encodeBlockAsm
  656. CMPL DI, $0x00010000
  657. JB three_bytes_match_emit_encodeBlockAsm
  658. CMPL DI, $0x01000000
  659. JB four_bytes_match_emit_encodeBlockAsm
  660. MOVB $0xfc, (AX)
  661. MOVL DI, 1(AX)
  662. ADDQ $0x05, AX
  663. JMP memmove_long_match_emit_encodeBlockAsm
  664. four_bytes_match_emit_encodeBlockAsm:
  665. MOVL DI, R9
  666. SHRL $0x10, R9
  667. MOVB $0xf8, (AX)
  668. MOVW DI, 1(AX)
  669. MOVB R9, 3(AX)
  670. ADDQ $0x04, AX
  671. JMP memmove_long_match_emit_encodeBlockAsm
  672. three_bytes_match_emit_encodeBlockAsm:
  673. MOVB $0xf4, (AX)
  674. MOVW DI, 1(AX)
  675. ADDQ $0x03, AX
  676. JMP memmove_long_match_emit_encodeBlockAsm
  677. two_bytes_match_emit_encodeBlockAsm:
  678. MOVB $0xf0, (AX)
  679. MOVB DI, 1(AX)
  680. ADDQ $0x02, AX
  681. CMPL DI, $0x40
  682. JB memmove_match_emit_encodeBlockAsm
  683. JMP memmove_long_match_emit_encodeBlockAsm
  684. one_byte_match_emit_encodeBlockAsm:
  685. SHLB $0x02, DI
  686. MOVB DI, (AX)
  687. ADDQ $0x01, AX
  688. memmove_match_emit_encodeBlockAsm:
  689. LEAQ (AX)(R8*1), DI
  690. // genMemMoveShort
  691. CMPQ R8, $0x08
  692. JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
  693. CMPQ R8, $0x10
  694. JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
  695. CMPQ R8, $0x20
  696. JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
  697. JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
  698. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
  699. MOVQ (SI), R9
  700. MOVQ R9, (AX)
  701. JMP memmove_end_copy_match_emit_encodeBlockAsm
  702. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
  703. MOVQ (SI), R9
  704. MOVQ -8(SI)(R8*1), SI
  705. MOVQ R9, (AX)
  706. MOVQ SI, -8(AX)(R8*1)
  707. JMP memmove_end_copy_match_emit_encodeBlockAsm
  708. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
  709. MOVOU (SI), X0
  710. MOVOU -16(SI)(R8*1), X1
  711. MOVOU X0, (AX)
  712. MOVOU X1, -16(AX)(R8*1)
  713. JMP memmove_end_copy_match_emit_encodeBlockAsm
  714. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
  715. MOVOU (SI), X0
  716. MOVOU 16(SI), X1
  717. MOVOU -32(SI)(R8*1), X2
  718. MOVOU -16(SI)(R8*1), X3
  719. MOVOU X0, (AX)
  720. MOVOU X1, 16(AX)
  721. MOVOU X2, -32(AX)(R8*1)
  722. MOVOU X3, -16(AX)(R8*1)
  723. memmove_end_copy_match_emit_encodeBlockAsm:
  724. MOVQ DI, AX
  725. JMP emit_literal_done_match_emit_encodeBlockAsm
  726. memmove_long_match_emit_encodeBlockAsm:
  727. LEAQ (AX)(R8*1), DI
  728. // genMemMoveLong
  729. MOVOU (SI), X0
  730. MOVOU 16(SI), X1
  731. MOVOU -32(SI)(R8*1), X2
  732. MOVOU -16(SI)(R8*1), X3
  733. MOVQ R8, R10
  734. SHRQ $0x05, R10
  735. MOVQ AX, R9
  736. ANDL $0x0000001f, R9
  737. MOVQ $0x00000040, R11
  738. SUBQ R9, R11
  739. DECQ R10
  740. JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
  741. LEAQ -32(SI)(R11*1), R9
  742. LEAQ -32(AX)(R11*1), R12
  743. emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
  744. MOVOU (R9), X4
  745. MOVOU 16(R9), X5
  746. MOVOA X4, (R12)
  747. MOVOA X5, 16(R12)
  748. ADDQ $0x20, R12
  749. ADDQ $0x20, R9
  750. ADDQ $0x20, R11
  751. DECQ R10
  752. JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
  753. emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
  754. MOVOU -32(SI)(R11*1), X4
  755. MOVOU -16(SI)(R11*1), X5
  756. MOVOA X4, -32(AX)(R11*1)
  757. MOVOA X5, -16(AX)(R11*1)
  758. ADDQ $0x20, R11
  759. CMPQ R8, R11
  760. JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
  761. MOVOU X0, (AX)
  762. MOVOU X1, 16(AX)
  763. MOVOU X2, -32(AX)(R8*1)
  764. MOVOU X3, -16(AX)(R8*1)
  765. MOVQ DI, AX
  766. emit_literal_done_match_emit_encodeBlockAsm:
  767. match_nolit_loop_encodeBlockAsm:
  768. MOVL CX, SI
  769. SUBL BX, SI
  770. MOVL SI, 16(SP)
  771. ADDL $0x04, CX
  772. ADDL $0x04, BX
  773. MOVQ src_len+32(FP), SI
  774. SUBL CX, SI
  775. LEAQ (DX)(CX*1), DI
  776. LEAQ (DX)(BX*1), BX
  777. // matchLen
  778. XORL R9, R9
  779. matchlen_loopback_16_match_nolit_encodeBlockAsm:
  780. CMPL SI, $0x10
  781. JB matchlen_match8_match_nolit_encodeBlockAsm
  782. MOVQ (DI)(R9*1), R8
  783. MOVQ 8(DI)(R9*1), R10
  784. XORQ (BX)(R9*1), R8
  785. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
  786. XORQ 8(BX)(R9*1), R10
  787. JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
  788. LEAL -16(SI), SI
  789. LEAL 16(R9), R9
  790. JMP matchlen_loopback_16_match_nolit_encodeBlockAsm
  791. matchlen_bsf_16match_nolit_encodeBlockAsm:
  792. #ifdef GOAMD64_v3
  793. TZCNTQ R10, R10
  794. #else
  795. BSFQ R10, R10
  796. #endif
  797. SARQ $0x03, R10
  798. LEAL 8(R9)(R10*1), R9
  799. JMP match_nolit_end_encodeBlockAsm
  800. matchlen_match8_match_nolit_encodeBlockAsm:
  801. CMPL SI, $0x08
  802. JB matchlen_match4_match_nolit_encodeBlockAsm
  803. MOVQ (DI)(R9*1), R8
  804. XORQ (BX)(R9*1), R8
  805. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
  806. LEAL -8(SI), SI
  807. LEAL 8(R9), R9
  808. JMP matchlen_match4_match_nolit_encodeBlockAsm
  809. matchlen_bsf_8_match_nolit_encodeBlockAsm:
  810. #ifdef GOAMD64_v3
  811. TZCNTQ R8, R8
  812. #else
  813. BSFQ R8, R8
  814. #endif
  815. SARQ $0x03, R8
  816. LEAL (R9)(R8*1), R9
  817. JMP match_nolit_end_encodeBlockAsm
  818. matchlen_match4_match_nolit_encodeBlockAsm:
  819. CMPL SI, $0x04
  820. JB matchlen_match2_match_nolit_encodeBlockAsm
  821. MOVL (DI)(R9*1), R8
  822. CMPL (BX)(R9*1), R8
  823. JNE matchlen_match2_match_nolit_encodeBlockAsm
  824. LEAL -4(SI), SI
  825. LEAL 4(R9), R9
  826. matchlen_match2_match_nolit_encodeBlockAsm:
  827. CMPL SI, $0x01
  828. JE matchlen_match1_match_nolit_encodeBlockAsm
  829. JB match_nolit_end_encodeBlockAsm
  830. MOVW (DI)(R9*1), R8
  831. CMPW (BX)(R9*1), R8
  832. JNE matchlen_match1_match_nolit_encodeBlockAsm
  833. LEAL 2(R9), R9
  834. SUBL $0x02, SI
  835. JZ match_nolit_end_encodeBlockAsm
  836. matchlen_match1_match_nolit_encodeBlockAsm:
  837. MOVB (DI)(R9*1), R8
  838. CMPB (BX)(R9*1), R8
  839. JNE match_nolit_end_encodeBlockAsm
  840. LEAL 1(R9), R9
  841. match_nolit_end_encodeBlockAsm:
  842. ADDL R9, CX
  843. MOVL 16(SP), BX
  844. ADDL $0x04, R9
  845. MOVL CX, 12(SP)
  846. // emitCopy
  847. CMPL BX, $0x00010000
  848. JB two_byte_offset_match_nolit_encodeBlockAsm
  849. CMPL R9, $0x40
  850. JBE four_bytes_remain_match_nolit_encodeBlockAsm
  851. MOVB $0xff, (AX)
  852. MOVL BX, 1(AX)
  853. LEAL -64(R9), R9
  854. ADDQ $0x05, AX
  855. CMPL R9, $0x04
  856. JB four_bytes_remain_match_nolit_encodeBlockAsm
  857. // emitRepeat
  858. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
  859. MOVL R9, SI
  860. LEAL -4(R9), R9
  861. CMPL SI, $0x08
  862. JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy
  863. CMPL SI, $0x0c
  864. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
  865. CMPL BX, $0x00000800
  866. JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
  867. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
  868. CMPL R9, $0x00000104
  869. JB repeat_three_match_nolit_encodeBlockAsm_emit_copy
  870. CMPL R9, $0x00010100
  871. JB repeat_four_match_nolit_encodeBlockAsm_emit_copy
  872. CMPL R9, $0x0100ffff
  873. JB repeat_five_match_nolit_encodeBlockAsm_emit_copy
  874. LEAL -16842747(R9), R9
  875. MOVL $0xfffb001d, (AX)
  876. MOVB $0xff, 4(AX)
  877. ADDQ $0x05, AX
  878. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
  879. repeat_five_match_nolit_encodeBlockAsm_emit_copy:
  880. LEAL -65536(R9), R9
  881. MOVL R9, BX
  882. MOVW $0x001d, (AX)
  883. MOVW R9, 2(AX)
  884. SARL $0x10, BX
  885. MOVB BL, 4(AX)
  886. ADDQ $0x05, AX
  887. JMP match_nolit_emitcopy_end_encodeBlockAsm
  888. repeat_four_match_nolit_encodeBlockAsm_emit_copy:
  889. LEAL -256(R9), R9
  890. MOVW $0x0019, (AX)
  891. MOVW R9, 2(AX)
  892. ADDQ $0x04, AX
  893. JMP match_nolit_emitcopy_end_encodeBlockAsm
  894. repeat_three_match_nolit_encodeBlockAsm_emit_copy:
  895. LEAL -4(R9), R9
  896. MOVW $0x0015, (AX)
  897. MOVB R9, 2(AX)
  898. ADDQ $0x03, AX
  899. JMP match_nolit_emitcopy_end_encodeBlockAsm
  900. repeat_two_match_nolit_encodeBlockAsm_emit_copy:
  901. SHLL $0x02, R9
  902. ORL $0x01, R9
  903. MOVW R9, (AX)
  904. ADDQ $0x02, AX
  905. JMP match_nolit_emitcopy_end_encodeBlockAsm
  906. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
  907. XORQ SI, SI
  908. LEAL 1(SI)(R9*4), R9
  909. MOVB BL, 1(AX)
  910. SARL $0x08, BX
  911. SHLL $0x05, BX
  912. ORL BX, R9
  913. MOVB R9, (AX)
  914. ADDQ $0x02, AX
  915. JMP match_nolit_emitcopy_end_encodeBlockAsm
  916. four_bytes_remain_match_nolit_encodeBlockAsm:
  917. TESTL R9, R9
  918. JZ match_nolit_emitcopy_end_encodeBlockAsm
  919. XORL SI, SI
  920. LEAL -1(SI)(R9*4), R9
  921. MOVB R9, (AX)
  922. MOVL BX, 1(AX)
  923. ADDQ $0x05, AX
  924. JMP match_nolit_emitcopy_end_encodeBlockAsm
  925. two_byte_offset_match_nolit_encodeBlockAsm:
  926. CMPL R9, $0x40
  927. JBE two_byte_offset_short_match_nolit_encodeBlockAsm
  928. CMPL BX, $0x00000800
  929. JAE long_offset_short_match_nolit_encodeBlockAsm
  930. MOVL $0x00000001, SI
  931. LEAL 16(SI), SI
  932. MOVB BL, 1(AX)
  933. MOVL BX, DI
  934. SHRL $0x08, DI
  935. SHLL $0x05, DI
  936. ORL DI, SI
  937. MOVB SI, (AX)
  938. ADDQ $0x02, AX
  939. SUBL $0x08, R9
  940. // emitRepeat
  941. LEAL -4(R9), R9
  942. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  943. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  944. MOVL R9, SI
  945. LEAL -4(R9), R9
  946. CMPL SI, $0x08
  947. JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
  948. CMPL SI, $0x0c
  949. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  950. CMPL BX, $0x00000800
  951. JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  952. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  953. CMPL R9, $0x00000104
  954. JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
  955. CMPL R9, $0x00010100
  956. JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
  957. CMPL R9, $0x0100ffff
  958. JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
  959. LEAL -16842747(R9), R9
  960. MOVL $0xfffb001d, (AX)
  961. MOVB $0xff, 4(AX)
  962. ADDQ $0x05, AX
  963. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
  964. repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  965. LEAL -65536(R9), R9
  966. MOVL R9, BX
  967. MOVW $0x001d, (AX)
  968. MOVW R9, 2(AX)
  969. SARL $0x10, BX
  970. MOVB BL, 4(AX)
  971. ADDQ $0x05, AX
  972. JMP match_nolit_emitcopy_end_encodeBlockAsm
  973. repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  974. LEAL -256(R9), R9
  975. MOVW $0x0019, (AX)
  976. MOVW R9, 2(AX)
  977. ADDQ $0x04, AX
  978. JMP match_nolit_emitcopy_end_encodeBlockAsm
  979. repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  980. LEAL -4(R9), R9
  981. MOVW $0x0015, (AX)
  982. MOVB R9, 2(AX)
  983. ADDQ $0x03, AX
  984. JMP match_nolit_emitcopy_end_encodeBlockAsm
  985. repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  986. SHLL $0x02, R9
  987. ORL $0x01, R9
  988. MOVW R9, (AX)
  989. ADDQ $0x02, AX
  990. JMP match_nolit_emitcopy_end_encodeBlockAsm
  991. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  992. XORQ SI, SI
  993. LEAL 1(SI)(R9*4), R9
  994. MOVB BL, 1(AX)
  995. SARL $0x08, BX
  996. SHLL $0x05, BX
  997. ORL BX, R9
  998. MOVB R9, (AX)
  999. ADDQ $0x02, AX
  1000. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1001. long_offset_short_match_nolit_encodeBlockAsm:
  1002. MOVB $0xee, (AX)
  1003. MOVW BX, 1(AX)
  1004. LEAL -60(R9), R9
  1005. ADDQ $0x03, AX
  1006. // emitRepeat
  1007. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
  1008. MOVL R9, SI
  1009. LEAL -4(R9), R9
  1010. CMPL SI, $0x08
  1011. JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
  1012. CMPL SI, $0x0c
  1013. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
  1014. CMPL BX, $0x00000800
  1015. JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
  1016. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
  1017. CMPL R9, $0x00000104
  1018. JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
  1019. CMPL R9, $0x00010100
  1020. JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
  1021. CMPL R9, $0x0100ffff
  1022. JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
  1023. LEAL -16842747(R9), R9
  1024. MOVL $0xfffb001d, (AX)
  1025. MOVB $0xff, 4(AX)
  1026. ADDQ $0x05, AX
  1027. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
  1028. repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
  1029. LEAL -65536(R9), R9
  1030. MOVL R9, BX
  1031. MOVW $0x001d, (AX)
  1032. MOVW R9, 2(AX)
  1033. SARL $0x10, BX
  1034. MOVB BL, 4(AX)
  1035. ADDQ $0x05, AX
  1036. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1037. repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
  1038. LEAL -256(R9), R9
  1039. MOVW $0x0019, (AX)
  1040. MOVW R9, 2(AX)
  1041. ADDQ $0x04, AX
  1042. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1043. repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
  1044. LEAL -4(R9), R9
  1045. MOVW $0x0015, (AX)
  1046. MOVB R9, 2(AX)
  1047. ADDQ $0x03, AX
  1048. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1049. repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
  1050. SHLL $0x02, R9
  1051. ORL $0x01, R9
  1052. MOVW R9, (AX)
  1053. ADDQ $0x02, AX
  1054. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1055. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
  1056. XORQ SI, SI
  1057. LEAL 1(SI)(R9*4), R9
  1058. MOVB BL, 1(AX)
  1059. SARL $0x08, BX
  1060. SHLL $0x05, BX
  1061. ORL BX, R9
  1062. MOVB R9, (AX)
  1063. ADDQ $0x02, AX
  1064. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1065. two_byte_offset_short_match_nolit_encodeBlockAsm:
  1066. MOVL R9, SI
  1067. SHLL $0x02, SI
  1068. CMPL R9, $0x0c
  1069. JAE emit_copy_three_match_nolit_encodeBlockAsm
  1070. CMPL BX, $0x00000800
  1071. JAE emit_copy_three_match_nolit_encodeBlockAsm
  1072. LEAL -15(SI), SI
  1073. MOVB BL, 1(AX)
  1074. SHRL $0x08, BX
  1075. SHLL $0x05, BX
  1076. ORL BX, SI
  1077. MOVB SI, (AX)
  1078. ADDQ $0x02, AX
  1079. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1080. emit_copy_three_match_nolit_encodeBlockAsm:
  1081. LEAL -2(SI), SI
  1082. MOVB SI, (AX)
  1083. MOVW BX, 1(AX)
  1084. ADDQ $0x03, AX
  1085. match_nolit_emitcopy_end_encodeBlockAsm:
  1086. CMPL CX, 8(SP)
  1087. JAE emit_remainder_encodeBlockAsm
  1088. MOVQ -2(DX)(CX*1), SI
  1089. CMPQ AX, (SP)
  1090. JB match_nolit_dst_ok_encodeBlockAsm
  1091. MOVQ $0x00000000, ret+48(FP)
  1092. RET
  1093. match_nolit_dst_ok_encodeBlockAsm:
  1094. MOVQ $0x0000cf1bbcdcbf9b, R8
  1095. MOVQ SI, DI
  1096. SHRQ $0x10, SI
  1097. MOVQ SI, BX
  1098. SHLQ $0x10, DI
  1099. IMULQ R8, DI
  1100. SHRQ $0x32, DI
  1101. SHLQ $0x10, BX
  1102. IMULQ R8, BX
  1103. SHRQ $0x32, BX
  1104. LEAL -2(CX), R8
  1105. LEAQ 24(SP)(BX*4), R9
  1106. MOVL (R9), BX
  1107. MOVL R8, 24(SP)(DI*4)
  1108. MOVL CX, (R9)
  1109. CMPL (DX)(BX*1), SI
  1110. JEQ match_nolit_loop_encodeBlockAsm
  1111. INCL CX
  1112. JMP search_loop_encodeBlockAsm
  1113. emit_remainder_encodeBlockAsm:
  1114. MOVQ src_len+32(FP), CX
  1115. SUBL 12(SP), CX
  1116. LEAQ 5(AX)(CX*1), CX
  1117. CMPQ CX, (SP)
  1118. JB emit_remainder_ok_encodeBlockAsm
  1119. MOVQ $0x00000000, ret+48(FP)
  1120. RET
  1121. emit_remainder_ok_encodeBlockAsm:
  1122. MOVQ src_len+32(FP), CX
  1123. MOVL 12(SP), BX
  1124. CMPL BX, CX
  1125. JEQ emit_literal_done_emit_remainder_encodeBlockAsm
  1126. MOVL CX, SI
  1127. MOVL CX, 12(SP)
  1128. LEAQ (DX)(BX*1), CX
  1129. SUBL BX, SI
  1130. LEAL -1(SI), DX
  1131. CMPL DX, $0x3c
  1132. JB one_byte_emit_remainder_encodeBlockAsm
  1133. CMPL DX, $0x00000100
  1134. JB two_bytes_emit_remainder_encodeBlockAsm
  1135. CMPL DX, $0x00010000
  1136. JB three_bytes_emit_remainder_encodeBlockAsm
  1137. CMPL DX, $0x01000000
  1138. JB four_bytes_emit_remainder_encodeBlockAsm
  1139. MOVB $0xfc, (AX)
  1140. MOVL DX, 1(AX)
  1141. ADDQ $0x05, AX
  1142. JMP memmove_long_emit_remainder_encodeBlockAsm
  1143. four_bytes_emit_remainder_encodeBlockAsm:
  1144. MOVL DX, BX
  1145. SHRL $0x10, BX
  1146. MOVB $0xf8, (AX)
  1147. MOVW DX, 1(AX)
  1148. MOVB BL, 3(AX)
  1149. ADDQ $0x04, AX
  1150. JMP memmove_long_emit_remainder_encodeBlockAsm
  1151. three_bytes_emit_remainder_encodeBlockAsm:
  1152. MOVB $0xf4, (AX)
  1153. MOVW DX, 1(AX)
  1154. ADDQ $0x03, AX
  1155. JMP memmove_long_emit_remainder_encodeBlockAsm
  1156. two_bytes_emit_remainder_encodeBlockAsm:
  1157. MOVB $0xf0, (AX)
  1158. MOVB DL, 1(AX)
  1159. ADDQ $0x02, AX
  1160. CMPL DX, $0x40
  1161. JB memmove_emit_remainder_encodeBlockAsm
  1162. JMP memmove_long_emit_remainder_encodeBlockAsm
  1163. one_byte_emit_remainder_encodeBlockAsm:
  1164. SHLB $0x02, DL
  1165. MOVB DL, (AX)
  1166. ADDQ $0x01, AX
  1167. memmove_emit_remainder_encodeBlockAsm:
  1168. LEAQ (AX)(SI*1), DX
  1169. MOVL SI, BX
  1170. // genMemMoveShort
  1171. CMPQ BX, $0x03
  1172. JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
  1173. JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
  1174. CMPQ BX, $0x08
  1175. JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
  1176. CMPQ BX, $0x10
  1177. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
  1178. CMPQ BX, $0x20
  1179. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
  1180. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
  1181. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
  1182. MOVB (CX), SI
  1183. MOVB -1(CX)(BX*1), CL
  1184. MOVB SI, (AX)
  1185. MOVB CL, -1(AX)(BX*1)
  1186. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1187. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
  1188. MOVW (CX), SI
  1189. MOVB 2(CX), CL
  1190. MOVW SI, (AX)
  1191. MOVB CL, 2(AX)
  1192. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1193. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
  1194. MOVL (CX), SI
  1195. MOVL -4(CX)(BX*1), CX
  1196. MOVL SI, (AX)
  1197. MOVL CX, -4(AX)(BX*1)
  1198. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1199. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
  1200. MOVQ (CX), SI
  1201. MOVQ -8(CX)(BX*1), CX
  1202. MOVQ SI, (AX)
  1203. MOVQ CX, -8(AX)(BX*1)
  1204. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1205. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
  1206. MOVOU (CX), X0
  1207. MOVOU -16(CX)(BX*1), X1
  1208. MOVOU X0, (AX)
  1209. MOVOU X1, -16(AX)(BX*1)
  1210. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1211. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
  1212. MOVOU (CX), X0
  1213. MOVOU 16(CX), X1
  1214. MOVOU -32(CX)(BX*1), X2
  1215. MOVOU -16(CX)(BX*1), X3
  1216. MOVOU X0, (AX)
  1217. MOVOU X1, 16(AX)
  1218. MOVOU X2, -32(AX)(BX*1)
  1219. MOVOU X3, -16(AX)(BX*1)
  1220. memmove_end_copy_emit_remainder_encodeBlockAsm:
  1221. MOVQ DX, AX
  1222. JMP emit_literal_done_emit_remainder_encodeBlockAsm
  1223. memmove_long_emit_remainder_encodeBlockAsm:
  1224. LEAQ (AX)(SI*1), DX
  1225. MOVL SI, BX
  1226. // genMemMoveLong
  1227. MOVOU (CX), X0
  1228. MOVOU 16(CX), X1
  1229. MOVOU -32(CX)(BX*1), X2
  1230. MOVOU -16(CX)(BX*1), X3
  1231. MOVQ BX, DI
  1232. SHRQ $0x05, DI
  1233. MOVQ AX, SI
  1234. ANDL $0x0000001f, SI
  1235. MOVQ $0x00000040, R8
  1236. SUBQ SI, R8
  1237. DECQ DI
  1238. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
  1239. LEAQ -32(CX)(R8*1), SI
  1240. LEAQ -32(AX)(R8*1), R9
  1241. emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
  1242. MOVOU (SI), X4
  1243. MOVOU 16(SI), X5
  1244. MOVOA X4, (R9)
  1245. MOVOA X5, 16(R9)
  1246. ADDQ $0x20, R9
  1247. ADDQ $0x20, SI
  1248. ADDQ $0x20, R8
  1249. DECQ DI
  1250. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
  1251. emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
  1252. MOVOU -32(CX)(R8*1), X4
  1253. MOVOU -16(CX)(R8*1), X5
  1254. MOVOA X4, -32(AX)(R8*1)
  1255. MOVOA X5, -16(AX)(R8*1)
  1256. ADDQ $0x20, R8
  1257. CMPQ BX, R8
  1258. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
  1259. MOVOU X0, (AX)
  1260. MOVOU X1, 16(AX)
  1261. MOVOU X2, -32(AX)(BX*1)
  1262. MOVOU X3, -16(AX)(BX*1)
  1263. MOVQ DX, AX
  1264. emit_literal_done_emit_remainder_encodeBlockAsm:
  1265. MOVQ dst_base+0(FP), CX
  1266. SUBQ CX, AX
  1267. MOVQ AX, ret+48(FP)
  1268. RET
  1269. // func encodeBlockAsm4MB(dst []byte, src []byte) int
  1270. // Requires: BMI, SSE2
  1271. TEXT ·encodeBlockAsm4MB(SB), $65560-56
  1272. MOVQ dst_base+0(FP), AX
  1273. MOVQ $0x00000200, CX
  1274. LEAQ 24(SP), DX
  1275. PXOR X0, X0
  1276. zero_loop_encodeBlockAsm4MB:
  1277. MOVOU X0, (DX)
  1278. MOVOU X0, 16(DX)
  1279. MOVOU X0, 32(DX)
  1280. MOVOU X0, 48(DX)
  1281. MOVOU X0, 64(DX)
  1282. MOVOU X0, 80(DX)
  1283. MOVOU X0, 96(DX)
  1284. MOVOU X0, 112(DX)
  1285. ADDQ $0x80, DX
  1286. DECQ CX
  1287. JNZ zero_loop_encodeBlockAsm4MB
  1288. MOVL $0x00000000, 12(SP)
  1289. MOVQ src_len+32(FP), CX
  1290. LEAQ -9(CX), DX
  1291. LEAQ -8(CX), BX
  1292. MOVL BX, 8(SP)
  1293. SHRQ $0x05, CX
  1294. SUBL CX, DX
  1295. LEAQ (AX)(DX*1), DX
  1296. MOVQ DX, (SP)
  1297. MOVL $0x00000001, CX
  1298. MOVL CX, 16(SP)
  1299. MOVQ src_base+24(FP), DX
  1300. search_loop_encodeBlockAsm4MB:
  1301. MOVL CX, BX
  1302. SUBL 12(SP), BX
  1303. SHRL $0x06, BX
  1304. LEAL 4(CX)(BX*1), BX
  1305. CMPL BX, 8(SP)
  1306. JAE emit_remainder_encodeBlockAsm4MB
  1307. MOVQ (DX)(CX*1), SI
  1308. MOVL BX, 20(SP)
  1309. MOVQ $0x0000cf1bbcdcbf9b, R8
  1310. MOVQ SI, R9
  1311. MOVQ SI, R10
  1312. SHRQ $0x08, R10
  1313. SHLQ $0x10, R9
  1314. IMULQ R8, R9
  1315. SHRQ $0x32, R9
  1316. SHLQ $0x10, R10
  1317. IMULQ R8, R10
  1318. SHRQ $0x32, R10
  1319. MOVL 24(SP)(R9*4), BX
  1320. MOVL 24(SP)(R10*4), DI
  1321. MOVL CX, 24(SP)(R9*4)
  1322. LEAL 1(CX), R9
  1323. MOVL R9, 24(SP)(R10*4)
  1324. MOVQ SI, R9
  1325. SHRQ $0x10, R9
  1326. SHLQ $0x10, R9
  1327. IMULQ R8, R9
  1328. SHRQ $0x32, R9
  1329. MOVL CX, R8
  1330. SUBL 16(SP), R8
  1331. MOVL 1(DX)(R8*1), R10
  1332. MOVQ SI, R8
  1333. SHRQ $0x08, R8
  1334. CMPL R8, R10
  1335. JNE no_repeat_found_encodeBlockAsm4MB
  1336. LEAL 1(CX), SI
  1337. MOVL 12(SP), DI
  1338. MOVL SI, BX
  1339. SUBL 16(SP), BX
  1340. JZ repeat_extend_back_end_encodeBlockAsm4MB
  1341. repeat_extend_back_loop_encodeBlockAsm4MB:
  1342. CMPL SI, DI
  1343. JBE repeat_extend_back_end_encodeBlockAsm4MB
  1344. MOVB -1(DX)(BX*1), R8
  1345. MOVB -1(DX)(SI*1), R9
  1346. CMPB R8, R9
  1347. JNE repeat_extend_back_end_encodeBlockAsm4MB
  1348. LEAL -1(SI), SI
  1349. DECL BX
  1350. JNZ repeat_extend_back_loop_encodeBlockAsm4MB
  1351. repeat_extend_back_end_encodeBlockAsm4MB:
  1352. MOVL 12(SP), BX
  1353. CMPL BX, SI
  1354. JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
  1355. MOVL SI, R8
  1356. MOVL SI, 12(SP)
  1357. LEAQ (DX)(BX*1), R9
  1358. SUBL BX, R8
  1359. LEAL -1(R8), BX
  1360. CMPL BX, $0x3c
  1361. JB one_byte_repeat_emit_encodeBlockAsm4MB
  1362. CMPL BX, $0x00000100
  1363. JB two_bytes_repeat_emit_encodeBlockAsm4MB
  1364. CMPL BX, $0x00010000
  1365. JB three_bytes_repeat_emit_encodeBlockAsm4MB
  1366. MOVL BX, R10
  1367. SHRL $0x10, R10
  1368. MOVB $0xf8, (AX)
  1369. MOVW BX, 1(AX)
  1370. MOVB R10, 3(AX)
  1371. ADDQ $0x04, AX
  1372. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1373. three_bytes_repeat_emit_encodeBlockAsm4MB:
  1374. MOVB $0xf4, (AX)
  1375. MOVW BX, 1(AX)
  1376. ADDQ $0x03, AX
  1377. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1378. two_bytes_repeat_emit_encodeBlockAsm4MB:
  1379. MOVB $0xf0, (AX)
  1380. MOVB BL, 1(AX)
  1381. ADDQ $0x02, AX
  1382. CMPL BX, $0x40
  1383. JB memmove_repeat_emit_encodeBlockAsm4MB
  1384. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1385. one_byte_repeat_emit_encodeBlockAsm4MB:
  1386. SHLB $0x02, BL
  1387. MOVB BL, (AX)
  1388. ADDQ $0x01, AX
  1389. memmove_repeat_emit_encodeBlockAsm4MB:
  1390. LEAQ (AX)(R8*1), BX
  1391. // genMemMoveShort
  1392. CMPQ R8, $0x08
  1393. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
  1394. CMPQ R8, $0x10
  1395. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
  1396. CMPQ R8, $0x20
  1397. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
  1398. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
  1399. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
  1400. MOVQ (R9), R10
  1401. MOVQ R10, (AX)
  1402. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1403. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
  1404. MOVQ (R9), R10
  1405. MOVQ -8(R9)(R8*1), R9
  1406. MOVQ R10, (AX)
  1407. MOVQ R9, -8(AX)(R8*1)
  1408. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1409. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
  1410. MOVOU (R9), X0
  1411. MOVOU -16(R9)(R8*1), X1
  1412. MOVOU X0, (AX)
  1413. MOVOU X1, -16(AX)(R8*1)
  1414. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1415. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
  1416. MOVOU (R9), X0
  1417. MOVOU 16(R9), X1
  1418. MOVOU -32(R9)(R8*1), X2
  1419. MOVOU -16(R9)(R8*1), X3
  1420. MOVOU X0, (AX)
  1421. MOVOU X1, 16(AX)
  1422. MOVOU X2, -32(AX)(R8*1)
  1423. MOVOU X3, -16(AX)(R8*1)
  1424. memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
  1425. MOVQ BX, AX
  1426. JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
  1427. memmove_long_repeat_emit_encodeBlockAsm4MB:
  1428. LEAQ (AX)(R8*1), BX
  1429. // genMemMoveLong
  1430. MOVOU (R9), X0
  1431. MOVOU 16(R9), X1
  1432. MOVOU -32(R9)(R8*1), X2
  1433. MOVOU -16(R9)(R8*1), X3
  1434. MOVQ R8, R11
  1435. SHRQ $0x05, R11
  1436. MOVQ AX, R10
  1437. ANDL $0x0000001f, R10
  1438. MOVQ $0x00000040, R12
  1439. SUBQ R10, R12
  1440. DECQ R11
  1441. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1442. LEAQ -32(R9)(R12*1), R10
  1443. LEAQ -32(AX)(R12*1), R13
  1444. emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
  1445. MOVOU (R10), X4
  1446. MOVOU 16(R10), X5
  1447. MOVOA X4, (R13)
  1448. MOVOA X5, 16(R13)
  1449. ADDQ $0x20, R13
  1450. ADDQ $0x20, R10
  1451. ADDQ $0x20, R12
  1452. DECQ R11
  1453. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
  1454. emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  1455. MOVOU -32(R9)(R12*1), X4
  1456. MOVOU -16(R9)(R12*1), X5
  1457. MOVOA X4, -32(AX)(R12*1)
  1458. MOVOA X5, -16(AX)(R12*1)
  1459. ADDQ $0x20, R12
  1460. CMPQ R8, R12
  1461. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1462. MOVOU X0, (AX)
  1463. MOVOU X1, 16(AX)
  1464. MOVOU X2, -32(AX)(R8*1)
  1465. MOVOU X3, -16(AX)(R8*1)
  1466. MOVQ BX, AX
  1467. emit_literal_done_repeat_emit_encodeBlockAsm4MB:
  1468. ADDL $0x05, CX
  1469. MOVL CX, BX
  1470. SUBL 16(SP), BX
  1471. MOVQ src_len+32(FP), R8
  1472. SUBL CX, R8
  1473. LEAQ (DX)(CX*1), R9
  1474. LEAQ (DX)(BX*1), BX
  1475. // matchLen
  1476. XORL R11, R11
  1477. matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
  1478. CMPL R8, $0x10
  1479. JB matchlen_match8_repeat_extend_encodeBlockAsm4MB
  1480. MOVQ (R9)(R11*1), R10
  1481. MOVQ 8(R9)(R11*1), R12
  1482. XORQ (BX)(R11*1), R10
  1483. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
  1484. XORQ 8(BX)(R11*1), R12
  1485. JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
  1486. LEAL -16(R8), R8
  1487. LEAL 16(R11), R11
  1488. JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB
  1489. matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
  1490. #ifdef GOAMD64_v3
  1491. TZCNTQ R12, R12
  1492. #else
  1493. BSFQ R12, R12
  1494. #endif
  1495. SARQ $0x03, R12
  1496. LEAL 8(R11)(R12*1), R11
  1497. JMP repeat_extend_forward_end_encodeBlockAsm4MB
  1498. matchlen_match8_repeat_extend_encodeBlockAsm4MB:
  1499. CMPL R8, $0x08
  1500. JB matchlen_match4_repeat_extend_encodeBlockAsm4MB
  1501. MOVQ (R9)(R11*1), R10
  1502. XORQ (BX)(R11*1), R10
  1503. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
  1504. LEAL -8(R8), R8
  1505. LEAL 8(R11), R11
  1506. JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB
  1507. matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
  1508. #ifdef GOAMD64_v3
  1509. TZCNTQ R10, R10
  1510. #else
  1511. BSFQ R10, R10
  1512. #endif
  1513. SARQ $0x03, R10
  1514. LEAL (R11)(R10*1), R11
  1515. JMP repeat_extend_forward_end_encodeBlockAsm4MB
  1516. matchlen_match4_repeat_extend_encodeBlockAsm4MB:
  1517. CMPL R8, $0x04
  1518. JB matchlen_match2_repeat_extend_encodeBlockAsm4MB
  1519. MOVL (R9)(R11*1), R10
  1520. CMPL (BX)(R11*1), R10
  1521. JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
  1522. LEAL -4(R8), R8
  1523. LEAL 4(R11), R11
  1524. matchlen_match2_repeat_extend_encodeBlockAsm4MB:
  1525. CMPL R8, $0x01
  1526. JE matchlen_match1_repeat_extend_encodeBlockAsm4MB
  1527. JB repeat_extend_forward_end_encodeBlockAsm4MB
  1528. MOVW (R9)(R11*1), R10
  1529. CMPW (BX)(R11*1), R10
  1530. JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
  1531. LEAL 2(R11), R11
  1532. SUBL $0x02, R8
  1533. JZ repeat_extend_forward_end_encodeBlockAsm4MB
  1534. matchlen_match1_repeat_extend_encodeBlockAsm4MB:
  1535. MOVB (R9)(R11*1), R10
  1536. CMPB (BX)(R11*1), R10
  1537. JNE repeat_extend_forward_end_encodeBlockAsm4MB
  1538. LEAL 1(R11), R11
  1539. repeat_extend_forward_end_encodeBlockAsm4MB:
  1540. ADDL R11, CX
  1541. MOVL CX, BX
  1542. SUBL SI, BX
  1543. MOVL 16(SP), SI
  1544. TESTL DI, DI
  1545. JZ repeat_as_copy_encodeBlockAsm4MB
  1546. // emitRepeat
  1547. MOVL BX, DI
  1548. LEAL -4(BX), BX
  1549. CMPL DI, $0x08
  1550. JBE repeat_two_match_repeat_encodeBlockAsm4MB
  1551. CMPL DI, $0x0c
  1552. JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
  1553. CMPL SI, $0x00000800
  1554. JB repeat_two_offset_match_repeat_encodeBlockAsm4MB
  1555. cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
  1556. CMPL BX, $0x00000104
  1557. JB repeat_three_match_repeat_encodeBlockAsm4MB
  1558. CMPL BX, $0x00010100
  1559. JB repeat_four_match_repeat_encodeBlockAsm4MB
  1560. LEAL -65536(BX), BX
  1561. MOVL BX, SI
  1562. MOVW $0x001d, (AX)
  1563. MOVW BX, 2(AX)
  1564. SARL $0x10, SI
  1565. MOVB SI, 4(AX)
  1566. ADDQ $0x05, AX
  1567. JMP repeat_end_emit_encodeBlockAsm4MB
  1568. repeat_four_match_repeat_encodeBlockAsm4MB:
  1569. LEAL -256(BX), BX
  1570. MOVW $0x0019, (AX)
  1571. MOVW BX, 2(AX)
  1572. ADDQ $0x04, AX
  1573. JMP repeat_end_emit_encodeBlockAsm4MB
  1574. repeat_three_match_repeat_encodeBlockAsm4MB:
  1575. LEAL -4(BX), BX
  1576. MOVW $0x0015, (AX)
  1577. MOVB BL, 2(AX)
  1578. ADDQ $0x03, AX
  1579. JMP repeat_end_emit_encodeBlockAsm4MB
  1580. repeat_two_match_repeat_encodeBlockAsm4MB:
  1581. SHLL $0x02, BX
  1582. ORL $0x01, BX
  1583. MOVW BX, (AX)
  1584. ADDQ $0x02, AX
  1585. JMP repeat_end_emit_encodeBlockAsm4MB
  1586. repeat_two_offset_match_repeat_encodeBlockAsm4MB:
  1587. XORQ DI, DI
  1588. LEAL 1(DI)(BX*4), BX
  1589. MOVB SI, 1(AX)
  1590. SARL $0x08, SI
  1591. SHLL $0x05, SI
  1592. ORL SI, BX
  1593. MOVB BL, (AX)
  1594. ADDQ $0x02, AX
  1595. JMP repeat_end_emit_encodeBlockAsm4MB
  1596. repeat_as_copy_encodeBlockAsm4MB:
  1597. // emitCopy
  1598. CMPL SI, $0x00010000
  1599. JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
  1600. CMPL BX, $0x40
  1601. JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
  1602. MOVB $0xff, (AX)
  1603. MOVL SI, 1(AX)
  1604. LEAL -64(BX), BX
  1605. ADDQ $0x05, AX
  1606. CMPL BX, $0x04
  1607. JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
  1608. // emitRepeat
  1609. MOVL BX, DI
  1610. LEAL -4(BX), BX
  1611. CMPL DI, $0x08
  1612. JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1613. CMPL DI, $0x0c
  1614. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1615. CMPL SI, $0x00000800
  1616. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1617. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1618. CMPL BX, $0x00000104
  1619. JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1620. CMPL BX, $0x00010100
  1621. JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1622. LEAL -65536(BX), BX
  1623. MOVL BX, SI
  1624. MOVW $0x001d, (AX)
  1625. MOVW BX, 2(AX)
  1626. SARL $0x10, SI
  1627. MOVB SI, 4(AX)
  1628. ADDQ $0x05, AX
  1629. JMP repeat_end_emit_encodeBlockAsm4MB
  1630. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1631. LEAL -256(BX), BX
  1632. MOVW $0x0019, (AX)
  1633. MOVW BX, 2(AX)
  1634. ADDQ $0x04, AX
  1635. JMP repeat_end_emit_encodeBlockAsm4MB
  1636. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1637. LEAL -4(BX), BX
  1638. MOVW $0x0015, (AX)
  1639. MOVB BL, 2(AX)
  1640. ADDQ $0x03, AX
  1641. JMP repeat_end_emit_encodeBlockAsm4MB
  1642. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1643. SHLL $0x02, BX
  1644. ORL $0x01, BX
  1645. MOVW BX, (AX)
  1646. ADDQ $0x02, AX
  1647. JMP repeat_end_emit_encodeBlockAsm4MB
  1648. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1649. XORQ DI, DI
  1650. LEAL 1(DI)(BX*4), BX
  1651. MOVB SI, 1(AX)
  1652. SARL $0x08, SI
  1653. SHLL $0x05, SI
  1654. ORL SI, BX
  1655. MOVB BL, (AX)
  1656. ADDQ $0x02, AX
  1657. JMP repeat_end_emit_encodeBlockAsm4MB
  1658. four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
  1659. TESTL BX, BX
  1660. JZ repeat_end_emit_encodeBlockAsm4MB
  1661. XORL DI, DI
  1662. LEAL -1(DI)(BX*4), BX
  1663. MOVB BL, (AX)
  1664. MOVL SI, 1(AX)
  1665. ADDQ $0x05, AX
  1666. JMP repeat_end_emit_encodeBlockAsm4MB
  1667. two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
  1668. CMPL BX, $0x40
  1669. JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
  1670. CMPL SI, $0x00000800
  1671. JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
  1672. MOVL $0x00000001, DI
  1673. LEAL 16(DI), DI
  1674. MOVB SI, 1(AX)
  1675. SHRL $0x08, SI
  1676. SHLL $0x05, SI
  1677. ORL SI, DI
  1678. MOVB DI, (AX)
  1679. ADDQ $0x02, AX
  1680. SUBL $0x08, BX
  1681. // emitRepeat
  1682. LEAL -4(BX), BX
  1683. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1684. MOVL BX, DI
  1685. LEAL -4(BX), BX
  1686. CMPL DI, $0x08
  1687. JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1688. CMPL DI, $0x0c
  1689. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1690. CMPL SI, $0x00000800
  1691. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1692. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1693. CMPL BX, $0x00000104
  1694. JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1695. CMPL BX, $0x00010100
  1696. JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1697. LEAL -65536(BX), BX
  1698. MOVL BX, SI
  1699. MOVW $0x001d, (AX)
  1700. MOVW BX, 2(AX)
  1701. SARL $0x10, SI
  1702. MOVB SI, 4(AX)
  1703. ADDQ $0x05, AX
  1704. JMP repeat_end_emit_encodeBlockAsm4MB
  1705. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1706. LEAL -256(BX), BX
  1707. MOVW $0x0019, (AX)
  1708. MOVW BX, 2(AX)
  1709. ADDQ $0x04, AX
  1710. JMP repeat_end_emit_encodeBlockAsm4MB
  1711. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1712. LEAL -4(BX), BX
  1713. MOVW $0x0015, (AX)
  1714. MOVB BL, 2(AX)
  1715. ADDQ $0x03, AX
  1716. JMP repeat_end_emit_encodeBlockAsm4MB
  1717. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1718. SHLL $0x02, BX
  1719. ORL $0x01, BX
  1720. MOVW BX, (AX)
  1721. ADDQ $0x02, AX
  1722. JMP repeat_end_emit_encodeBlockAsm4MB
  1723. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1724. XORQ DI, DI
  1725. LEAL 1(DI)(BX*4), BX
  1726. MOVB SI, 1(AX)
  1727. SARL $0x08, SI
  1728. SHLL $0x05, SI
  1729. ORL SI, BX
  1730. MOVB BL, (AX)
  1731. ADDQ $0x02, AX
  1732. JMP repeat_end_emit_encodeBlockAsm4MB
  1733. long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
  1734. MOVB $0xee, (AX)
  1735. MOVW SI, 1(AX)
  1736. LEAL -60(BX), BX
  1737. ADDQ $0x03, AX
  1738. // emitRepeat
  1739. MOVL BX, DI
  1740. LEAL -4(BX), BX
  1741. CMPL DI, $0x08
  1742. JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1743. CMPL DI, $0x0c
  1744. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1745. CMPL SI, $0x00000800
  1746. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1747. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1748. CMPL BX, $0x00000104
  1749. JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1750. CMPL BX, $0x00010100
  1751. JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1752. LEAL -65536(BX), BX
  1753. MOVL BX, SI
  1754. MOVW $0x001d, (AX)
  1755. MOVW BX, 2(AX)
  1756. SARL $0x10, SI
  1757. MOVB SI, 4(AX)
  1758. ADDQ $0x05, AX
  1759. JMP repeat_end_emit_encodeBlockAsm4MB
  1760. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1761. LEAL -256(BX), BX
  1762. MOVW $0x0019, (AX)
  1763. MOVW BX, 2(AX)
  1764. ADDQ $0x04, AX
  1765. JMP repeat_end_emit_encodeBlockAsm4MB
  1766. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1767. LEAL -4(BX), BX
  1768. MOVW $0x0015, (AX)
  1769. MOVB BL, 2(AX)
  1770. ADDQ $0x03, AX
  1771. JMP repeat_end_emit_encodeBlockAsm4MB
  1772. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1773. SHLL $0x02, BX
  1774. ORL $0x01, BX
  1775. MOVW BX, (AX)
  1776. ADDQ $0x02, AX
  1777. JMP repeat_end_emit_encodeBlockAsm4MB
  1778. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1779. XORQ DI, DI
  1780. LEAL 1(DI)(BX*4), BX
  1781. MOVB SI, 1(AX)
  1782. SARL $0x08, SI
  1783. SHLL $0x05, SI
  1784. ORL SI, BX
  1785. MOVB BL, (AX)
  1786. ADDQ $0x02, AX
  1787. JMP repeat_end_emit_encodeBlockAsm4MB
  1788. two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
  1789. MOVL BX, DI
  1790. SHLL $0x02, DI
  1791. CMPL BX, $0x0c
  1792. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
  1793. CMPL SI, $0x00000800
  1794. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
  1795. LEAL -15(DI), DI
  1796. MOVB SI, 1(AX)
  1797. SHRL $0x08, SI
  1798. SHLL $0x05, SI
  1799. ORL SI, DI
  1800. MOVB DI, (AX)
  1801. ADDQ $0x02, AX
  1802. JMP repeat_end_emit_encodeBlockAsm4MB
  1803. emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
  1804. LEAL -2(DI), DI
  1805. MOVB DI, (AX)
  1806. MOVW SI, 1(AX)
  1807. ADDQ $0x03, AX
  1808. repeat_end_emit_encodeBlockAsm4MB:
  1809. MOVL CX, 12(SP)
  1810. JMP search_loop_encodeBlockAsm4MB
  1811. no_repeat_found_encodeBlockAsm4MB:
  1812. CMPL (DX)(BX*1), SI
  1813. JEQ candidate_match_encodeBlockAsm4MB
  1814. SHRQ $0x08, SI
  1815. MOVL 24(SP)(R9*4), BX
  1816. LEAL 2(CX), R8
  1817. CMPL (DX)(DI*1), SI
  1818. JEQ candidate2_match_encodeBlockAsm4MB
  1819. MOVL R8, 24(SP)(R9*4)
  1820. SHRQ $0x08, SI
  1821. CMPL (DX)(BX*1), SI
  1822. JEQ candidate3_match_encodeBlockAsm4MB
  1823. MOVL 20(SP), CX
  1824. JMP search_loop_encodeBlockAsm4MB
  1825. candidate3_match_encodeBlockAsm4MB:
  1826. ADDL $0x02, CX
  1827. JMP candidate_match_encodeBlockAsm4MB
  1828. candidate2_match_encodeBlockAsm4MB:
  1829. MOVL R8, 24(SP)(R9*4)
  1830. INCL CX
  1831. MOVL DI, BX
  1832. candidate_match_encodeBlockAsm4MB:
  1833. MOVL 12(SP), SI
  1834. TESTL BX, BX
  1835. JZ match_extend_back_end_encodeBlockAsm4MB
  1836. match_extend_back_loop_encodeBlockAsm4MB:
  1837. CMPL CX, SI
  1838. JBE match_extend_back_end_encodeBlockAsm4MB
  1839. MOVB -1(DX)(BX*1), DI
  1840. MOVB -1(DX)(CX*1), R8
  1841. CMPB DI, R8
  1842. JNE match_extend_back_end_encodeBlockAsm4MB
  1843. LEAL -1(CX), CX
  1844. DECL BX
  1845. JZ match_extend_back_end_encodeBlockAsm4MB
  1846. JMP match_extend_back_loop_encodeBlockAsm4MB
  1847. match_extend_back_end_encodeBlockAsm4MB:
  1848. MOVL CX, SI
  1849. SUBL 12(SP), SI
  1850. LEAQ 4(AX)(SI*1), SI
  1851. CMPQ SI, (SP)
  1852. JB match_dst_size_check_encodeBlockAsm4MB
  1853. MOVQ $0x00000000, ret+48(FP)
  1854. RET
  1855. match_dst_size_check_encodeBlockAsm4MB:
  1856. MOVL CX, SI
  1857. MOVL 12(SP), DI
  1858. CMPL DI, SI
  1859. JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
  1860. MOVL SI, R8
  1861. MOVL SI, 12(SP)
  1862. LEAQ (DX)(DI*1), SI
  1863. SUBL DI, R8
  1864. LEAL -1(R8), DI
  1865. CMPL DI, $0x3c
  1866. JB one_byte_match_emit_encodeBlockAsm4MB
  1867. CMPL DI, $0x00000100
  1868. JB two_bytes_match_emit_encodeBlockAsm4MB
  1869. CMPL DI, $0x00010000
  1870. JB three_bytes_match_emit_encodeBlockAsm4MB
  1871. MOVL DI, R9
  1872. SHRL $0x10, R9
  1873. MOVB $0xf8, (AX)
  1874. MOVW DI, 1(AX)
  1875. MOVB R9, 3(AX)
  1876. ADDQ $0x04, AX
  1877. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1878. three_bytes_match_emit_encodeBlockAsm4MB:
  1879. MOVB $0xf4, (AX)
  1880. MOVW DI, 1(AX)
  1881. ADDQ $0x03, AX
  1882. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1883. two_bytes_match_emit_encodeBlockAsm4MB:
  1884. MOVB $0xf0, (AX)
  1885. MOVB DI, 1(AX)
  1886. ADDQ $0x02, AX
  1887. CMPL DI, $0x40
  1888. JB memmove_match_emit_encodeBlockAsm4MB
  1889. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1890. one_byte_match_emit_encodeBlockAsm4MB:
  1891. SHLB $0x02, DI
  1892. MOVB DI, (AX)
  1893. ADDQ $0x01, AX
  1894. memmove_match_emit_encodeBlockAsm4MB:
  1895. LEAQ (AX)(R8*1), DI
  1896. // genMemMoveShort
  1897. CMPQ R8, $0x08
  1898. JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
  1899. CMPQ R8, $0x10
  1900. JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
  1901. CMPQ R8, $0x20
  1902. JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
  1903. JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
  1904. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
  1905. MOVQ (SI), R9
  1906. MOVQ R9, (AX)
  1907. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1908. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
  1909. MOVQ (SI), R9
  1910. MOVQ -8(SI)(R8*1), SI
  1911. MOVQ R9, (AX)
  1912. MOVQ SI, -8(AX)(R8*1)
  1913. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1914. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
  1915. MOVOU (SI), X0
  1916. MOVOU -16(SI)(R8*1), X1
  1917. MOVOU X0, (AX)
  1918. MOVOU X1, -16(AX)(R8*1)
  1919. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1920. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
  1921. MOVOU (SI), X0
  1922. MOVOU 16(SI), X1
  1923. MOVOU -32(SI)(R8*1), X2
  1924. MOVOU -16(SI)(R8*1), X3
  1925. MOVOU X0, (AX)
  1926. MOVOU X1, 16(AX)
  1927. MOVOU X2, -32(AX)(R8*1)
  1928. MOVOU X3, -16(AX)(R8*1)
  1929. memmove_end_copy_match_emit_encodeBlockAsm4MB:
  1930. MOVQ DI, AX
  1931. JMP emit_literal_done_match_emit_encodeBlockAsm4MB
  1932. memmove_long_match_emit_encodeBlockAsm4MB:
  1933. LEAQ (AX)(R8*1), DI
  1934. // genMemMoveLong
  1935. MOVOU (SI), X0
  1936. MOVOU 16(SI), X1
  1937. MOVOU -32(SI)(R8*1), X2
  1938. MOVOU -16(SI)(R8*1), X3
  1939. MOVQ R8, R10
  1940. SHRQ $0x05, R10
  1941. MOVQ AX, R9
  1942. ANDL $0x0000001f, R9
  1943. MOVQ $0x00000040, R11
  1944. SUBQ R9, R11
  1945. DECQ R10
  1946. JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1947. LEAQ -32(SI)(R11*1), R9
  1948. LEAQ -32(AX)(R11*1), R12
  1949. emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
  1950. MOVOU (R9), X4
  1951. MOVOU 16(R9), X5
  1952. MOVOA X4, (R12)
  1953. MOVOA X5, 16(R12)
  1954. ADDQ $0x20, R12
  1955. ADDQ $0x20, R9
  1956. ADDQ $0x20, R11
  1957. DECQ R10
  1958. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
  1959. emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  1960. MOVOU -32(SI)(R11*1), X4
  1961. MOVOU -16(SI)(R11*1), X5
  1962. MOVOA X4, -32(AX)(R11*1)
  1963. MOVOA X5, -16(AX)(R11*1)
  1964. ADDQ $0x20, R11
  1965. CMPQ R8, R11
  1966. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1967. MOVOU X0, (AX)
  1968. MOVOU X1, 16(AX)
  1969. MOVOU X2, -32(AX)(R8*1)
  1970. MOVOU X3, -16(AX)(R8*1)
  1971. MOVQ DI, AX
  1972. emit_literal_done_match_emit_encodeBlockAsm4MB:
  1973. match_nolit_loop_encodeBlockAsm4MB:
  1974. MOVL CX, SI
  1975. SUBL BX, SI
  1976. MOVL SI, 16(SP)
  1977. ADDL $0x04, CX
  1978. ADDL $0x04, BX
  1979. MOVQ src_len+32(FP), SI
  1980. SUBL CX, SI
  1981. LEAQ (DX)(CX*1), DI
  1982. LEAQ (DX)(BX*1), BX
  1983. // matchLen
  1984. XORL R9, R9
  1985. matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
  1986. CMPL SI, $0x10
  1987. JB matchlen_match8_match_nolit_encodeBlockAsm4MB
  1988. MOVQ (DI)(R9*1), R8
  1989. MOVQ 8(DI)(R9*1), R10
  1990. XORQ (BX)(R9*1), R8
  1991. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
  1992. XORQ 8(BX)(R9*1), R10
  1993. JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB
  1994. LEAL -16(SI), SI
  1995. LEAL 16(R9), R9
  1996. JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB
  1997. matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
  1998. #ifdef GOAMD64_v3
  1999. TZCNTQ R10, R10
  2000. #else
  2001. BSFQ R10, R10
  2002. #endif
  2003. SARQ $0x03, R10
  2004. LEAL 8(R9)(R10*1), R9
  2005. JMP match_nolit_end_encodeBlockAsm4MB
  2006. matchlen_match8_match_nolit_encodeBlockAsm4MB:
  2007. CMPL SI, $0x08
  2008. JB matchlen_match4_match_nolit_encodeBlockAsm4MB
  2009. MOVQ (DI)(R9*1), R8
  2010. XORQ (BX)(R9*1), R8
  2011. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
  2012. LEAL -8(SI), SI
  2013. LEAL 8(R9), R9
  2014. JMP matchlen_match4_match_nolit_encodeBlockAsm4MB
  2015. matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
  2016. #ifdef GOAMD64_v3
  2017. TZCNTQ R8, R8
  2018. #else
  2019. BSFQ R8, R8
  2020. #endif
  2021. SARQ $0x03, R8
  2022. LEAL (R9)(R8*1), R9
  2023. JMP match_nolit_end_encodeBlockAsm4MB
  2024. matchlen_match4_match_nolit_encodeBlockAsm4MB:
  2025. CMPL SI, $0x04
  2026. JB matchlen_match2_match_nolit_encodeBlockAsm4MB
  2027. MOVL (DI)(R9*1), R8
  2028. CMPL (BX)(R9*1), R8
  2029. JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
  2030. LEAL -4(SI), SI
  2031. LEAL 4(R9), R9
  2032. matchlen_match2_match_nolit_encodeBlockAsm4MB:
  2033. CMPL SI, $0x01
  2034. JE matchlen_match1_match_nolit_encodeBlockAsm4MB
  2035. JB match_nolit_end_encodeBlockAsm4MB
  2036. MOVW (DI)(R9*1), R8
  2037. CMPW (BX)(R9*1), R8
  2038. JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
  2039. LEAL 2(R9), R9
  2040. SUBL $0x02, SI
  2041. JZ match_nolit_end_encodeBlockAsm4MB
  2042. matchlen_match1_match_nolit_encodeBlockAsm4MB:
  2043. MOVB (DI)(R9*1), R8
  2044. CMPB (BX)(R9*1), R8
  2045. JNE match_nolit_end_encodeBlockAsm4MB
  2046. LEAL 1(R9), R9
  2047. match_nolit_end_encodeBlockAsm4MB:
  2048. ADDL R9, CX
  2049. MOVL 16(SP), BX
  2050. ADDL $0x04, R9
  2051. MOVL CX, 12(SP)
  2052. // emitCopy
  2053. CMPL BX, $0x00010000
  2054. JB two_byte_offset_match_nolit_encodeBlockAsm4MB
  2055. CMPL R9, $0x40
  2056. JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB
  2057. MOVB $0xff, (AX)
  2058. MOVL BX, 1(AX)
  2059. LEAL -64(R9), R9
  2060. ADDQ $0x05, AX
  2061. CMPL R9, $0x04
  2062. JB four_bytes_remain_match_nolit_encodeBlockAsm4MB
  2063. // emitRepeat
  2064. MOVL R9, SI
  2065. LEAL -4(R9), R9
  2066. CMPL SI, $0x08
  2067. JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
  2068. CMPL SI, $0x0c
  2069. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
  2070. CMPL BX, $0x00000800
  2071. JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
  2072. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
  2073. CMPL R9, $0x00000104
  2074. JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
  2075. CMPL R9, $0x00010100
  2076. JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
  2077. LEAL -65536(R9), R9
  2078. MOVL R9, BX
  2079. MOVW $0x001d, (AX)
  2080. MOVW R9, 2(AX)
  2081. SARL $0x10, BX
  2082. MOVB BL, 4(AX)
  2083. ADDQ $0x05, AX
  2084. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2085. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
  2086. LEAL -256(R9), R9
  2087. MOVW $0x0019, (AX)
  2088. MOVW R9, 2(AX)
  2089. ADDQ $0x04, AX
  2090. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2091. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
  2092. LEAL -4(R9), R9
  2093. MOVW $0x0015, (AX)
  2094. MOVB R9, 2(AX)
  2095. ADDQ $0x03, AX
  2096. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2097. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
  2098. SHLL $0x02, R9
  2099. ORL $0x01, R9
  2100. MOVW R9, (AX)
  2101. ADDQ $0x02, AX
  2102. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2103. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
  2104. XORQ SI, SI
  2105. LEAL 1(SI)(R9*4), R9
  2106. MOVB BL, 1(AX)
  2107. SARL $0x08, BX
  2108. SHLL $0x05, BX
  2109. ORL BX, R9
  2110. MOVB R9, (AX)
  2111. ADDQ $0x02, AX
  2112. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2113. four_bytes_remain_match_nolit_encodeBlockAsm4MB:
  2114. TESTL R9, R9
  2115. JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
  2116. XORL SI, SI
  2117. LEAL -1(SI)(R9*4), R9
  2118. MOVB R9, (AX)
  2119. MOVL BX, 1(AX)
  2120. ADDQ $0x05, AX
  2121. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2122. two_byte_offset_match_nolit_encodeBlockAsm4MB:
  2123. CMPL R9, $0x40
  2124. JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
  2125. CMPL BX, $0x00000800
  2126. JAE long_offset_short_match_nolit_encodeBlockAsm4MB
  2127. MOVL $0x00000001, SI
  2128. LEAL 16(SI), SI
  2129. MOVB BL, 1(AX)
  2130. SHRL $0x08, BX
  2131. SHLL $0x05, BX
  2132. ORL BX, SI
  2133. MOVB SI, (AX)
  2134. ADDQ $0x02, AX
  2135. SUBL $0x08, R9
  2136. // emitRepeat
  2137. LEAL -4(R9), R9
  2138. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2139. MOVL R9, SI
  2140. LEAL -4(R9), R9
  2141. CMPL SI, $0x08
  2142. JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2143. CMPL SI, $0x0c
  2144. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2145. CMPL BX, $0x00000800
  2146. JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2147. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2148. CMPL R9, $0x00000104
  2149. JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2150. CMPL R9, $0x00010100
  2151. JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2152. LEAL -65536(R9), R9
  2153. MOVL R9, BX
  2154. MOVW $0x001d, (AX)
  2155. MOVW R9, 2(AX)
  2156. SARL $0x10, BX
  2157. MOVB BL, 4(AX)
  2158. ADDQ $0x05, AX
  2159. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2160. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2161. LEAL -256(R9), R9
  2162. MOVW $0x0019, (AX)
  2163. MOVW R9, 2(AX)
  2164. ADDQ $0x04, AX
  2165. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2166. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2167. LEAL -4(R9), R9
  2168. MOVW $0x0015, (AX)
  2169. MOVB R9, 2(AX)
  2170. ADDQ $0x03, AX
  2171. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2172. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2173. SHLL $0x02, R9
  2174. ORL $0x01, R9
  2175. MOVW R9, (AX)
  2176. ADDQ $0x02, AX
  2177. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2178. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2179. XORQ SI, SI
  2180. LEAL 1(SI)(R9*4), R9
  2181. MOVB BL, 1(AX)
  2182. SARL $0x08, BX
  2183. SHLL $0x05, BX
  2184. ORL BX, R9
  2185. MOVB R9, (AX)
  2186. ADDQ $0x02, AX
  2187. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2188. long_offset_short_match_nolit_encodeBlockAsm4MB:
  2189. MOVB $0xee, (AX)
  2190. MOVW BX, 1(AX)
  2191. LEAL -60(R9), R9
  2192. ADDQ $0x03, AX
  2193. // emitRepeat
  2194. MOVL R9, SI
  2195. LEAL -4(R9), R9
  2196. CMPL SI, $0x08
  2197. JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2198. CMPL SI, $0x0c
  2199. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2200. CMPL BX, $0x00000800
  2201. JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2202. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2203. CMPL R9, $0x00000104
  2204. JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2205. CMPL R9, $0x00010100
  2206. JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2207. LEAL -65536(R9), R9
  2208. MOVL R9, BX
  2209. MOVW $0x001d, (AX)
  2210. MOVW R9, 2(AX)
  2211. SARL $0x10, BX
  2212. MOVB BL, 4(AX)
  2213. ADDQ $0x05, AX
  2214. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2215. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2216. LEAL -256(R9), R9
  2217. MOVW $0x0019, (AX)
  2218. MOVW R9, 2(AX)
  2219. ADDQ $0x04, AX
  2220. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2221. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2222. LEAL -4(R9), R9
  2223. MOVW $0x0015, (AX)
  2224. MOVB R9, 2(AX)
  2225. ADDQ $0x03, AX
  2226. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2227. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2228. SHLL $0x02, R9
  2229. ORL $0x01, R9
  2230. MOVW R9, (AX)
  2231. ADDQ $0x02, AX
  2232. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2233. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2234. XORQ SI, SI
  2235. LEAL 1(SI)(R9*4), R9
  2236. MOVB BL, 1(AX)
  2237. SARL $0x08, BX
  2238. SHLL $0x05, BX
  2239. ORL BX, R9
  2240. MOVB R9, (AX)
  2241. ADDQ $0x02, AX
  2242. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2243. two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
  2244. MOVL R9, SI
  2245. SHLL $0x02, SI
  2246. CMPL R9, $0x0c
  2247. JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
  2248. CMPL BX, $0x00000800
  2249. JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
  2250. LEAL -15(SI), SI
  2251. MOVB BL, 1(AX)
  2252. SHRL $0x08, BX
  2253. SHLL $0x05, BX
  2254. ORL BX, SI
  2255. MOVB SI, (AX)
  2256. ADDQ $0x02, AX
  2257. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2258. emit_copy_three_match_nolit_encodeBlockAsm4MB:
  2259. LEAL -2(SI), SI
  2260. MOVB SI, (AX)
  2261. MOVW BX, 1(AX)
  2262. ADDQ $0x03, AX
  2263. match_nolit_emitcopy_end_encodeBlockAsm4MB:
  2264. CMPL CX, 8(SP)
  2265. JAE emit_remainder_encodeBlockAsm4MB
  2266. MOVQ -2(DX)(CX*1), SI
  2267. CMPQ AX, (SP)
  2268. JB match_nolit_dst_ok_encodeBlockAsm4MB
  2269. MOVQ $0x00000000, ret+48(FP)
  2270. RET
  2271. match_nolit_dst_ok_encodeBlockAsm4MB:
  2272. MOVQ $0x0000cf1bbcdcbf9b, R8
  2273. MOVQ SI, DI
  2274. SHRQ $0x10, SI
  2275. MOVQ SI, BX
  2276. SHLQ $0x10, DI
  2277. IMULQ R8, DI
  2278. SHRQ $0x32, DI
  2279. SHLQ $0x10, BX
  2280. IMULQ R8, BX
  2281. SHRQ $0x32, BX
  2282. LEAL -2(CX), R8
  2283. LEAQ 24(SP)(BX*4), R9
  2284. MOVL (R9), BX
  2285. MOVL R8, 24(SP)(DI*4)
  2286. MOVL CX, (R9)
  2287. CMPL (DX)(BX*1), SI
  2288. JEQ match_nolit_loop_encodeBlockAsm4MB
  2289. INCL CX
  2290. JMP search_loop_encodeBlockAsm4MB
  2291. emit_remainder_encodeBlockAsm4MB:
  2292. MOVQ src_len+32(FP), CX
  2293. SUBL 12(SP), CX
  2294. LEAQ 4(AX)(CX*1), CX
  2295. CMPQ CX, (SP)
  2296. JB emit_remainder_ok_encodeBlockAsm4MB
  2297. MOVQ $0x00000000, ret+48(FP)
  2298. RET
  2299. emit_remainder_ok_encodeBlockAsm4MB:
  2300. MOVQ src_len+32(FP), CX
  2301. MOVL 12(SP), BX
  2302. CMPL BX, CX
  2303. JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
  2304. MOVL CX, SI
  2305. MOVL CX, 12(SP)
  2306. LEAQ (DX)(BX*1), CX
  2307. SUBL BX, SI
  2308. LEAL -1(SI), DX
  2309. CMPL DX, $0x3c
  2310. JB one_byte_emit_remainder_encodeBlockAsm4MB
  2311. CMPL DX, $0x00000100
  2312. JB two_bytes_emit_remainder_encodeBlockAsm4MB
  2313. CMPL DX, $0x00010000
  2314. JB three_bytes_emit_remainder_encodeBlockAsm4MB
  2315. MOVL DX, BX
  2316. SHRL $0x10, BX
  2317. MOVB $0xf8, (AX)
  2318. MOVW DX, 1(AX)
  2319. MOVB BL, 3(AX)
  2320. ADDQ $0x04, AX
  2321. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2322. three_bytes_emit_remainder_encodeBlockAsm4MB:
  2323. MOVB $0xf4, (AX)
  2324. MOVW DX, 1(AX)
  2325. ADDQ $0x03, AX
  2326. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2327. two_bytes_emit_remainder_encodeBlockAsm4MB:
  2328. MOVB $0xf0, (AX)
  2329. MOVB DL, 1(AX)
  2330. ADDQ $0x02, AX
  2331. CMPL DX, $0x40
  2332. JB memmove_emit_remainder_encodeBlockAsm4MB
  2333. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2334. one_byte_emit_remainder_encodeBlockAsm4MB:
  2335. SHLB $0x02, DL
  2336. MOVB DL, (AX)
  2337. ADDQ $0x01, AX
  2338. memmove_emit_remainder_encodeBlockAsm4MB:
  2339. LEAQ (AX)(SI*1), DX
  2340. MOVL SI, BX
  2341. // genMemMoveShort
  2342. CMPQ BX, $0x03
  2343. JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
  2344. JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
  2345. CMPQ BX, $0x08
  2346. JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
  2347. CMPQ BX, $0x10
  2348. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
  2349. CMPQ BX, $0x20
  2350. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
  2351. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
  2352. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
  2353. MOVB (CX), SI
  2354. MOVB -1(CX)(BX*1), CL
  2355. MOVB SI, (AX)
  2356. MOVB CL, -1(AX)(BX*1)
  2357. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2358. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
  2359. MOVW (CX), SI
  2360. MOVB 2(CX), CL
  2361. MOVW SI, (AX)
  2362. MOVB CL, 2(AX)
  2363. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2364. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
  2365. MOVL (CX), SI
  2366. MOVL -4(CX)(BX*1), CX
  2367. MOVL SI, (AX)
  2368. MOVL CX, -4(AX)(BX*1)
  2369. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2370. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
  2371. MOVQ (CX), SI
  2372. MOVQ -8(CX)(BX*1), CX
  2373. MOVQ SI, (AX)
  2374. MOVQ CX, -8(AX)(BX*1)
  2375. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2376. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
  2377. MOVOU (CX), X0
  2378. MOVOU -16(CX)(BX*1), X1
  2379. MOVOU X0, (AX)
  2380. MOVOU X1, -16(AX)(BX*1)
  2381. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2382. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
  2383. MOVOU (CX), X0
  2384. MOVOU 16(CX), X1
  2385. MOVOU -32(CX)(BX*1), X2
  2386. MOVOU -16(CX)(BX*1), X3
  2387. MOVOU X0, (AX)
  2388. MOVOU X1, 16(AX)
  2389. MOVOU X2, -32(AX)(BX*1)
  2390. MOVOU X3, -16(AX)(BX*1)
  2391. memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
  2392. MOVQ DX, AX
  2393. JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
  2394. memmove_long_emit_remainder_encodeBlockAsm4MB:
  2395. LEAQ (AX)(SI*1), DX
  2396. MOVL SI, BX
  2397. // genMemMoveLong
  2398. MOVOU (CX), X0
  2399. MOVOU 16(CX), X1
  2400. MOVOU -32(CX)(BX*1), X2
  2401. MOVOU -16(CX)(BX*1), X3
  2402. MOVQ BX, DI
  2403. SHRQ $0x05, DI
  2404. MOVQ AX, SI
  2405. ANDL $0x0000001f, SI
  2406. MOVQ $0x00000040, R8
  2407. SUBQ SI, R8
  2408. DECQ DI
  2409. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
  2410. LEAQ -32(CX)(R8*1), SI
  2411. LEAQ -32(AX)(R8*1), R9
  2412. emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
  2413. MOVOU (SI), X4
  2414. MOVOU 16(SI), X5
  2415. MOVOA X4, (R9)
  2416. MOVOA X5, 16(R9)
  2417. ADDQ $0x20, R9
  2418. ADDQ $0x20, SI
  2419. ADDQ $0x20, R8
  2420. DECQ DI
  2421. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
  2422. emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  2423. MOVOU -32(CX)(R8*1), X4
  2424. MOVOU -16(CX)(R8*1), X5
  2425. MOVOA X4, -32(AX)(R8*1)
  2426. MOVOA X5, -16(AX)(R8*1)
  2427. ADDQ $0x20, R8
  2428. CMPQ BX, R8
  2429. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
  2430. MOVOU X0, (AX)
  2431. MOVOU X1, 16(AX)
  2432. MOVOU X2, -32(AX)(BX*1)
  2433. MOVOU X3, -16(AX)(BX*1)
  2434. MOVQ DX, AX
  2435. emit_literal_done_emit_remainder_encodeBlockAsm4MB:
  2436. MOVQ dst_base+0(FP), CX
  2437. SUBQ CX, AX
  2438. MOVQ AX, ret+48(FP)
  2439. RET
  2440. // func encodeBlockAsm12B(dst []byte, src []byte) int
  2441. // Requires: BMI, SSE2
  2442. TEXT ·encodeBlockAsm12B(SB), $16408-56
  2443. MOVQ dst_base+0(FP), AX
  2444. MOVQ $0x00000080, CX
  2445. LEAQ 24(SP), DX
  2446. PXOR X0, X0
  2447. zero_loop_encodeBlockAsm12B:
  2448. MOVOU X0, (DX)
  2449. MOVOU X0, 16(DX)
  2450. MOVOU X0, 32(DX)
  2451. MOVOU X0, 48(DX)
  2452. MOVOU X0, 64(DX)
  2453. MOVOU X0, 80(DX)
  2454. MOVOU X0, 96(DX)
  2455. MOVOU X0, 112(DX)
  2456. ADDQ $0x80, DX
  2457. DECQ CX
  2458. JNZ zero_loop_encodeBlockAsm12B
  2459. MOVL $0x00000000, 12(SP)
  2460. MOVQ src_len+32(FP), CX
  2461. LEAQ -9(CX), DX
  2462. LEAQ -8(CX), BX
  2463. MOVL BX, 8(SP)
  2464. SHRQ $0x05, CX
  2465. SUBL CX, DX
  2466. LEAQ (AX)(DX*1), DX
  2467. MOVQ DX, (SP)
  2468. MOVL $0x00000001, CX
  2469. MOVL CX, 16(SP)
  2470. MOVQ src_base+24(FP), DX
  2471. search_loop_encodeBlockAsm12B:
  2472. MOVL CX, BX
  2473. SUBL 12(SP), BX
  2474. SHRL $0x05, BX
  2475. LEAL 4(CX)(BX*1), BX
  2476. CMPL BX, 8(SP)
  2477. JAE emit_remainder_encodeBlockAsm12B
  2478. MOVQ (DX)(CX*1), SI
  2479. MOVL BX, 20(SP)
  2480. MOVQ $0x000000cf1bbcdcbb, R8
  2481. MOVQ SI, R9
  2482. MOVQ SI, R10
  2483. SHRQ $0x08, R10
  2484. SHLQ $0x18, R9
  2485. IMULQ R8, R9
  2486. SHRQ $0x34, R9
  2487. SHLQ $0x18, R10
  2488. IMULQ R8, R10
  2489. SHRQ $0x34, R10
  2490. MOVL 24(SP)(R9*4), BX
  2491. MOVL 24(SP)(R10*4), DI
  2492. MOVL CX, 24(SP)(R9*4)
  2493. LEAL 1(CX), R9
  2494. MOVL R9, 24(SP)(R10*4)
  2495. MOVQ SI, R9
  2496. SHRQ $0x10, R9
  2497. SHLQ $0x18, R9
  2498. IMULQ R8, R9
  2499. SHRQ $0x34, R9
  2500. MOVL CX, R8
  2501. SUBL 16(SP), R8
  2502. MOVL 1(DX)(R8*1), R10
  2503. MOVQ SI, R8
  2504. SHRQ $0x08, R8
  2505. CMPL R8, R10
  2506. JNE no_repeat_found_encodeBlockAsm12B
  2507. LEAL 1(CX), SI
  2508. MOVL 12(SP), DI
  2509. MOVL SI, BX
  2510. SUBL 16(SP), BX
  2511. JZ repeat_extend_back_end_encodeBlockAsm12B
  2512. repeat_extend_back_loop_encodeBlockAsm12B:
  2513. CMPL SI, DI
  2514. JBE repeat_extend_back_end_encodeBlockAsm12B
  2515. MOVB -1(DX)(BX*1), R8
  2516. MOVB -1(DX)(SI*1), R9
  2517. CMPB R8, R9
  2518. JNE repeat_extend_back_end_encodeBlockAsm12B
  2519. LEAL -1(SI), SI
  2520. DECL BX
  2521. JNZ repeat_extend_back_loop_encodeBlockAsm12B
  2522. repeat_extend_back_end_encodeBlockAsm12B:
  2523. MOVL 12(SP), BX
  2524. CMPL BX, SI
  2525. JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
  2526. MOVL SI, R8
  2527. MOVL SI, 12(SP)
  2528. LEAQ (DX)(BX*1), R9
  2529. SUBL BX, R8
  2530. LEAL -1(R8), BX
  2531. CMPL BX, $0x3c
  2532. JB one_byte_repeat_emit_encodeBlockAsm12B
  2533. CMPL BX, $0x00000100
  2534. JB two_bytes_repeat_emit_encodeBlockAsm12B
  2535. JB three_bytes_repeat_emit_encodeBlockAsm12B
  2536. three_bytes_repeat_emit_encodeBlockAsm12B:
  2537. MOVB $0xf4, (AX)
  2538. MOVW BX, 1(AX)
  2539. ADDQ $0x03, AX
  2540. JMP memmove_long_repeat_emit_encodeBlockAsm12B
  2541. two_bytes_repeat_emit_encodeBlockAsm12B:
  2542. MOVB $0xf0, (AX)
  2543. MOVB BL, 1(AX)
  2544. ADDQ $0x02, AX
  2545. CMPL BX, $0x40
  2546. JB memmove_repeat_emit_encodeBlockAsm12B
  2547. JMP memmove_long_repeat_emit_encodeBlockAsm12B
  2548. one_byte_repeat_emit_encodeBlockAsm12B:
  2549. SHLB $0x02, BL
  2550. MOVB BL, (AX)
  2551. ADDQ $0x01, AX
  2552. memmove_repeat_emit_encodeBlockAsm12B:
  2553. LEAQ (AX)(R8*1), BX
  2554. // genMemMoveShort
  2555. CMPQ R8, $0x08
  2556. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
  2557. CMPQ R8, $0x10
  2558. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
  2559. CMPQ R8, $0x20
  2560. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
  2561. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
  2562. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
  2563. MOVQ (R9), R10
  2564. MOVQ R10, (AX)
  2565. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2566. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
  2567. MOVQ (R9), R10
  2568. MOVQ -8(R9)(R8*1), R9
  2569. MOVQ R10, (AX)
  2570. MOVQ R9, -8(AX)(R8*1)
  2571. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2572. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
  2573. MOVOU (R9), X0
  2574. MOVOU -16(R9)(R8*1), X1
  2575. MOVOU X0, (AX)
  2576. MOVOU X1, -16(AX)(R8*1)
  2577. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2578. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
  2579. MOVOU (R9), X0
  2580. MOVOU 16(R9), X1
  2581. MOVOU -32(R9)(R8*1), X2
  2582. MOVOU -16(R9)(R8*1), X3
  2583. MOVOU X0, (AX)
  2584. MOVOU X1, 16(AX)
  2585. MOVOU X2, -32(AX)(R8*1)
  2586. MOVOU X3, -16(AX)(R8*1)
  2587. memmove_end_copy_repeat_emit_encodeBlockAsm12B:
  2588. MOVQ BX, AX
  2589. JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
  2590. memmove_long_repeat_emit_encodeBlockAsm12B:
  2591. LEAQ (AX)(R8*1), BX
  2592. // genMemMoveLong
  2593. MOVOU (R9), X0
  2594. MOVOU 16(R9), X1
  2595. MOVOU -32(R9)(R8*1), X2
  2596. MOVOU -16(R9)(R8*1), X3
  2597. MOVQ R8, R11
  2598. SHRQ $0x05, R11
  2599. MOVQ AX, R10
  2600. ANDL $0x0000001f, R10
  2601. MOVQ $0x00000040, R12
  2602. SUBQ R10, R12
  2603. DECQ R11
  2604. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2605. LEAQ -32(R9)(R12*1), R10
  2606. LEAQ -32(AX)(R12*1), R13
  2607. emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
  2608. MOVOU (R10), X4
  2609. MOVOU 16(R10), X5
  2610. MOVOA X4, (R13)
  2611. MOVOA X5, 16(R13)
  2612. ADDQ $0x20, R13
  2613. ADDQ $0x20, R10
  2614. ADDQ $0x20, R12
  2615. DECQ R11
  2616. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
  2617. emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
  2618. MOVOU -32(R9)(R12*1), X4
  2619. MOVOU -16(R9)(R12*1), X5
  2620. MOVOA X4, -32(AX)(R12*1)
  2621. MOVOA X5, -16(AX)(R12*1)
  2622. ADDQ $0x20, R12
  2623. CMPQ R8, R12
  2624. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2625. MOVOU X0, (AX)
  2626. MOVOU X1, 16(AX)
  2627. MOVOU X2, -32(AX)(R8*1)
  2628. MOVOU X3, -16(AX)(R8*1)
  2629. MOVQ BX, AX
  2630. emit_literal_done_repeat_emit_encodeBlockAsm12B:
  2631. ADDL $0x05, CX
  2632. MOVL CX, BX
  2633. SUBL 16(SP), BX
  2634. MOVQ src_len+32(FP), R8
  2635. SUBL CX, R8
  2636. LEAQ (DX)(CX*1), R9
  2637. LEAQ (DX)(BX*1), BX
  2638. // matchLen
  2639. XORL R11, R11
  2640. matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
  2641. CMPL R8, $0x10
  2642. JB matchlen_match8_repeat_extend_encodeBlockAsm12B
  2643. MOVQ (R9)(R11*1), R10
  2644. MOVQ 8(R9)(R11*1), R12
  2645. XORQ (BX)(R11*1), R10
  2646. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
  2647. XORQ 8(BX)(R11*1), R12
  2648. JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B
  2649. LEAL -16(R8), R8
  2650. LEAL 16(R11), R11
  2651. JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B
  2652. matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
  2653. #ifdef GOAMD64_v3
  2654. TZCNTQ R12, R12
  2655. #else
  2656. BSFQ R12, R12
  2657. #endif
  2658. SARQ $0x03, R12
  2659. LEAL 8(R11)(R12*1), R11
  2660. JMP repeat_extend_forward_end_encodeBlockAsm12B
  2661. matchlen_match8_repeat_extend_encodeBlockAsm12B:
  2662. CMPL R8, $0x08
  2663. JB matchlen_match4_repeat_extend_encodeBlockAsm12B
  2664. MOVQ (R9)(R11*1), R10
  2665. XORQ (BX)(R11*1), R10
  2666. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
  2667. LEAL -8(R8), R8
  2668. LEAL 8(R11), R11
  2669. JMP matchlen_match4_repeat_extend_encodeBlockAsm12B
  2670. matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
  2671. #ifdef GOAMD64_v3
  2672. TZCNTQ R10, R10
  2673. #else
  2674. BSFQ R10, R10
  2675. #endif
  2676. SARQ $0x03, R10
  2677. LEAL (R11)(R10*1), R11
  2678. JMP repeat_extend_forward_end_encodeBlockAsm12B
  2679. matchlen_match4_repeat_extend_encodeBlockAsm12B:
  2680. CMPL R8, $0x04
  2681. JB matchlen_match2_repeat_extend_encodeBlockAsm12B
  2682. MOVL (R9)(R11*1), R10
  2683. CMPL (BX)(R11*1), R10
  2684. JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
  2685. LEAL -4(R8), R8
  2686. LEAL 4(R11), R11
  2687. matchlen_match2_repeat_extend_encodeBlockAsm12B:
  2688. CMPL R8, $0x01
  2689. JE matchlen_match1_repeat_extend_encodeBlockAsm12B
  2690. JB repeat_extend_forward_end_encodeBlockAsm12B
  2691. MOVW (R9)(R11*1), R10
  2692. CMPW (BX)(R11*1), R10
  2693. JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
  2694. LEAL 2(R11), R11
  2695. SUBL $0x02, R8
  2696. JZ repeat_extend_forward_end_encodeBlockAsm12B
  2697. matchlen_match1_repeat_extend_encodeBlockAsm12B:
  2698. MOVB (R9)(R11*1), R10
  2699. CMPB (BX)(R11*1), R10
  2700. JNE repeat_extend_forward_end_encodeBlockAsm12B
  2701. LEAL 1(R11), R11
  2702. repeat_extend_forward_end_encodeBlockAsm12B:
  2703. ADDL R11, CX
  2704. MOVL CX, BX
  2705. SUBL SI, BX
  2706. MOVL 16(SP), SI
  2707. TESTL DI, DI
  2708. JZ repeat_as_copy_encodeBlockAsm12B
  2709. // emitRepeat
  2710. MOVL BX, DI
  2711. LEAL -4(BX), BX
  2712. CMPL DI, $0x08
  2713. JBE repeat_two_match_repeat_encodeBlockAsm12B
  2714. CMPL DI, $0x0c
  2715. JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
  2716. CMPL SI, $0x00000800
  2717. JB repeat_two_offset_match_repeat_encodeBlockAsm12B
  2718. cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
  2719. CMPL BX, $0x00000104
  2720. JB repeat_three_match_repeat_encodeBlockAsm12B
  2721. LEAL -256(BX), BX
  2722. MOVW $0x0019, (AX)
  2723. MOVW BX, 2(AX)
  2724. ADDQ $0x04, AX
  2725. JMP repeat_end_emit_encodeBlockAsm12B
  2726. repeat_three_match_repeat_encodeBlockAsm12B:
  2727. LEAL -4(BX), BX
  2728. MOVW $0x0015, (AX)
  2729. MOVB BL, 2(AX)
  2730. ADDQ $0x03, AX
  2731. JMP repeat_end_emit_encodeBlockAsm12B
  2732. repeat_two_match_repeat_encodeBlockAsm12B:
  2733. SHLL $0x02, BX
  2734. ORL $0x01, BX
  2735. MOVW BX, (AX)
  2736. ADDQ $0x02, AX
  2737. JMP repeat_end_emit_encodeBlockAsm12B
  2738. repeat_two_offset_match_repeat_encodeBlockAsm12B:
  2739. XORQ DI, DI
  2740. LEAL 1(DI)(BX*4), BX
  2741. MOVB SI, 1(AX)
  2742. SARL $0x08, SI
  2743. SHLL $0x05, SI
  2744. ORL SI, BX
  2745. MOVB BL, (AX)
  2746. ADDQ $0x02, AX
  2747. JMP repeat_end_emit_encodeBlockAsm12B
  2748. repeat_as_copy_encodeBlockAsm12B:
  2749. // emitCopy
  2750. CMPL BX, $0x40
  2751. JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
  2752. CMPL SI, $0x00000800
  2753. JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
  2754. MOVL $0x00000001, DI
  2755. LEAL 16(DI), DI
  2756. MOVB SI, 1(AX)
  2757. SHRL $0x08, SI
  2758. SHLL $0x05, SI
  2759. ORL SI, DI
  2760. MOVB DI, (AX)
  2761. ADDQ $0x02, AX
  2762. SUBL $0x08, BX
  2763. // emitRepeat
  2764. LEAL -4(BX), BX
  2765. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2766. MOVL BX, DI
  2767. LEAL -4(BX), BX
  2768. CMPL DI, $0x08
  2769. JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2770. CMPL DI, $0x0c
  2771. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2772. CMPL SI, $0x00000800
  2773. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2774. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2775. CMPL BX, $0x00000104
  2776. JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2777. LEAL -256(BX), BX
  2778. MOVW $0x0019, (AX)
  2779. MOVW BX, 2(AX)
  2780. ADDQ $0x04, AX
  2781. JMP repeat_end_emit_encodeBlockAsm12B
  2782. repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2783. LEAL -4(BX), BX
  2784. MOVW $0x0015, (AX)
  2785. MOVB BL, 2(AX)
  2786. ADDQ $0x03, AX
  2787. JMP repeat_end_emit_encodeBlockAsm12B
  2788. repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2789. SHLL $0x02, BX
  2790. ORL $0x01, BX
  2791. MOVW BX, (AX)
  2792. ADDQ $0x02, AX
  2793. JMP repeat_end_emit_encodeBlockAsm12B
  2794. repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2795. XORQ DI, DI
  2796. LEAL 1(DI)(BX*4), BX
  2797. MOVB SI, 1(AX)
  2798. SARL $0x08, SI
  2799. SHLL $0x05, SI
  2800. ORL SI, BX
  2801. MOVB BL, (AX)
  2802. ADDQ $0x02, AX
  2803. JMP repeat_end_emit_encodeBlockAsm12B
  2804. long_offset_short_repeat_as_copy_encodeBlockAsm12B:
  2805. MOVB $0xee, (AX)
  2806. MOVW SI, 1(AX)
  2807. LEAL -60(BX), BX
  2808. ADDQ $0x03, AX
  2809. // emitRepeat
  2810. MOVL BX, DI
  2811. LEAL -4(BX), BX
  2812. CMPL DI, $0x08
  2813. JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2814. CMPL DI, $0x0c
  2815. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2816. CMPL SI, $0x00000800
  2817. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2818. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2819. CMPL BX, $0x00000104
  2820. JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2821. LEAL -256(BX), BX
  2822. MOVW $0x0019, (AX)
  2823. MOVW BX, 2(AX)
  2824. ADDQ $0x04, AX
  2825. JMP repeat_end_emit_encodeBlockAsm12B
  2826. repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2827. LEAL -4(BX), BX
  2828. MOVW $0x0015, (AX)
  2829. MOVB BL, 2(AX)
  2830. ADDQ $0x03, AX
  2831. JMP repeat_end_emit_encodeBlockAsm12B
  2832. repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2833. SHLL $0x02, BX
  2834. ORL $0x01, BX
  2835. MOVW BX, (AX)
  2836. ADDQ $0x02, AX
  2837. JMP repeat_end_emit_encodeBlockAsm12B
  2838. repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2839. XORQ DI, DI
  2840. LEAL 1(DI)(BX*4), BX
  2841. MOVB SI, 1(AX)
  2842. SARL $0x08, SI
  2843. SHLL $0x05, SI
  2844. ORL SI, BX
  2845. MOVB BL, (AX)
  2846. ADDQ $0x02, AX
  2847. JMP repeat_end_emit_encodeBlockAsm12B
  2848. two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
  2849. MOVL BX, DI
  2850. SHLL $0x02, DI
  2851. CMPL BX, $0x0c
  2852. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
  2853. CMPL SI, $0x00000800
  2854. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
  2855. LEAL -15(DI), DI
  2856. MOVB SI, 1(AX)
  2857. SHRL $0x08, SI
  2858. SHLL $0x05, SI
  2859. ORL SI, DI
  2860. MOVB DI, (AX)
  2861. ADDQ $0x02, AX
  2862. JMP repeat_end_emit_encodeBlockAsm12B
  2863. emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
  2864. LEAL -2(DI), DI
  2865. MOVB DI, (AX)
  2866. MOVW SI, 1(AX)
  2867. ADDQ $0x03, AX
  2868. repeat_end_emit_encodeBlockAsm12B:
  2869. MOVL CX, 12(SP)
  2870. JMP search_loop_encodeBlockAsm12B
  2871. no_repeat_found_encodeBlockAsm12B:
  2872. CMPL (DX)(BX*1), SI
  2873. JEQ candidate_match_encodeBlockAsm12B
  2874. SHRQ $0x08, SI
  2875. MOVL 24(SP)(R9*4), BX
  2876. LEAL 2(CX), R8
  2877. CMPL (DX)(DI*1), SI
  2878. JEQ candidate2_match_encodeBlockAsm12B
  2879. MOVL R8, 24(SP)(R9*4)
  2880. SHRQ $0x08, SI
  2881. CMPL (DX)(BX*1), SI
  2882. JEQ candidate3_match_encodeBlockAsm12B
  2883. MOVL 20(SP), CX
  2884. JMP search_loop_encodeBlockAsm12B
  2885. candidate3_match_encodeBlockAsm12B:
  2886. ADDL $0x02, CX
  2887. JMP candidate_match_encodeBlockAsm12B
  2888. candidate2_match_encodeBlockAsm12B:
  2889. MOVL R8, 24(SP)(R9*4)
  2890. INCL CX
  2891. MOVL DI, BX
  2892. candidate_match_encodeBlockAsm12B:
  2893. MOVL 12(SP), SI
  2894. TESTL BX, BX
  2895. JZ match_extend_back_end_encodeBlockAsm12B
  2896. match_extend_back_loop_encodeBlockAsm12B:
  2897. CMPL CX, SI
  2898. JBE match_extend_back_end_encodeBlockAsm12B
  2899. MOVB -1(DX)(BX*1), DI
  2900. MOVB -1(DX)(CX*1), R8
  2901. CMPB DI, R8
  2902. JNE match_extend_back_end_encodeBlockAsm12B
  2903. LEAL -1(CX), CX
  2904. DECL BX
  2905. JZ match_extend_back_end_encodeBlockAsm12B
  2906. JMP match_extend_back_loop_encodeBlockAsm12B
  2907. match_extend_back_end_encodeBlockAsm12B:
  2908. MOVL CX, SI
  2909. SUBL 12(SP), SI
  2910. LEAQ 3(AX)(SI*1), SI
  2911. CMPQ SI, (SP)
  2912. JB match_dst_size_check_encodeBlockAsm12B
  2913. MOVQ $0x00000000, ret+48(FP)
  2914. RET
  2915. match_dst_size_check_encodeBlockAsm12B:
  2916. MOVL CX, SI
  2917. MOVL 12(SP), DI
  2918. CMPL DI, SI
  2919. JEQ emit_literal_done_match_emit_encodeBlockAsm12B
  2920. MOVL SI, R8
  2921. MOVL SI, 12(SP)
  2922. LEAQ (DX)(DI*1), SI
  2923. SUBL DI, R8
  2924. LEAL -1(R8), DI
  2925. CMPL DI, $0x3c
  2926. JB one_byte_match_emit_encodeBlockAsm12B
  2927. CMPL DI, $0x00000100
  2928. JB two_bytes_match_emit_encodeBlockAsm12B
  2929. JB three_bytes_match_emit_encodeBlockAsm12B
  2930. three_bytes_match_emit_encodeBlockAsm12B:
  2931. MOVB $0xf4, (AX)
  2932. MOVW DI, 1(AX)
  2933. ADDQ $0x03, AX
  2934. JMP memmove_long_match_emit_encodeBlockAsm12B
  2935. two_bytes_match_emit_encodeBlockAsm12B:
  2936. MOVB $0xf0, (AX)
  2937. MOVB DI, 1(AX)
  2938. ADDQ $0x02, AX
  2939. CMPL DI, $0x40
  2940. JB memmove_match_emit_encodeBlockAsm12B
  2941. JMP memmove_long_match_emit_encodeBlockAsm12B
  2942. one_byte_match_emit_encodeBlockAsm12B:
  2943. SHLB $0x02, DI
  2944. MOVB DI, (AX)
  2945. ADDQ $0x01, AX
  2946. memmove_match_emit_encodeBlockAsm12B:
  2947. LEAQ (AX)(R8*1), DI
  2948. // genMemMoveShort
  2949. CMPQ R8, $0x08
  2950. JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
  2951. CMPQ R8, $0x10
  2952. JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
  2953. CMPQ R8, $0x20
  2954. JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
  2955. JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
  2956. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
  2957. MOVQ (SI), R9
  2958. MOVQ R9, (AX)
  2959. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2960. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
  2961. MOVQ (SI), R9
  2962. MOVQ -8(SI)(R8*1), SI
  2963. MOVQ R9, (AX)
  2964. MOVQ SI, -8(AX)(R8*1)
  2965. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2966. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
  2967. MOVOU (SI), X0
  2968. MOVOU -16(SI)(R8*1), X1
  2969. MOVOU X0, (AX)
  2970. MOVOU X1, -16(AX)(R8*1)
  2971. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2972. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
  2973. MOVOU (SI), X0
  2974. MOVOU 16(SI), X1
  2975. MOVOU -32(SI)(R8*1), X2
  2976. MOVOU -16(SI)(R8*1), X3
  2977. MOVOU X0, (AX)
  2978. MOVOU X1, 16(AX)
  2979. MOVOU X2, -32(AX)(R8*1)
  2980. MOVOU X3, -16(AX)(R8*1)
  2981. memmove_end_copy_match_emit_encodeBlockAsm12B:
  2982. MOVQ DI, AX
  2983. JMP emit_literal_done_match_emit_encodeBlockAsm12B
  2984. memmove_long_match_emit_encodeBlockAsm12B:
  2985. LEAQ (AX)(R8*1), DI
  2986. // genMemMoveLong
  2987. MOVOU (SI), X0
  2988. MOVOU 16(SI), X1
  2989. MOVOU -32(SI)(R8*1), X2
  2990. MOVOU -16(SI)(R8*1), X3
  2991. MOVQ R8, R10
  2992. SHRQ $0x05, R10
  2993. MOVQ AX, R9
  2994. ANDL $0x0000001f, R9
  2995. MOVQ $0x00000040, R11
  2996. SUBQ R9, R11
  2997. DECQ R10
  2998. JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2999. LEAQ -32(SI)(R11*1), R9
  3000. LEAQ -32(AX)(R11*1), R12
  3001. emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
  3002. MOVOU (R9), X4
  3003. MOVOU 16(R9), X5
  3004. MOVOA X4, (R12)
  3005. MOVOA X5, 16(R12)
  3006. ADDQ $0x20, R12
  3007. ADDQ $0x20, R9
  3008. ADDQ $0x20, R11
  3009. DECQ R10
  3010. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
  3011. emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
  3012. MOVOU -32(SI)(R11*1), X4
  3013. MOVOU -16(SI)(R11*1), X5
  3014. MOVOA X4, -32(AX)(R11*1)
  3015. MOVOA X5, -16(AX)(R11*1)
  3016. ADDQ $0x20, R11
  3017. CMPQ R8, R11
  3018. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  3019. MOVOU X0, (AX)
  3020. MOVOU X1, 16(AX)
  3021. MOVOU X2, -32(AX)(R8*1)
  3022. MOVOU X3, -16(AX)(R8*1)
  3023. MOVQ DI, AX
  3024. emit_literal_done_match_emit_encodeBlockAsm12B:
  3025. match_nolit_loop_encodeBlockAsm12B:
  3026. MOVL CX, SI
  3027. SUBL BX, SI
  3028. MOVL SI, 16(SP)
  3029. ADDL $0x04, CX
  3030. ADDL $0x04, BX
  3031. MOVQ src_len+32(FP), SI
  3032. SUBL CX, SI
  3033. LEAQ (DX)(CX*1), DI
  3034. LEAQ (DX)(BX*1), BX
  3035. // matchLen
  3036. XORL R9, R9
  3037. matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
  3038. CMPL SI, $0x10
  3039. JB matchlen_match8_match_nolit_encodeBlockAsm12B
  3040. MOVQ (DI)(R9*1), R8
  3041. MOVQ 8(DI)(R9*1), R10
  3042. XORQ (BX)(R9*1), R8
  3043. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
  3044. XORQ 8(BX)(R9*1), R10
  3045. JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B
  3046. LEAL -16(SI), SI
  3047. LEAL 16(R9), R9
  3048. JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B
  3049. matchlen_bsf_16match_nolit_encodeBlockAsm12B:
  3050. #ifdef GOAMD64_v3
  3051. TZCNTQ R10, R10
  3052. #else
  3053. BSFQ R10, R10
  3054. #endif
  3055. SARQ $0x03, R10
  3056. LEAL 8(R9)(R10*1), R9
  3057. JMP match_nolit_end_encodeBlockAsm12B
  3058. matchlen_match8_match_nolit_encodeBlockAsm12B:
  3059. CMPL SI, $0x08
  3060. JB matchlen_match4_match_nolit_encodeBlockAsm12B
  3061. MOVQ (DI)(R9*1), R8
  3062. XORQ (BX)(R9*1), R8
  3063. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
  3064. LEAL -8(SI), SI
  3065. LEAL 8(R9), R9
  3066. JMP matchlen_match4_match_nolit_encodeBlockAsm12B
  3067. matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
  3068. #ifdef GOAMD64_v3
  3069. TZCNTQ R8, R8
  3070. #else
  3071. BSFQ R8, R8
  3072. #endif
  3073. SARQ $0x03, R8
  3074. LEAL (R9)(R8*1), R9
  3075. JMP match_nolit_end_encodeBlockAsm12B
  3076. matchlen_match4_match_nolit_encodeBlockAsm12B:
  3077. CMPL SI, $0x04
  3078. JB matchlen_match2_match_nolit_encodeBlockAsm12B
  3079. MOVL (DI)(R9*1), R8
  3080. CMPL (BX)(R9*1), R8
  3081. JNE matchlen_match2_match_nolit_encodeBlockAsm12B
  3082. LEAL -4(SI), SI
  3083. LEAL 4(R9), R9
  3084. matchlen_match2_match_nolit_encodeBlockAsm12B:
  3085. CMPL SI, $0x01
  3086. JE matchlen_match1_match_nolit_encodeBlockAsm12B
  3087. JB match_nolit_end_encodeBlockAsm12B
  3088. MOVW (DI)(R9*1), R8
  3089. CMPW (BX)(R9*1), R8
  3090. JNE matchlen_match1_match_nolit_encodeBlockAsm12B
  3091. LEAL 2(R9), R9
  3092. SUBL $0x02, SI
  3093. JZ match_nolit_end_encodeBlockAsm12B
  3094. matchlen_match1_match_nolit_encodeBlockAsm12B:
  3095. MOVB (DI)(R9*1), R8
  3096. CMPB (BX)(R9*1), R8
  3097. JNE match_nolit_end_encodeBlockAsm12B
  3098. LEAL 1(R9), R9
  3099. match_nolit_end_encodeBlockAsm12B:
  3100. ADDL R9, CX
  3101. MOVL 16(SP), BX
  3102. ADDL $0x04, R9
  3103. MOVL CX, 12(SP)
  3104. // emitCopy
  3105. CMPL R9, $0x40
  3106. JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B
  3107. CMPL BX, $0x00000800
  3108. JAE long_offset_short_match_nolit_encodeBlockAsm12B
  3109. MOVL $0x00000001, SI
  3110. LEAL 16(SI), SI
  3111. MOVB BL, 1(AX)
  3112. SHRL $0x08, BX
  3113. SHLL $0x05, BX
  3114. ORL BX, SI
  3115. MOVB SI, (AX)
  3116. ADDQ $0x02, AX
  3117. SUBL $0x08, R9
  3118. // emitRepeat
  3119. LEAL -4(R9), R9
  3120. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3121. MOVL R9, SI
  3122. LEAL -4(R9), R9
  3123. CMPL SI, $0x08
  3124. JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3125. CMPL SI, $0x0c
  3126. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3127. CMPL BX, $0x00000800
  3128. JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3129. cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3130. CMPL R9, $0x00000104
  3131. JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3132. LEAL -256(R9), R9
  3133. MOVW $0x0019, (AX)
  3134. MOVW R9, 2(AX)
  3135. ADDQ $0x04, AX
  3136. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3137. repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3138. LEAL -4(R9), R9
  3139. MOVW $0x0015, (AX)
  3140. MOVB R9, 2(AX)
  3141. ADDQ $0x03, AX
  3142. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3143. repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3144. SHLL $0x02, R9
  3145. ORL $0x01, R9
  3146. MOVW R9, (AX)
  3147. ADDQ $0x02, AX
  3148. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3149. repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3150. XORQ SI, SI
  3151. LEAL 1(SI)(R9*4), R9
  3152. MOVB BL, 1(AX)
  3153. SARL $0x08, BX
  3154. SHLL $0x05, BX
  3155. ORL BX, R9
  3156. MOVB R9, (AX)
  3157. ADDQ $0x02, AX
  3158. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3159. long_offset_short_match_nolit_encodeBlockAsm12B:
  3160. MOVB $0xee, (AX)
  3161. MOVW BX, 1(AX)
  3162. LEAL -60(R9), R9
  3163. ADDQ $0x03, AX
  3164. // emitRepeat
  3165. MOVL R9, SI
  3166. LEAL -4(R9), R9
  3167. CMPL SI, $0x08
  3168. JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
  3169. CMPL SI, $0x0c
  3170. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
  3171. CMPL BX, $0x00000800
  3172. JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
  3173. cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3174. CMPL R9, $0x00000104
  3175. JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
  3176. LEAL -256(R9), R9
  3177. MOVW $0x0019, (AX)
  3178. MOVW R9, 2(AX)
  3179. ADDQ $0x04, AX
  3180. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3181. repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3182. LEAL -4(R9), R9
  3183. MOVW $0x0015, (AX)
  3184. MOVB R9, 2(AX)
  3185. ADDQ $0x03, AX
  3186. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3187. repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3188. SHLL $0x02, R9
  3189. ORL $0x01, R9
  3190. MOVW R9, (AX)
  3191. ADDQ $0x02, AX
  3192. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3193. repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3194. XORQ SI, SI
  3195. LEAL 1(SI)(R9*4), R9
  3196. MOVB BL, 1(AX)
  3197. SARL $0x08, BX
  3198. SHLL $0x05, BX
  3199. ORL BX, R9
  3200. MOVB R9, (AX)
  3201. ADDQ $0x02, AX
  3202. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3203. two_byte_offset_short_match_nolit_encodeBlockAsm12B:
  3204. MOVL R9, SI
  3205. SHLL $0x02, SI
  3206. CMPL R9, $0x0c
  3207. JAE emit_copy_three_match_nolit_encodeBlockAsm12B
  3208. CMPL BX, $0x00000800
  3209. JAE emit_copy_three_match_nolit_encodeBlockAsm12B
  3210. LEAL -15(SI), SI
  3211. MOVB BL, 1(AX)
  3212. SHRL $0x08, BX
  3213. SHLL $0x05, BX
  3214. ORL BX, SI
  3215. MOVB SI, (AX)
  3216. ADDQ $0x02, AX
  3217. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3218. emit_copy_three_match_nolit_encodeBlockAsm12B:
  3219. LEAL -2(SI), SI
  3220. MOVB SI, (AX)
  3221. MOVW BX, 1(AX)
  3222. ADDQ $0x03, AX
  3223. match_nolit_emitcopy_end_encodeBlockAsm12B:
  3224. CMPL CX, 8(SP)
  3225. JAE emit_remainder_encodeBlockAsm12B
  3226. MOVQ -2(DX)(CX*1), SI
  3227. CMPQ AX, (SP)
  3228. JB match_nolit_dst_ok_encodeBlockAsm12B
  3229. MOVQ $0x00000000, ret+48(FP)
  3230. RET
  3231. match_nolit_dst_ok_encodeBlockAsm12B:
  3232. MOVQ $0x000000cf1bbcdcbb, R8
  3233. MOVQ SI, DI
  3234. SHRQ $0x10, SI
  3235. MOVQ SI, BX
  3236. SHLQ $0x18, DI
  3237. IMULQ R8, DI
  3238. SHRQ $0x34, DI
  3239. SHLQ $0x18, BX
  3240. IMULQ R8, BX
  3241. SHRQ $0x34, BX
  3242. LEAL -2(CX), R8
  3243. LEAQ 24(SP)(BX*4), R9
  3244. MOVL (R9), BX
  3245. MOVL R8, 24(SP)(DI*4)
  3246. MOVL CX, (R9)
  3247. CMPL (DX)(BX*1), SI
  3248. JEQ match_nolit_loop_encodeBlockAsm12B
  3249. INCL CX
  3250. JMP search_loop_encodeBlockAsm12B
  3251. emit_remainder_encodeBlockAsm12B:
  3252. MOVQ src_len+32(FP), CX
  3253. SUBL 12(SP), CX
  3254. LEAQ 3(AX)(CX*1), CX
  3255. CMPQ CX, (SP)
  3256. JB emit_remainder_ok_encodeBlockAsm12B
  3257. MOVQ $0x00000000, ret+48(FP)
  3258. RET
  3259. emit_remainder_ok_encodeBlockAsm12B:
  3260. MOVQ src_len+32(FP), CX
  3261. MOVL 12(SP), BX
  3262. CMPL BX, CX
  3263. JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
  3264. MOVL CX, SI
  3265. MOVL CX, 12(SP)
  3266. LEAQ (DX)(BX*1), CX
  3267. SUBL BX, SI
  3268. LEAL -1(SI), DX
  3269. CMPL DX, $0x3c
  3270. JB one_byte_emit_remainder_encodeBlockAsm12B
  3271. CMPL DX, $0x00000100
  3272. JB two_bytes_emit_remainder_encodeBlockAsm12B
  3273. JB three_bytes_emit_remainder_encodeBlockAsm12B
  3274. three_bytes_emit_remainder_encodeBlockAsm12B:
  3275. MOVB $0xf4, (AX)
  3276. MOVW DX, 1(AX)
  3277. ADDQ $0x03, AX
  3278. JMP memmove_long_emit_remainder_encodeBlockAsm12B
  3279. two_bytes_emit_remainder_encodeBlockAsm12B:
  3280. MOVB $0xf0, (AX)
  3281. MOVB DL, 1(AX)
  3282. ADDQ $0x02, AX
  3283. CMPL DX, $0x40
  3284. JB memmove_emit_remainder_encodeBlockAsm12B
  3285. JMP memmove_long_emit_remainder_encodeBlockAsm12B
  3286. one_byte_emit_remainder_encodeBlockAsm12B:
  3287. SHLB $0x02, DL
  3288. MOVB DL, (AX)
  3289. ADDQ $0x01, AX
  3290. memmove_emit_remainder_encodeBlockAsm12B:
  3291. LEAQ (AX)(SI*1), DX
  3292. MOVL SI, BX
  3293. // genMemMoveShort
  3294. CMPQ BX, $0x03
  3295. JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
  3296. JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
  3297. CMPQ BX, $0x08
  3298. JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
  3299. CMPQ BX, $0x10
  3300. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
  3301. CMPQ BX, $0x20
  3302. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
  3303. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
  3304. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
  3305. MOVB (CX), SI
  3306. MOVB -1(CX)(BX*1), CL
  3307. MOVB SI, (AX)
  3308. MOVB CL, -1(AX)(BX*1)
  3309. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3310. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
  3311. MOVW (CX), SI
  3312. MOVB 2(CX), CL
  3313. MOVW SI, (AX)
  3314. MOVB CL, 2(AX)
  3315. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3316. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
  3317. MOVL (CX), SI
  3318. MOVL -4(CX)(BX*1), CX
  3319. MOVL SI, (AX)
  3320. MOVL CX, -4(AX)(BX*1)
  3321. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3322. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
  3323. MOVQ (CX), SI
  3324. MOVQ -8(CX)(BX*1), CX
  3325. MOVQ SI, (AX)
  3326. MOVQ CX, -8(AX)(BX*1)
  3327. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3328. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
  3329. MOVOU (CX), X0
  3330. MOVOU -16(CX)(BX*1), X1
  3331. MOVOU X0, (AX)
  3332. MOVOU X1, -16(AX)(BX*1)
  3333. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3334. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
  3335. MOVOU (CX), X0
  3336. MOVOU 16(CX), X1
  3337. MOVOU -32(CX)(BX*1), X2
  3338. MOVOU -16(CX)(BX*1), X3
  3339. MOVOU X0, (AX)
  3340. MOVOU X1, 16(AX)
  3341. MOVOU X2, -32(AX)(BX*1)
  3342. MOVOU X3, -16(AX)(BX*1)
  3343. memmove_end_copy_emit_remainder_encodeBlockAsm12B:
  3344. MOVQ DX, AX
  3345. JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
  3346. memmove_long_emit_remainder_encodeBlockAsm12B:
  3347. LEAQ (AX)(SI*1), DX
  3348. MOVL SI, BX
  3349. // genMemMoveLong
  3350. MOVOU (CX), X0
  3351. MOVOU 16(CX), X1
  3352. MOVOU -32(CX)(BX*1), X2
  3353. MOVOU -16(CX)(BX*1), X3
  3354. MOVQ BX, DI
  3355. SHRQ $0x05, DI
  3356. MOVQ AX, SI
  3357. ANDL $0x0000001f, SI
  3358. MOVQ $0x00000040, R8
  3359. SUBQ SI, R8
  3360. DECQ DI
  3361. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
  3362. LEAQ -32(CX)(R8*1), SI
  3363. LEAQ -32(AX)(R8*1), R9
  3364. emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
  3365. MOVOU (SI), X4
  3366. MOVOU 16(SI), X5
  3367. MOVOA X4, (R9)
  3368. MOVOA X5, 16(R9)
  3369. ADDQ $0x20, R9
  3370. ADDQ $0x20, SI
  3371. ADDQ $0x20, R8
  3372. DECQ DI
  3373. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
  3374. emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
  3375. MOVOU -32(CX)(R8*1), X4
  3376. MOVOU -16(CX)(R8*1), X5
  3377. MOVOA X4, -32(AX)(R8*1)
  3378. MOVOA X5, -16(AX)(R8*1)
  3379. ADDQ $0x20, R8
  3380. CMPQ BX, R8
  3381. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
  3382. MOVOU X0, (AX)
  3383. MOVOU X1, 16(AX)
  3384. MOVOU X2, -32(AX)(BX*1)
  3385. MOVOU X3, -16(AX)(BX*1)
  3386. MOVQ DX, AX
  3387. emit_literal_done_emit_remainder_encodeBlockAsm12B:
  3388. MOVQ dst_base+0(FP), CX
  3389. SUBQ CX, AX
  3390. MOVQ AX, ret+48(FP)
  3391. RET
  3392. // func encodeBlockAsm10B(dst []byte, src []byte) int
  3393. // Requires: BMI, SSE2
  3394. TEXT ·encodeBlockAsm10B(SB), $4120-56
  3395. MOVQ dst_base+0(FP), AX
  3396. MOVQ $0x00000020, CX
  3397. LEAQ 24(SP), DX
  3398. PXOR X0, X0
  3399. zero_loop_encodeBlockAsm10B:
  3400. MOVOU X0, (DX)
  3401. MOVOU X0, 16(DX)
  3402. MOVOU X0, 32(DX)
  3403. MOVOU X0, 48(DX)
  3404. MOVOU X0, 64(DX)
  3405. MOVOU X0, 80(DX)
  3406. MOVOU X0, 96(DX)
  3407. MOVOU X0, 112(DX)
  3408. ADDQ $0x80, DX
  3409. DECQ CX
  3410. JNZ zero_loop_encodeBlockAsm10B
  3411. MOVL $0x00000000, 12(SP)
  3412. MOVQ src_len+32(FP), CX
  3413. LEAQ -9(CX), DX
  3414. LEAQ -8(CX), BX
  3415. MOVL BX, 8(SP)
  3416. SHRQ $0x05, CX
  3417. SUBL CX, DX
  3418. LEAQ (AX)(DX*1), DX
  3419. MOVQ DX, (SP)
  3420. MOVL $0x00000001, CX
  3421. MOVL CX, 16(SP)
  3422. MOVQ src_base+24(FP), DX
  3423. search_loop_encodeBlockAsm10B:
  3424. MOVL CX, BX
  3425. SUBL 12(SP), BX
  3426. SHRL $0x05, BX
  3427. LEAL 4(CX)(BX*1), BX
  3428. CMPL BX, 8(SP)
  3429. JAE emit_remainder_encodeBlockAsm10B
  3430. MOVQ (DX)(CX*1), SI
  3431. MOVL BX, 20(SP)
  3432. MOVQ $0x9e3779b1, R8
  3433. MOVQ SI, R9
  3434. MOVQ SI, R10
  3435. SHRQ $0x08, R10
  3436. SHLQ $0x20, R9
  3437. IMULQ R8, R9
  3438. SHRQ $0x36, R9
  3439. SHLQ $0x20, R10
  3440. IMULQ R8, R10
  3441. SHRQ $0x36, R10
  3442. MOVL 24(SP)(R9*4), BX
  3443. MOVL 24(SP)(R10*4), DI
  3444. MOVL CX, 24(SP)(R9*4)
  3445. LEAL 1(CX), R9
  3446. MOVL R9, 24(SP)(R10*4)
  3447. MOVQ SI, R9
  3448. SHRQ $0x10, R9
  3449. SHLQ $0x20, R9
  3450. IMULQ R8, R9
  3451. SHRQ $0x36, R9
  3452. MOVL CX, R8
  3453. SUBL 16(SP), R8
  3454. MOVL 1(DX)(R8*1), R10
  3455. MOVQ SI, R8
  3456. SHRQ $0x08, R8
  3457. CMPL R8, R10
  3458. JNE no_repeat_found_encodeBlockAsm10B
  3459. LEAL 1(CX), SI
  3460. MOVL 12(SP), DI
  3461. MOVL SI, BX
  3462. SUBL 16(SP), BX
  3463. JZ repeat_extend_back_end_encodeBlockAsm10B
  3464. repeat_extend_back_loop_encodeBlockAsm10B:
  3465. CMPL SI, DI
  3466. JBE repeat_extend_back_end_encodeBlockAsm10B
  3467. MOVB -1(DX)(BX*1), R8
  3468. MOVB -1(DX)(SI*1), R9
  3469. CMPB R8, R9
  3470. JNE repeat_extend_back_end_encodeBlockAsm10B
  3471. LEAL -1(SI), SI
  3472. DECL BX
  3473. JNZ repeat_extend_back_loop_encodeBlockAsm10B
  3474. repeat_extend_back_end_encodeBlockAsm10B:
  3475. MOVL 12(SP), BX
  3476. CMPL BX, SI
  3477. JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
  3478. MOVL SI, R8
  3479. MOVL SI, 12(SP)
  3480. LEAQ (DX)(BX*1), R9
  3481. SUBL BX, R8
  3482. LEAL -1(R8), BX
  3483. CMPL BX, $0x3c
  3484. JB one_byte_repeat_emit_encodeBlockAsm10B
  3485. CMPL BX, $0x00000100
  3486. JB two_bytes_repeat_emit_encodeBlockAsm10B
  3487. JB three_bytes_repeat_emit_encodeBlockAsm10B
  3488. three_bytes_repeat_emit_encodeBlockAsm10B:
  3489. MOVB $0xf4, (AX)
  3490. MOVW BX, 1(AX)
  3491. ADDQ $0x03, AX
  3492. JMP memmove_long_repeat_emit_encodeBlockAsm10B
  3493. two_bytes_repeat_emit_encodeBlockAsm10B:
  3494. MOVB $0xf0, (AX)
  3495. MOVB BL, 1(AX)
  3496. ADDQ $0x02, AX
  3497. CMPL BX, $0x40
  3498. JB memmove_repeat_emit_encodeBlockAsm10B
  3499. JMP memmove_long_repeat_emit_encodeBlockAsm10B
  3500. one_byte_repeat_emit_encodeBlockAsm10B:
  3501. SHLB $0x02, BL
  3502. MOVB BL, (AX)
  3503. ADDQ $0x01, AX
  3504. memmove_repeat_emit_encodeBlockAsm10B:
  3505. LEAQ (AX)(R8*1), BX
  3506. // genMemMoveShort
  3507. CMPQ R8, $0x08
  3508. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
  3509. CMPQ R8, $0x10
  3510. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
  3511. CMPQ R8, $0x20
  3512. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
  3513. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
  3514. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
  3515. MOVQ (R9), R10
  3516. MOVQ R10, (AX)
  3517. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3518. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
  3519. MOVQ (R9), R10
  3520. MOVQ -8(R9)(R8*1), R9
  3521. MOVQ R10, (AX)
  3522. MOVQ R9, -8(AX)(R8*1)
  3523. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3524. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
  3525. MOVOU (R9), X0
  3526. MOVOU -16(R9)(R8*1), X1
  3527. MOVOU X0, (AX)
  3528. MOVOU X1, -16(AX)(R8*1)
  3529. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3530. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
  3531. MOVOU (R9), X0
  3532. MOVOU 16(R9), X1
  3533. MOVOU -32(R9)(R8*1), X2
  3534. MOVOU -16(R9)(R8*1), X3
  3535. MOVOU X0, (AX)
  3536. MOVOU X1, 16(AX)
  3537. MOVOU X2, -32(AX)(R8*1)
  3538. MOVOU X3, -16(AX)(R8*1)
  3539. memmove_end_copy_repeat_emit_encodeBlockAsm10B:
  3540. MOVQ BX, AX
  3541. JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
  3542. memmove_long_repeat_emit_encodeBlockAsm10B:
  3543. LEAQ (AX)(R8*1), BX
  3544. // genMemMoveLong
  3545. MOVOU (R9), X0
  3546. MOVOU 16(R9), X1
  3547. MOVOU -32(R9)(R8*1), X2
  3548. MOVOU -16(R9)(R8*1), X3
  3549. MOVQ R8, R11
  3550. SHRQ $0x05, R11
  3551. MOVQ AX, R10
  3552. ANDL $0x0000001f, R10
  3553. MOVQ $0x00000040, R12
  3554. SUBQ R10, R12
  3555. DECQ R11
  3556. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3557. LEAQ -32(R9)(R12*1), R10
  3558. LEAQ -32(AX)(R12*1), R13
  3559. emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
  3560. MOVOU (R10), X4
  3561. MOVOU 16(R10), X5
  3562. MOVOA X4, (R13)
  3563. MOVOA X5, 16(R13)
  3564. ADDQ $0x20, R13
  3565. ADDQ $0x20, R10
  3566. ADDQ $0x20, R12
  3567. DECQ R11
  3568. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
  3569. emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
  3570. MOVOU -32(R9)(R12*1), X4
  3571. MOVOU -16(R9)(R12*1), X5
  3572. MOVOA X4, -32(AX)(R12*1)
  3573. MOVOA X5, -16(AX)(R12*1)
  3574. ADDQ $0x20, R12
  3575. CMPQ R8, R12
  3576. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3577. MOVOU X0, (AX)
  3578. MOVOU X1, 16(AX)
  3579. MOVOU X2, -32(AX)(R8*1)
  3580. MOVOU X3, -16(AX)(R8*1)
  3581. MOVQ BX, AX
  3582. emit_literal_done_repeat_emit_encodeBlockAsm10B:
  3583. ADDL $0x05, CX
  3584. MOVL CX, BX
  3585. SUBL 16(SP), BX
  3586. MOVQ src_len+32(FP), R8
  3587. SUBL CX, R8
  3588. LEAQ (DX)(CX*1), R9
  3589. LEAQ (DX)(BX*1), BX
  3590. // matchLen
  3591. XORL R11, R11
  3592. matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
  3593. CMPL R8, $0x10
  3594. JB matchlen_match8_repeat_extend_encodeBlockAsm10B
  3595. MOVQ (R9)(R11*1), R10
  3596. MOVQ 8(R9)(R11*1), R12
  3597. XORQ (BX)(R11*1), R10
  3598. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
  3599. XORQ 8(BX)(R11*1), R12
  3600. JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B
  3601. LEAL -16(R8), R8
  3602. LEAL 16(R11), R11
  3603. JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B
  3604. matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
  3605. #ifdef GOAMD64_v3
  3606. TZCNTQ R12, R12
  3607. #else
  3608. BSFQ R12, R12
  3609. #endif
  3610. SARQ $0x03, R12
  3611. LEAL 8(R11)(R12*1), R11
  3612. JMP repeat_extend_forward_end_encodeBlockAsm10B
  3613. matchlen_match8_repeat_extend_encodeBlockAsm10B:
  3614. CMPL R8, $0x08
  3615. JB matchlen_match4_repeat_extend_encodeBlockAsm10B
  3616. MOVQ (R9)(R11*1), R10
  3617. XORQ (BX)(R11*1), R10
  3618. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
  3619. LEAL -8(R8), R8
  3620. LEAL 8(R11), R11
  3621. JMP matchlen_match4_repeat_extend_encodeBlockAsm10B
  3622. matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
  3623. #ifdef GOAMD64_v3
  3624. TZCNTQ R10, R10
  3625. #else
  3626. BSFQ R10, R10
  3627. #endif
  3628. SARQ $0x03, R10
  3629. LEAL (R11)(R10*1), R11
  3630. JMP repeat_extend_forward_end_encodeBlockAsm10B
  3631. matchlen_match4_repeat_extend_encodeBlockAsm10B:
  3632. CMPL R8, $0x04
  3633. JB matchlen_match2_repeat_extend_encodeBlockAsm10B
  3634. MOVL (R9)(R11*1), R10
  3635. CMPL (BX)(R11*1), R10
  3636. JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
  3637. LEAL -4(R8), R8
  3638. LEAL 4(R11), R11
  3639. matchlen_match2_repeat_extend_encodeBlockAsm10B:
  3640. CMPL R8, $0x01
  3641. JE matchlen_match1_repeat_extend_encodeBlockAsm10B
  3642. JB repeat_extend_forward_end_encodeBlockAsm10B
  3643. MOVW (R9)(R11*1), R10
  3644. CMPW (BX)(R11*1), R10
  3645. JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
  3646. LEAL 2(R11), R11
  3647. SUBL $0x02, R8
  3648. JZ repeat_extend_forward_end_encodeBlockAsm10B
  3649. matchlen_match1_repeat_extend_encodeBlockAsm10B:
  3650. MOVB (R9)(R11*1), R10
  3651. CMPB (BX)(R11*1), R10
  3652. JNE repeat_extend_forward_end_encodeBlockAsm10B
  3653. LEAL 1(R11), R11
  3654. repeat_extend_forward_end_encodeBlockAsm10B:
  3655. ADDL R11, CX
  3656. MOVL CX, BX
  3657. SUBL SI, BX
  3658. MOVL 16(SP), SI
  3659. TESTL DI, DI
  3660. JZ repeat_as_copy_encodeBlockAsm10B
  3661. // emitRepeat
  3662. MOVL BX, DI
  3663. LEAL -4(BX), BX
  3664. CMPL DI, $0x08
  3665. JBE repeat_two_match_repeat_encodeBlockAsm10B
  3666. CMPL DI, $0x0c
  3667. JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
  3668. CMPL SI, $0x00000800
  3669. JB repeat_two_offset_match_repeat_encodeBlockAsm10B
  3670. cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
  3671. CMPL BX, $0x00000104
  3672. JB repeat_three_match_repeat_encodeBlockAsm10B
  3673. LEAL -256(BX), BX
  3674. MOVW $0x0019, (AX)
  3675. MOVW BX, 2(AX)
  3676. ADDQ $0x04, AX
  3677. JMP repeat_end_emit_encodeBlockAsm10B
  3678. repeat_three_match_repeat_encodeBlockAsm10B:
  3679. LEAL -4(BX), BX
  3680. MOVW $0x0015, (AX)
  3681. MOVB BL, 2(AX)
  3682. ADDQ $0x03, AX
  3683. JMP repeat_end_emit_encodeBlockAsm10B
  3684. repeat_two_match_repeat_encodeBlockAsm10B:
  3685. SHLL $0x02, BX
  3686. ORL $0x01, BX
  3687. MOVW BX, (AX)
  3688. ADDQ $0x02, AX
  3689. JMP repeat_end_emit_encodeBlockAsm10B
  3690. repeat_two_offset_match_repeat_encodeBlockAsm10B:
  3691. XORQ DI, DI
  3692. LEAL 1(DI)(BX*4), BX
  3693. MOVB SI, 1(AX)
  3694. SARL $0x08, SI
  3695. SHLL $0x05, SI
  3696. ORL SI, BX
  3697. MOVB BL, (AX)
  3698. ADDQ $0x02, AX
  3699. JMP repeat_end_emit_encodeBlockAsm10B
  3700. repeat_as_copy_encodeBlockAsm10B:
  3701. // emitCopy
  3702. CMPL BX, $0x40
  3703. JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
  3704. CMPL SI, $0x00000800
  3705. JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
  3706. MOVL $0x00000001, DI
  3707. LEAL 16(DI), DI
  3708. MOVB SI, 1(AX)
  3709. SHRL $0x08, SI
  3710. SHLL $0x05, SI
  3711. ORL SI, DI
  3712. MOVB DI, (AX)
  3713. ADDQ $0x02, AX
  3714. SUBL $0x08, BX
  3715. // emitRepeat
  3716. LEAL -4(BX), BX
  3717. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3718. MOVL BX, DI
  3719. LEAL -4(BX), BX
  3720. CMPL DI, $0x08
  3721. JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3722. CMPL DI, $0x0c
  3723. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3724. CMPL SI, $0x00000800
  3725. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3726. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3727. CMPL BX, $0x00000104
  3728. JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3729. LEAL -256(BX), BX
  3730. MOVW $0x0019, (AX)
  3731. MOVW BX, 2(AX)
  3732. ADDQ $0x04, AX
  3733. JMP repeat_end_emit_encodeBlockAsm10B
  3734. repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3735. LEAL -4(BX), BX
  3736. MOVW $0x0015, (AX)
  3737. MOVB BL, 2(AX)
  3738. ADDQ $0x03, AX
  3739. JMP repeat_end_emit_encodeBlockAsm10B
  3740. repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3741. SHLL $0x02, BX
  3742. ORL $0x01, BX
  3743. MOVW BX, (AX)
  3744. ADDQ $0x02, AX
  3745. JMP repeat_end_emit_encodeBlockAsm10B
  3746. repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3747. XORQ DI, DI
  3748. LEAL 1(DI)(BX*4), BX
  3749. MOVB SI, 1(AX)
  3750. SARL $0x08, SI
  3751. SHLL $0x05, SI
  3752. ORL SI, BX
  3753. MOVB BL, (AX)
  3754. ADDQ $0x02, AX
  3755. JMP repeat_end_emit_encodeBlockAsm10B
  3756. long_offset_short_repeat_as_copy_encodeBlockAsm10B:
  3757. MOVB $0xee, (AX)
  3758. MOVW SI, 1(AX)
  3759. LEAL -60(BX), BX
  3760. ADDQ $0x03, AX
  3761. // emitRepeat
  3762. MOVL BX, DI
  3763. LEAL -4(BX), BX
  3764. CMPL DI, $0x08
  3765. JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3766. CMPL DI, $0x0c
  3767. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3768. CMPL SI, $0x00000800
  3769. JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3770. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3771. CMPL BX, $0x00000104
  3772. JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3773. LEAL -256(BX), BX
  3774. MOVW $0x0019, (AX)
  3775. MOVW BX, 2(AX)
  3776. ADDQ $0x04, AX
  3777. JMP repeat_end_emit_encodeBlockAsm10B
  3778. repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3779. LEAL -4(BX), BX
  3780. MOVW $0x0015, (AX)
  3781. MOVB BL, 2(AX)
  3782. ADDQ $0x03, AX
  3783. JMP repeat_end_emit_encodeBlockAsm10B
  3784. repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3785. SHLL $0x02, BX
  3786. ORL $0x01, BX
  3787. MOVW BX, (AX)
  3788. ADDQ $0x02, AX
  3789. JMP repeat_end_emit_encodeBlockAsm10B
  3790. repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3791. XORQ DI, DI
  3792. LEAL 1(DI)(BX*4), BX
  3793. MOVB SI, 1(AX)
  3794. SARL $0x08, SI
  3795. SHLL $0x05, SI
  3796. ORL SI, BX
  3797. MOVB BL, (AX)
  3798. ADDQ $0x02, AX
  3799. JMP repeat_end_emit_encodeBlockAsm10B
  3800. two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
  3801. MOVL BX, DI
  3802. SHLL $0x02, DI
  3803. CMPL BX, $0x0c
  3804. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
  3805. CMPL SI, $0x00000800
  3806. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
  3807. LEAL -15(DI), DI
  3808. MOVB SI, 1(AX)
  3809. SHRL $0x08, SI
  3810. SHLL $0x05, SI
  3811. ORL SI, DI
  3812. MOVB DI, (AX)
  3813. ADDQ $0x02, AX
  3814. JMP repeat_end_emit_encodeBlockAsm10B
  3815. emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
  3816. LEAL -2(DI), DI
  3817. MOVB DI, (AX)
  3818. MOVW SI, 1(AX)
  3819. ADDQ $0x03, AX
  3820. repeat_end_emit_encodeBlockAsm10B:
  3821. MOVL CX, 12(SP)
  3822. JMP search_loop_encodeBlockAsm10B
  3823. no_repeat_found_encodeBlockAsm10B:
  3824. CMPL (DX)(BX*1), SI
  3825. JEQ candidate_match_encodeBlockAsm10B
  3826. SHRQ $0x08, SI
  3827. MOVL 24(SP)(R9*4), BX
  3828. LEAL 2(CX), R8
  3829. CMPL (DX)(DI*1), SI
  3830. JEQ candidate2_match_encodeBlockAsm10B
  3831. MOVL R8, 24(SP)(R9*4)
  3832. SHRQ $0x08, SI
  3833. CMPL (DX)(BX*1), SI
  3834. JEQ candidate3_match_encodeBlockAsm10B
  3835. MOVL 20(SP), CX
  3836. JMP search_loop_encodeBlockAsm10B
  3837. candidate3_match_encodeBlockAsm10B:
  3838. ADDL $0x02, CX
  3839. JMP candidate_match_encodeBlockAsm10B
  3840. candidate2_match_encodeBlockAsm10B:
  3841. MOVL R8, 24(SP)(R9*4)
  3842. INCL CX
  3843. MOVL DI, BX
  3844. candidate_match_encodeBlockAsm10B:
  3845. MOVL 12(SP), SI
  3846. TESTL BX, BX
  3847. JZ match_extend_back_end_encodeBlockAsm10B
  3848. match_extend_back_loop_encodeBlockAsm10B:
  3849. CMPL CX, SI
  3850. JBE match_extend_back_end_encodeBlockAsm10B
  3851. MOVB -1(DX)(BX*1), DI
  3852. MOVB -1(DX)(CX*1), R8
  3853. CMPB DI, R8
  3854. JNE match_extend_back_end_encodeBlockAsm10B
  3855. LEAL -1(CX), CX
  3856. DECL BX
  3857. JZ match_extend_back_end_encodeBlockAsm10B
  3858. JMP match_extend_back_loop_encodeBlockAsm10B
  3859. match_extend_back_end_encodeBlockAsm10B:
  3860. MOVL CX, SI
  3861. SUBL 12(SP), SI
  3862. LEAQ 3(AX)(SI*1), SI
  3863. CMPQ SI, (SP)
  3864. JB match_dst_size_check_encodeBlockAsm10B
  3865. MOVQ $0x00000000, ret+48(FP)
  3866. RET
  3867. match_dst_size_check_encodeBlockAsm10B:
  3868. MOVL CX, SI
  3869. MOVL 12(SP), DI
  3870. CMPL DI, SI
  3871. JEQ emit_literal_done_match_emit_encodeBlockAsm10B
  3872. MOVL SI, R8
  3873. MOVL SI, 12(SP)
  3874. LEAQ (DX)(DI*1), SI
  3875. SUBL DI, R8
  3876. LEAL -1(R8), DI
  3877. CMPL DI, $0x3c
  3878. JB one_byte_match_emit_encodeBlockAsm10B
  3879. CMPL DI, $0x00000100
  3880. JB two_bytes_match_emit_encodeBlockAsm10B
  3881. JB three_bytes_match_emit_encodeBlockAsm10B
  3882. three_bytes_match_emit_encodeBlockAsm10B:
  3883. MOVB $0xf4, (AX)
  3884. MOVW DI, 1(AX)
  3885. ADDQ $0x03, AX
  3886. JMP memmove_long_match_emit_encodeBlockAsm10B
  3887. two_bytes_match_emit_encodeBlockAsm10B:
  3888. MOVB $0xf0, (AX)
  3889. MOVB DI, 1(AX)
  3890. ADDQ $0x02, AX
  3891. CMPL DI, $0x40
  3892. JB memmove_match_emit_encodeBlockAsm10B
  3893. JMP memmove_long_match_emit_encodeBlockAsm10B
  3894. one_byte_match_emit_encodeBlockAsm10B:
  3895. SHLB $0x02, DI
  3896. MOVB DI, (AX)
  3897. ADDQ $0x01, AX
  3898. memmove_match_emit_encodeBlockAsm10B:
  3899. LEAQ (AX)(R8*1), DI
  3900. // genMemMoveShort
  3901. CMPQ R8, $0x08
  3902. JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
  3903. CMPQ R8, $0x10
  3904. JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
  3905. CMPQ R8, $0x20
  3906. JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
  3907. JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
  3908. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
  3909. MOVQ (SI), R9
  3910. MOVQ R9, (AX)
  3911. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3912. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
  3913. MOVQ (SI), R9
  3914. MOVQ -8(SI)(R8*1), SI
  3915. MOVQ R9, (AX)
  3916. MOVQ SI, -8(AX)(R8*1)
  3917. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3918. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
  3919. MOVOU (SI), X0
  3920. MOVOU -16(SI)(R8*1), X1
  3921. MOVOU X0, (AX)
  3922. MOVOU X1, -16(AX)(R8*1)
  3923. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3924. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
  3925. MOVOU (SI), X0
  3926. MOVOU 16(SI), X1
  3927. MOVOU -32(SI)(R8*1), X2
  3928. MOVOU -16(SI)(R8*1), X3
  3929. MOVOU X0, (AX)
  3930. MOVOU X1, 16(AX)
  3931. MOVOU X2, -32(AX)(R8*1)
  3932. MOVOU X3, -16(AX)(R8*1)
  3933. memmove_end_copy_match_emit_encodeBlockAsm10B:
  3934. MOVQ DI, AX
  3935. JMP emit_literal_done_match_emit_encodeBlockAsm10B
  3936. memmove_long_match_emit_encodeBlockAsm10B:
  3937. LEAQ (AX)(R8*1), DI
  3938. // genMemMoveLong
  3939. MOVOU (SI), X0
  3940. MOVOU 16(SI), X1
  3941. MOVOU -32(SI)(R8*1), X2
  3942. MOVOU -16(SI)(R8*1), X3
  3943. MOVQ R8, R10
  3944. SHRQ $0x05, R10
  3945. MOVQ AX, R9
  3946. ANDL $0x0000001f, R9
  3947. MOVQ $0x00000040, R11
  3948. SUBQ R9, R11
  3949. DECQ R10
  3950. JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3951. LEAQ -32(SI)(R11*1), R9
  3952. LEAQ -32(AX)(R11*1), R12
  3953. emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
  3954. MOVOU (R9), X4
  3955. MOVOU 16(R9), X5
  3956. MOVOA X4, (R12)
  3957. MOVOA X5, 16(R12)
  3958. ADDQ $0x20, R12
  3959. ADDQ $0x20, R9
  3960. ADDQ $0x20, R11
  3961. DECQ R10
  3962. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
  3963. emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
  3964. MOVOU -32(SI)(R11*1), X4
  3965. MOVOU -16(SI)(R11*1), X5
  3966. MOVOA X4, -32(AX)(R11*1)
  3967. MOVOA X5, -16(AX)(R11*1)
  3968. ADDQ $0x20, R11
  3969. CMPQ R8, R11
  3970. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3971. MOVOU X0, (AX)
  3972. MOVOU X1, 16(AX)
  3973. MOVOU X2, -32(AX)(R8*1)
  3974. MOVOU X3, -16(AX)(R8*1)
  3975. MOVQ DI, AX
  3976. emit_literal_done_match_emit_encodeBlockAsm10B:
  3977. match_nolit_loop_encodeBlockAsm10B:
  3978. MOVL CX, SI
  3979. SUBL BX, SI
  3980. MOVL SI, 16(SP)
  3981. ADDL $0x04, CX
  3982. ADDL $0x04, BX
  3983. MOVQ src_len+32(FP), SI
  3984. SUBL CX, SI
  3985. LEAQ (DX)(CX*1), DI
  3986. LEAQ (DX)(BX*1), BX
  3987. // matchLen
  3988. XORL R9, R9
  3989. matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
  3990. CMPL SI, $0x10
  3991. JB matchlen_match8_match_nolit_encodeBlockAsm10B
  3992. MOVQ (DI)(R9*1), R8
  3993. MOVQ 8(DI)(R9*1), R10
  3994. XORQ (BX)(R9*1), R8
  3995. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
  3996. XORQ 8(BX)(R9*1), R10
  3997. JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B
  3998. LEAL -16(SI), SI
  3999. LEAL 16(R9), R9
  4000. JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B
  4001. matchlen_bsf_16match_nolit_encodeBlockAsm10B:
  4002. #ifdef GOAMD64_v3
  4003. TZCNTQ R10, R10
  4004. #else
  4005. BSFQ R10, R10
  4006. #endif
  4007. SARQ $0x03, R10
  4008. LEAL 8(R9)(R10*1), R9
  4009. JMP match_nolit_end_encodeBlockAsm10B
  4010. matchlen_match8_match_nolit_encodeBlockAsm10B:
  4011. CMPL SI, $0x08
  4012. JB matchlen_match4_match_nolit_encodeBlockAsm10B
  4013. MOVQ (DI)(R9*1), R8
  4014. XORQ (BX)(R9*1), R8
  4015. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
  4016. LEAL -8(SI), SI
  4017. LEAL 8(R9), R9
  4018. JMP matchlen_match4_match_nolit_encodeBlockAsm10B
  4019. matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
  4020. #ifdef GOAMD64_v3
  4021. TZCNTQ R8, R8
  4022. #else
  4023. BSFQ R8, R8
  4024. #endif
  4025. SARQ $0x03, R8
  4026. LEAL (R9)(R8*1), R9
  4027. JMP match_nolit_end_encodeBlockAsm10B
  4028. matchlen_match4_match_nolit_encodeBlockAsm10B:
  4029. CMPL SI, $0x04
  4030. JB matchlen_match2_match_nolit_encodeBlockAsm10B
  4031. MOVL (DI)(R9*1), R8
  4032. CMPL (BX)(R9*1), R8
  4033. JNE matchlen_match2_match_nolit_encodeBlockAsm10B
  4034. LEAL -4(SI), SI
  4035. LEAL 4(R9), R9
  4036. matchlen_match2_match_nolit_encodeBlockAsm10B:
  4037. CMPL SI, $0x01
  4038. JE matchlen_match1_match_nolit_encodeBlockAsm10B
  4039. JB match_nolit_end_encodeBlockAsm10B
  4040. MOVW (DI)(R9*1), R8
  4041. CMPW (BX)(R9*1), R8
  4042. JNE matchlen_match1_match_nolit_encodeBlockAsm10B
  4043. LEAL 2(R9), R9
  4044. SUBL $0x02, SI
  4045. JZ match_nolit_end_encodeBlockAsm10B
  4046. matchlen_match1_match_nolit_encodeBlockAsm10B:
  4047. MOVB (DI)(R9*1), R8
  4048. CMPB (BX)(R9*1), R8
  4049. JNE match_nolit_end_encodeBlockAsm10B
  4050. LEAL 1(R9), R9
  4051. match_nolit_end_encodeBlockAsm10B:
  4052. ADDL R9, CX
  4053. MOVL 16(SP), BX
  4054. ADDL $0x04, R9
  4055. MOVL CX, 12(SP)
  4056. // emitCopy
  4057. CMPL R9, $0x40
  4058. JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B
  4059. CMPL BX, $0x00000800
  4060. JAE long_offset_short_match_nolit_encodeBlockAsm10B
  4061. MOVL $0x00000001, SI
  4062. LEAL 16(SI), SI
  4063. MOVB BL, 1(AX)
  4064. SHRL $0x08, BX
  4065. SHLL $0x05, BX
  4066. ORL BX, SI
  4067. MOVB SI, (AX)
  4068. ADDQ $0x02, AX
  4069. SUBL $0x08, R9
  4070. // emitRepeat
  4071. LEAL -4(R9), R9
  4072. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  4073. MOVL R9, SI
  4074. LEAL -4(R9), R9
  4075. CMPL SI, $0x08
  4076. JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  4077. CMPL SI, $0x0c
  4078. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  4079. CMPL BX, $0x00000800
  4080. JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  4081. cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  4082. CMPL R9, $0x00000104
  4083. JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  4084. LEAL -256(R9), R9
  4085. MOVW $0x0019, (AX)
  4086. MOVW R9, 2(AX)
  4087. ADDQ $0x04, AX
  4088. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4089. repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  4090. LEAL -4(R9), R9
  4091. MOVW $0x0015, (AX)
  4092. MOVB R9, 2(AX)
  4093. ADDQ $0x03, AX
  4094. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4095. repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  4096. SHLL $0x02, R9
  4097. ORL $0x01, R9
  4098. MOVW R9, (AX)
  4099. ADDQ $0x02, AX
  4100. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4101. repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  4102. XORQ SI, SI
  4103. LEAL 1(SI)(R9*4), R9
  4104. MOVB BL, 1(AX)
  4105. SARL $0x08, BX
  4106. SHLL $0x05, BX
  4107. ORL BX, R9
  4108. MOVB R9, (AX)
  4109. ADDQ $0x02, AX
  4110. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4111. long_offset_short_match_nolit_encodeBlockAsm10B:
  4112. MOVB $0xee, (AX)
  4113. MOVW BX, 1(AX)
  4114. LEAL -60(R9), R9
  4115. ADDQ $0x03, AX
  4116. // emitRepeat
  4117. MOVL R9, SI
  4118. LEAL -4(R9), R9
  4119. CMPL SI, $0x08
  4120. JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
  4121. CMPL SI, $0x0c
  4122. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
  4123. CMPL BX, $0x00000800
  4124. JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
  4125. cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4126. CMPL R9, $0x00000104
  4127. JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
  4128. LEAL -256(R9), R9
  4129. MOVW $0x0019, (AX)
  4130. MOVW R9, 2(AX)
  4131. ADDQ $0x04, AX
  4132. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4133. repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4134. LEAL -4(R9), R9
  4135. MOVW $0x0015, (AX)
  4136. MOVB R9, 2(AX)
  4137. ADDQ $0x03, AX
  4138. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4139. repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4140. SHLL $0x02, R9
  4141. ORL $0x01, R9
  4142. MOVW R9, (AX)
  4143. ADDQ $0x02, AX
  4144. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4145. repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4146. XORQ SI, SI
  4147. LEAL 1(SI)(R9*4), R9
  4148. MOVB BL, 1(AX)
  4149. SARL $0x08, BX
  4150. SHLL $0x05, BX
  4151. ORL BX, R9
  4152. MOVB R9, (AX)
  4153. ADDQ $0x02, AX
  4154. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4155. two_byte_offset_short_match_nolit_encodeBlockAsm10B:
  4156. MOVL R9, SI
  4157. SHLL $0x02, SI
  4158. CMPL R9, $0x0c
  4159. JAE emit_copy_three_match_nolit_encodeBlockAsm10B
  4160. CMPL BX, $0x00000800
  4161. JAE emit_copy_three_match_nolit_encodeBlockAsm10B
  4162. LEAL -15(SI), SI
  4163. MOVB BL, 1(AX)
  4164. SHRL $0x08, BX
  4165. SHLL $0x05, BX
  4166. ORL BX, SI
  4167. MOVB SI, (AX)
  4168. ADDQ $0x02, AX
  4169. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4170. emit_copy_three_match_nolit_encodeBlockAsm10B:
  4171. LEAL -2(SI), SI
  4172. MOVB SI, (AX)
  4173. MOVW BX, 1(AX)
  4174. ADDQ $0x03, AX
  4175. match_nolit_emitcopy_end_encodeBlockAsm10B:
  4176. CMPL CX, 8(SP)
  4177. JAE emit_remainder_encodeBlockAsm10B
  4178. MOVQ -2(DX)(CX*1), SI
  4179. CMPQ AX, (SP)
  4180. JB match_nolit_dst_ok_encodeBlockAsm10B
  4181. MOVQ $0x00000000, ret+48(FP)
  4182. RET
  4183. match_nolit_dst_ok_encodeBlockAsm10B:
  4184. MOVQ $0x9e3779b1, R8
  4185. MOVQ SI, DI
  4186. SHRQ $0x10, SI
  4187. MOVQ SI, BX
  4188. SHLQ $0x20, DI
  4189. IMULQ R8, DI
  4190. SHRQ $0x36, DI
  4191. SHLQ $0x20, BX
  4192. IMULQ R8, BX
  4193. SHRQ $0x36, BX
  4194. LEAL -2(CX), R8
  4195. LEAQ 24(SP)(BX*4), R9
  4196. MOVL (R9), BX
  4197. MOVL R8, 24(SP)(DI*4)
  4198. MOVL CX, (R9)
  4199. CMPL (DX)(BX*1), SI
  4200. JEQ match_nolit_loop_encodeBlockAsm10B
  4201. INCL CX
  4202. JMP search_loop_encodeBlockAsm10B
  4203. emit_remainder_encodeBlockAsm10B:
  4204. MOVQ src_len+32(FP), CX
  4205. SUBL 12(SP), CX
  4206. LEAQ 3(AX)(CX*1), CX
  4207. CMPQ CX, (SP)
  4208. JB emit_remainder_ok_encodeBlockAsm10B
  4209. MOVQ $0x00000000, ret+48(FP)
  4210. RET
  4211. emit_remainder_ok_encodeBlockAsm10B:
  4212. MOVQ src_len+32(FP), CX
  4213. MOVL 12(SP), BX
  4214. CMPL BX, CX
  4215. JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
  4216. MOVL CX, SI
  4217. MOVL CX, 12(SP)
  4218. LEAQ (DX)(BX*1), CX
  4219. SUBL BX, SI
  4220. LEAL -1(SI), DX
  4221. CMPL DX, $0x3c
  4222. JB one_byte_emit_remainder_encodeBlockAsm10B
  4223. CMPL DX, $0x00000100
  4224. JB two_bytes_emit_remainder_encodeBlockAsm10B
  4225. JB three_bytes_emit_remainder_encodeBlockAsm10B
  4226. three_bytes_emit_remainder_encodeBlockAsm10B:
  4227. MOVB $0xf4, (AX)
  4228. MOVW DX, 1(AX)
  4229. ADDQ $0x03, AX
  4230. JMP memmove_long_emit_remainder_encodeBlockAsm10B
  4231. two_bytes_emit_remainder_encodeBlockAsm10B:
  4232. MOVB $0xf0, (AX)
  4233. MOVB DL, 1(AX)
  4234. ADDQ $0x02, AX
  4235. CMPL DX, $0x40
  4236. JB memmove_emit_remainder_encodeBlockAsm10B
  4237. JMP memmove_long_emit_remainder_encodeBlockAsm10B
  4238. one_byte_emit_remainder_encodeBlockAsm10B:
  4239. SHLB $0x02, DL
  4240. MOVB DL, (AX)
  4241. ADDQ $0x01, AX
  4242. memmove_emit_remainder_encodeBlockAsm10B:
  4243. LEAQ (AX)(SI*1), DX
  4244. MOVL SI, BX
  4245. // genMemMoveShort
  4246. CMPQ BX, $0x03
  4247. JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
  4248. JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
  4249. CMPQ BX, $0x08
  4250. JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
  4251. CMPQ BX, $0x10
  4252. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
  4253. CMPQ BX, $0x20
  4254. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
  4255. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
  4256. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
  4257. MOVB (CX), SI
  4258. MOVB -1(CX)(BX*1), CL
  4259. MOVB SI, (AX)
  4260. MOVB CL, -1(AX)(BX*1)
  4261. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4262. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
  4263. MOVW (CX), SI
  4264. MOVB 2(CX), CL
  4265. MOVW SI, (AX)
  4266. MOVB CL, 2(AX)
  4267. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4268. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
  4269. MOVL (CX), SI
  4270. MOVL -4(CX)(BX*1), CX
  4271. MOVL SI, (AX)
  4272. MOVL CX, -4(AX)(BX*1)
  4273. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4274. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
  4275. MOVQ (CX), SI
  4276. MOVQ -8(CX)(BX*1), CX
  4277. MOVQ SI, (AX)
  4278. MOVQ CX, -8(AX)(BX*1)
  4279. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4280. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
  4281. MOVOU (CX), X0
  4282. MOVOU -16(CX)(BX*1), X1
  4283. MOVOU X0, (AX)
  4284. MOVOU X1, -16(AX)(BX*1)
  4285. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4286. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
  4287. MOVOU (CX), X0
  4288. MOVOU 16(CX), X1
  4289. MOVOU -32(CX)(BX*1), X2
  4290. MOVOU -16(CX)(BX*1), X3
  4291. MOVOU X0, (AX)
  4292. MOVOU X1, 16(AX)
  4293. MOVOU X2, -32(AX)(BX*1)
  4294. MOVOU X3, -16(AX)(BX*1)
  4295. memmove_end_copy_emit_remainder_encodeBlockAsm10B:
  4296. MOVQ DX, AX
  4297. JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
  4298. memmove_long_emit_remainder_encodeBlockAsm10B:
  4299. LEAQ (AX)(SI*1), DX
  4300. MOVL SI, BX
  4301. // genMemMoveLong
  4302. MOVOU (CX), X0
  4303. MOVOU 16(CX), X1
  4304. MOVOU -32(CX)(BX*1), X2
  4305. MOVOU -16(CX)(BX*1), X3
  4306. MOVQ BX, DI
  4307. SHRQ $0x05, DI
  4308. MOVQ AX, SI
  4309. ANDL $0x0000001f, SI
  4310. MOVQ $0x00000040, R8
  4311. SUBQ SI, R8
  4312. DECQ DI
  4313. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
  4314. LEAQ -32(CX)(R8*1), SI
  4315. LEAQ -32(AX)(R8*1), R9
  4316. emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
  4317. MOVOU (SI), X4
  4318. MOVOU 16(SI), X5
  4319. MOVOA X4, (R9)
  4320. MOVOA X5, 16(R9)
  4321. ADDQ $0x20, R9
  4322. ADDQ $0x20, SI
  4323. ADDQ $0x20, R8
  4324. DECQ DI
  4325. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
  4326. emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
  4327. MOVOU -32(CX)(R8*1), X4
  4328. MOVOU -16(CX)(R8*1), X5
  4329. MOVOA X4, -32(AX)(R8*1)
  4330. MOVOA X5, -16(AX)(R8*1)
  4331. ADDQ $0x20, R8
  4332. CMPQ BX, R8
  4333. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
  4334. MOVOU X0, (AX)
  4335. MOVOU X1, 16(AX)
  4336. MOVOU X2, -32(AX)(BX*1)
  4337. MOVOU X3, -16(AX)(BX*1)
  4338. MOVQ DX, AX
  4339. emit_literal_done_emit_remainder_encodeBlockAsm10B:
  4340. MOVQ dst_base+0(FP), CX
  4341. SUBQ CX, AX
  4342. MOVQ AX, ret+48(FP)
  4343. RET
  4344. // func encodeBlockAsm8B(dst []byte, src []byte) int
  4345. // Requires: BMI, SSE2
  4346. TEXT ·encodeBlockAsm8B(SB), $1048-56
  4347. MOVQ dst_base+0(FP), AX
  4348. MOVQ $0x00000008, CX
  4349. LEAQ 24(SP), DX
  4350. PXOR X0, X0
  4351. zero_loop_encodeBlockAsm8B:
  4352. MOVOU X0, (DX)
  4353. MOVOU X0, 16(DX)
  4354. MOVOU X0, 32(DX)
  4355. MOVOU X0, 48(DX)
  4356. MOVOU X0, 64(DX)
  4357. MOVOU X0, 80(DX)
  4358. MOVOU X0, 96(DX)
  4359. MOVOU X0, 112(DX)
  4360. ADDQ $0x80, DX
  4361. DECQ CX
  4362. JNZ zero_loop_encodeBlockAsm8B
  4363. MOVL $0x00000000, 12(SP)
  4364. MOVQ src_len+32(FP), CX
  4365. LEAQ -9(CX), DX
  4366. LEAQ -8(CX), BX
  4367. MOVL BX, 8(SP)
  4368. SHRQ $0x05, CX
  4369. SUBL CX, DX
  4370. LEAQ (AX)(DX*1), DX
  4371. MOVQ DX, (SP)
  4372. MOVL $0x00000001, CX
  4373. MOVL CX, 16(SP)
  4374. MOVQ src_base+24(FP), DX
  4375. search_loop_encodeBlockAsm8B:
  4376. MOVL CX, BX
  4377. SUBL 12(SP), BX
  4378. SHRL $0x04, BX
  4379. LEAL 4(CX)(BX*1), BX
  4380. CMPL BX, 8(SP)
  4381. JAE emit_remainder_encodeBlockAsm8B
  4382. MOVQ (DX)(CX*1), SI
  4383. MOVL BX, 20(SP)
  4384. MOVQ $0x9e3779b1, R8
  4385. MOVQ SI, R9
  4386. MOVQ SI, R10
  4387. SHRQ $0x08, R10
  4388. SHLQ $0x20, R9
  4389. IMULQ R8, R9
  4390. SHRQ $0x38, R9
  4391. SHLQ $0x20, R10
  4392. IMULQ R8, R10
  4393. SHRQ $0x38, R10
  4394. MOVL 24(SP)(R9*4), BX
  4395. MOVL 24(SP)(R10*4), DI
  4396. MOVL CX, 24(SP)(R9*4)
  4397. LEAL 1(CX), R9
  4398. MOVL R9, 24(SP)(R10*4)
  4399. MOVQ SI, R9
  4400. SHRQ $0x10, R9
  4401. SHLQ $0x20, R9
  4402. IMULQ R8, R9
  4403. SHRQ $0x38, R9
  4404. MOVL CX, R8
  4405. SUBL 16(SP), R8
  4406. MOVL 1(DX)(R8*1), R10
  4407. MOVQ SI, R8
  4408. SHRQ $0x08, R8
  4409. CMPL R8, R10
  4410. JNE no_repeat_found_encodeBlockAsm8B
  4411. LEAL 1(CX), SI
  4412. MOVL 12(SP), DI
  4413. MOVL SI, BX
  4414. SUBL 16(SP), BX
  4415. JZ repeat_extend_back_end_encodeBlockAsm8B
  4416. repeat_extend_back_loop_encodeBlockAsm8B:
  4417. CMPL SI, DI
  4418. JBE repeat_extend_back_end_encodeBlockAsm8B
  4419. MOVB -1(DX)(BX*1), R8
  4420. MOVB -1(DX)(SI*1), R9
  4421. CMPB R8, R9
  4422. JNE repeat_extend_back_end_encodeBlockAsm8B
  4423. LEAL -1(SI), SI
  4424. DECL BX
  4425. JNZ repeat_extend_back_loop_encodeBlockAsm8B
  4426. repeat_extend_back_end_encodeBlockAsm8B:
  4427. MOVL 12(SP), BX
  4428. CMPL BX, SI
  4429. JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
  4430. MOVL SI, R8
  4431. MOVL SI, 12(SP)
  4432. LEAQ (DX)(BX*1), R9
  4433. SUBL BX, R8
  4434. LEAL -1(R8), BX
  4435. CMPL BX, $0x3c
  4436. JB one_byte_repeat_emit_encodeBlockAsm8B
  4437. CMPL BX, $0x00000100
  4438. JB two_bytes_repeat_emit_encodeBlockAsm8B
  4439. JB three_bytes_repeat_emit_encodeBlockAsm8B
  4440. three_bytes_repeat_emit_encodeBlockAsm8B:
  4441. MOVB $0xf4, (AX)
  4442. MOVW BX, 1(AX)
  4443. ADDQ $0x03, AX
  4444. JMP memmove_long_repeat_emit_encodeBlockAsm8B
  4445. two_bytes_repeat_emit_encodeBlockAsm8B:
  4446. MOVB $0xf0, (AX)
  4447. MOVB BL, 1(AX)
  4448. ADDQ $0x02, AX
  4449. CMPL BX, $0x40
  4450. JB memmove_repeat_emit_encodeBlockAsm8B
  4451. JMP memmove_long_repeat_emit_encodeBlockAsm8B
  4452. one_byte_repeat_emit_encodeBlockAsm8B:
  4453. SHLB $0x02, BL
  4454. MOVB BL, (AX)
  4455. ADDQ $0x01, AX
  4456. memmove_repeat_emit_encodeBlockAsm8B:
  4457. LEAQ (AX)(R8*1), BX
  4458. // genMemMoveShort
  4459. CMPQ R8, $0x08
  4460. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
  4461. CMPQ R8, $0x10
  4462. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
  4463. CMPQ R8, $0x20
  4464. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
  4465. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
  4466. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
  4467. MOVQ (R9), R10
  4468. MOVQ R10, (AX)
  4469. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4470. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
  4471. MOVQ (R9), R10
  4472. MOVQ -8(R9)(R8*1), R9
  4473. MOVQ R10, (AX)
  4474. MOVQ R9, -8(AX)(R8*1)
  4475. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4476. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
  4477. MOVOU (R9), X0
  4478. MOVOU -16(R9)(R8*1), X1
  4479. MOVOU X0, (AX)
  4480. MOVOU X1, -16(AX)(R8*1)
  4481. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4482. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
  4483. MOVOU (R9), X0
  4484. MOVOU 16(R9), X1
  4485. MOVOU -32(R9)(R8*1), X2
  4486. MOVOU -16(R9)(R8*1), X3
  4487. MOVOU X0, (AX)
  4488. MOVOU X1, 16(AX)
  4489. MOVOU X2, -32(AX)(R8*1)
  4490. MOVOU X3, -16(AX)(R8*1)
  4491. memmove_end_copy_repeat_emit_encodeBlockAsm8B:
  4492. MOVQ BX, AX
  4493. JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
  4494. memmove_long_repeat_emit_encodeBlockAsm8B:
  4495. LEAQ (AX)(R8*1), BX
  4496. // genMemMoveLong
  4497. MOVOU (R9), X0
  4498. MOVOU 16(R9), X1
  4499. MOVOU -32(R9)(R8*1), X2
  4500. MOVOU -16(R9)(R8*1), X3
  4501. MOVQ R8, R11
  4502. SHRQ $0x05, R11
  4503. MOVQ AX, R10
  4504. ANDL $0x0000001f, R10
  4505. MOVQ $0x00000040, R12
  4506. SUBQ R10, R12
  4507. DECQ R11
  4508. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4509. LEAQ -32(R9)(R12*1), R10
  4510. LEAQ -32(AX)(R12*1), R13
  4511. emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
  4512. MOVOU (R10), X4
  4513. MOVOU 16(R10), X5
  4514. MOVOA X4, (R13)
  4515. MOVOA X5, 16(R13)
  4516. ADDQ $0x20, R13
  4517. ADDQ $0x20, R10
  4518. ADDQ $0x20, R12
  4519. DECQ R11
  4520. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
  4521. emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
  4522. MOVOU -32(R9)(R12*1), X4
  4523. MOVOU -16(R9)(R12*1), X5
  4524. MOVOA X4, -32(AX)(R12*1)
  4525. MOVOA X5, -16(AX)(R12*1)
  4526. ADDQ $0x20, R12
  4527. CMPQ R8, R12
  4528. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4529. MOVOU X0, (AX)
  4530. MOVOU X1, 16(AX)
  4531. MOVOU X2, -32(AX)(R8*1)
  4532. MOVOU X3, -16(AX)(R8*1)
  4533. MOVQ BX, AX
  4534. emit_literal_done_repeat_emit_encodeBlockAsm8B:
  4535. ADDL $0x05, CX
  4536. MOVL CX, BX
  4537. SUBL 16(SP), BX
  4538. MOVQ src_len+32(FP), R8
  4539. SUBL CX, R8
  4540. LEAQ (DX)(CX*1), R9
  4541. LEAQ (DX)(BX*1), BX
  4542. // matchLen
  4543. XORL R11, R11
  4544. matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
  4545. CMPL R8, $0x10
  4546. JB matchlen_match8_repeat_extend_encodeBlockAsm8B
  4547. MOVQ (R9)(R11*1), R10
  4548. MOVQ 8(R9)(R11*1), R12
  4549. XORQ (BX)(R11*1), R10
  4550. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
  4551. XORQ 8(BX)(R11*1), R12
  4552. JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B
  4553. LEAL -16(R8), R8
  4554. LEAL 16(R11), R11
  4555. JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B
  4556. matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
  4557. #ifdef GOAMD64_v3
  4558. TZCNTQ R12, R12
  4559. #else
  4560. BSFQ R12, R12
  4561. #endif
  4562. SARQ $0x03, R12
  4563. LEAL 8(R11)(R12*1), R11
  4564. JMP repeat_extend_forward_end_encodeBlockAsm8B
  4565. matchlen_match8_repeat_extend_encodeBlockAsm8B:
  4566. CMPL R8, $0x08
  4567. JB matchlen_match4_repeat_extend_encodeBlockAsm8B
  4568. MOVQ (R9)(R11*1), R10
  4569. XORQ (BX)(R11*1), R10
  4570. JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
  4571. LEAL -8(R8), R8
  4572. LEAL 8(R11), R11
  4573. JMP matchlen_match4_repeat_extend_encodeBlockAsm8B
  4574. matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
  4575. #ifdef GOAMD64_v3
  4576. TZCNTQ R10, R10
  4577. #else
  4578. BSFQ R10, R10
  4579. #endif
  4580. SARQ $0x03, R10
  4581. LEAL (R11)(R10*1), R11
  4582. JMP repeat_extend_forward_end_encodeBlockAsm8B
  4583. matchlen_match4_repeat_extend_encodeBlockAsm8B:
  4584. CMPL R8, $0x04
  4585. JB matchlen_match2_repeat_extend_encodeBlockAsm8B
  4586. MOVL (R9)(R11*1), R10
  4587. CMPL (BX)(R11*1), R10
  4588. JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
  4589. LEAL -4(R8), R8
  4590. LEAL 4(R11), R11
  4591. matchlen_match2_repeat_extend_encodeBlockAsm8B:
  4592. CMPL R8, $0x01
  4593. JE matchlen_match1_repeat_extend_encodeBlockAsm8B
  4594. JB repeat_extend_forward_end_encodeBlockAsm8B
  4595. MOVW (R9)(R11*1), R10
  4596. CMPW (BX)(R11*1), R10
  4597. JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
  4598. LEAL 2(R11), R11
  4599. SUBL $0x02, R8
  4600. JZ repeat_extend_forward_end_encodeBlockAsm8B
  4601. matchlen_match1_repeat_extend_encodeBlockAsm8B:
  4602. MOVB (R9)(R11*1), R10
  4603. CMPB (BX)(R11*1), R10
  4604. JNE repeat_extend_forward_end_encodeBlockAsm8B
  4605. LEAL 1(R11), R11
  4606. repeat_extend_forward_end_encodeBlockAsm8B:
  4607. ADDL R11, CX
  4608. MOVL CX, BX
  4609. SUBL SI, BX
  4610. MOVL 16(SP), SI
  4611. TESTL DI, DI
  4612. JZ repeat_as_copy_encodeBlockAsm8B
  4613. // emitRepeat
  4614. MOVL BX, SI
  4615. LEAL -4(BX), BX
  4616. CMPL SI, $0x08
  4617. JBE repeat_two_match_repeat_encodeBlockAsm8B
  4618. CMPL SI, $0x0c
  4619. JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
  4620. cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
  4621. CMPL BX, $0x00000104
  4622. JB repeat_three_match_repeat_encodeBlockAsm8B
  4623. LEAL -256(BX), BX
  4624. MOVW $0x0019, (AX)
  4625. MOVW BX, 2(AX)
  4626. ADDQ $0x04, AX
  4627. JMP repeat_end_emit_encodeBlockAsm8B
  4628. repeat_three_match_repeat_encodeBlockAsm8B:
  4629. LEAL -4(BX), BX
  4630. MOVW $0x0015, (AX)
  4631. MOVB BL, 2(AX)
  4632. ADDQ $0x03, AX
  4633. JMP repeat_end_emit_encodeBlockAsm8B
  4634. repeat_two_match_repeat_encodeBlockAsm8B:
  4635. SHLL $0x02, BX
  4636. ORL $0x01, BX
  4637. MOVW BX, (AX)
  4638. ADDQ $0x02, AX
  4639. JMP repeat_end_emit_encodeBlockAsm8B
  4640. XORQ DI, DI
  4641. LEAL 1(DI)(BX*4), BX
  4642. MOVB SI, 1(AX)
  4643. SARL $0x08, SI
  4644. SHLL $0x05, SI
  4645. ORL SI, BX
  4646. MOVB BL, (AX)
  4647. ADDQ $0x02, AX
  4648. JMP repeat_end_emit_encodeBlockAsm8B
  4649. repeat_as_copy_encodeBlockAsm8B:
  4650. // emitCopy
  4651. CMPL BX, $0x40
  4652. JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
  4653. CMPL SI, $0x00000800
  4654. JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
  4655. MOVL $0x00000001, DI
  4656. LEAL 16(DI), DI
  4657. MOVB SI, 1(AX)
  4658. SHRL $0x08, SI
  4659. SHLL $0x05, SI
  4660. ORL SI, DI
  4661. MOVB DI, (AX)
  4662. ADDQ $0x02, AX
  4663. SUBL $0x08, BX
  4664. // emitRepeat
  4665. LEAL -4(BX), BX
  4666. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4667. MOVL BX, SI
  4668. LEAL -4(BX), BX
  4669. CMPL SI, $0x08
  4670. JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4671. CMPL SI, $0x0c
  4672. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4673. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4674. CMPL BX, $0x00000104
  4675. JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4676. LEAL -256(BX), BX
  4677. MOVW $0x0019, (AX)
  4678. MOVW BX, 2(AX)
  4679. ADDQ $0x04, AX
  4680. JMP repeat_end_emit_encodeBlockAsm8B
  4681. repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4682. LEAL -4(BX), BX
  4683. MOVW $0x0015, (AX)
  4684. MOVB BL, 2(AX)
  4685. ADDQ $0x03, AX
  4686. JMP repeat_end_emit_encodeBlockAsm8B
  4687. repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4688. SHLL $0x02, BX
  4689. ORL $0x01, BX
  4690. MOVW BX, (AX)
  4691. ADDQ $0x02, AX
  4692. JMP repeat_end_emit_encodeBlockAsm8B
  4693. XORQ DI, DI
  4694. LEAL 1(DI)(BX*4), BX
  4695. MOVB SI, 1(AX)
  4696. SARL $0x08, SI
  4697. SHLL $0x05, SI
  4698. ORL SI, BX
  4699. MOVB BL, (AX)
  4700. ADDQ $0x02, AX
  4701. JMP repeat_end_emit_encodeBlockAsm8B
  4702. long_offset_short_repeat_as_copy_encodeBlockAsm8B:
  4703. MOVB $0xee, (AX)
  4704. MOVW SI, 1(AX)
  4705. LEAL -60(BX), BX
  4706. ADDQ $0x03, AX
  4707. // emitRepeat
  4708. MOVL BX, SI
  4709. LEAL -4(BX), BX
  4710. CMPL SI, $0x08
  4711. JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4712. CMPL SI, $0x0c
  4713. JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4714. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4715. CMPL BX, $0x00000104
  4716. JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4717. LEAL -256(BX), BX
  4718. MOVW $0x0019, (AX)
  4719. MOVW BX, 2(AX)
  4720. ADDQ $0x04, AX
  4721. JMP repeat_end_emit_encodeBlockAsm8B
  4722. repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4723. LEAL -4(BX), BX
  4724. MOVW $0x0015, (AX)
  4725. MOVB BL, 2(AX)
  4726. ADDQ $0x03, AX
  4727. JMP repeat_end_emit_encodeBlockAsm8B
  4728. repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4729. SHLL $0x02, BX
  4730. ORL $0x01, BX
  4731. MOVW BX, (AX)
  4732. ADDQ $0x02, AX
  4733. JMP repeat_end_emit_encodeBlockAsm8B
  4734. XORQ DI, DI
  4735. LEAL 1(DI)(BX*4), BX
  4736. MOVB SI, 1(AX)
  4737. SARL $0x08, SI
  4738. SHLL $0x05, SI
  4739. ORL SI, BX
  4740. MOVB BL, (AX)
  4741. ADDQ $0x02, AX
  4742. JMP repeat_end_emit_encodeBlockAsm8B
  4743. two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
  4744. MOVL BX, DI
  4745. SHLL $0x02, DI
  4746. CMPL BX, $0x0c
  4747. JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
  4748. LEAL -15(DI), DI
  4749. MOVB SI, 1(AX)
  4750. SHRL $0x08, SI
  4751. SHLL $0x05, SI
  4752. ORL SI, DI
  4753. MOVB DI, (AX)
  4754. ADDQ $0x02, AX
  4755. JMP repeat_end_emit_encodeBlockAsm8B
  4756. emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
  4757. LEAL -2(DI), DI
  4758. MOVB DI, (AX)
  4759. MOVW SI, 1(AX)
  4760. ADDQ $0x03, AX
  4761. repeat_end_emit_encodeBlockAsm8B:
  4762. MOVL CX, 12(SP)
  4763. JMP search_loop_encodeBlockAsm8B
  4764. no_repeat_found_encodeBlockAsm8B:
  4765. CMPL (DX)(BX*1), SI
  4766. JEQ candidate_match_encodeBlockAsm8B
  4767. SHRQ $0x08, SI
  4768. MOVL 24(SP)(R9*4), BX
  4769. LEAL 2(CX), R8
  4770. CMPL (DX)(DI*1), SI
  4771. JEQ candidate2_match_encodeBlockAsm8B
  4772. MOVL R8, 24(SP)(R9*4)
  4773. SHRQ $0x08, SI
  4774. CMPL (DX)(BX*1), SI
  4775. JEQ candidate3_match_encodeBlockAsm8B
  4776. MOVL 20(SP), CX
  4777. JMP search_loop_encodeBlockAsm8B
  4778. candidate3_match_encodeBlockAsm8B:
  4779. ADDL $0x02, CX
  4780. JMP candidate_match_encodeBlockAsm8B
  4781. candidate2_match_encodeBlockAsm8B:
  4782. MOVL R8, 24(SP)(R9*4)
  4783. INCL CX
  4784. MOVL DI, BX
  4785. candidate_match_encodeBlockAsm8B:
  4786. MOVL 12(SP), SI
  4787. TESTL BX, BX
  4788. JZ match_extend_back_end_encodeBlockAsm8B
  4789. match_extend_back_loop_encodeBlockAsm8B:
  4790. CMPL CX, SI
  4791. JBE match_extend_back_end_encodeBlockAsm8B
  4792. MOVB -1(DX)(BX*1), DI
  4793. MOVB -1(DX)(CX*1), R8
  4794. CMPB DI, R8
  4795. JNE match_extend_back_end_encodeBlockAsm8B
  4796. LEAL -1(CX), CX
  4797. DECL BX
  4798. JZ match_extend_back_end_encodeBlockAsm8B
  4799. JMP match_extend_back_loop_encodeBlockAsm8B
  4800. match_extend_back_end_encodeBlockAsm8B:
  4801. MOVL CX, SI
  4802. SUBL 12(SP), SI
  4803. LEAQ 3(AX)(SI*1), SI
  4804. CMPQ SI, (SP)
  4805. JB match_dst_size_check_encodeBlockAsm8B
  4806. MOVQ $0x00000000, ret+48(FP)
  4807. RET
  4808. match_dst_size_check_encodeBlockAsm8B:
  4809. MOVL CX, SI
  4810. MOVL 12(SP), DI
  4811. CMPL DI, SI
  4812. JEQ emit_literal_done_match_emit_encodeBlockAsm8B
  4813. MOVL SI, R8
  4814. MOVL SI, 12(SP)
  4815. LEAQ (DX)(DI*1), SI
  4816. SUBL DI, R8
  4817. LEAL -1(R8), DI
  4818. CMPL DI, $0x3c
  4819. JB one_byte_match_emit_encodeBlockAsm8B
  4820. CMPL DI, $0x00000100
  4821. JB two_bytes_match_emit_encodeBlockAsm8B
  4822. JB three_bytes_match_emit_encodeBlockAsm8B
  4823. three_bytes_match_emit_encodeBlockAsm8B:
  4824. MOVB $0xf4, (AX)
  4825. MOVW DI, 1(AX)
  4826. ADDQ $0x03, AX
  4827. JMP memmove_long_match_emit_encodeBlockAsm8B
  4828. two_bytes_match_emit_encodeBlockAsm8B:
  4829. MOVB $0xf0, (AX)
  4830. MOVB DI, 1(AX)
  4831. ADDQ $0x02, AX
  4832. CMPL DI, $0x40
  4833. JB memmove_match_emit_encodeBlockAsm8B
  4834. JMP memmove_long_match_emit_encodeBlockAsm8B
  4835. one_byte_match_emit_encodeBlockAsm8B:
  4836. SHLB $0x02, DI
  4837. MOVB DI, (AX)
  4838. ADDQ $0x01, AX
  4839. memmove_match_emit_encodeBlockAsm8B:
  4840. LEAQ (AX)(R8*1), DI
  4841. // genMemMoveShort
  4842. CMPQ R8, $0x08
  4843. JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
  4844. CMPQ R8, $0x10
  4845. JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
  4846. CMPQ R8, $0x20
  4847. JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
  4848. JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
  4849. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
  4850. MOVQ (SI), R9
  4851. MOVQ R9, (AX)
  4852. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4853. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
  4854. MOVQ (SI), R9
  4855. MOVQ -8(SI)(R8*1), SI
  4856. MOVQ R9, (AX)
  4857. MOVQ SI, -8(AX)(R8*1)
  4858. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4859. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
  4860. MOVOU (SI), X0
  4861. MOVOU -16(SI)(R8*1), X1
  4862. MOVOU X0, (AX)
  4863. MOVOU X1, -16(AX)(R8*1)
  4864. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4865. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
  4866. MOVOU (SI), X0
  4867. MOVOU 16(SI), X1
  4868. MOVOU -32(SI)(R8*1), X2
  4869. MOVOU -16(SI)(R8*1), X3
  4870. MOVOU X0, (AX)
  4871. MOVOU X1, 16(AX)
  4872. MOVOU X2, -32(AX)(R8*1)
  4873. MOVOU X3, -16(AX)(R8*1)
  4874. memmove_end_copy_match_emit_encodeBlockAsm8B:
  4875. MOVQ DI, AX
  4876. JMP emit_literal_done_match_emit_encodeBlockAsm8B
  4877. memmove_long_match_emit_encodeBlockAsm8B:
  4878. LEAQ (AX)(R8*1), DI
  4879. // genMemMoveLong
  4880. MOVOU (SI), X0
  4881. MOVOU 16(SI), X1
  4882. MOVOU -32(SI)(R8*1), X2
  4883. MOVOU -16(SI)(R8*1), X3
  4884. MOVQ R8, R10
  4885. SHRQ $0x05, R10
  4886. MOVQ AX, R9
  4887. ANDL $0x0000001f, R9
  4888. MOVQ $0x00000040, R11
  4889. SUBQ R9, R11
  4890. DECQ R10
  4891. JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4892. LEAQ -32(SI)(R11*1), R9
  4893. LEAQ -32(AX)(R11*1), R12
  4894. emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
  4895. MOVOU (R9), X4
  4896. MOVOU 16(R9), X5
  4897. MOVOA X4, (R12)
  4898. MOVOA X5, 16(R12)
  4899. ADDQ $0x20, R12
  4900. ADDQ $0x20, R9
  4901. ADDQ $0x20, R11
  4902. DECQ R10
  4903. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
  4904. emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
  4905. MOVOU -32(SI)(R11*1), X4
  4906. MOVOU -16(SI)(R11*1), X5
  4907. MOVOA X4, -32(AX)(R11*1)
  4908. MOVOA X5, -16(AX)(R11*1)
  4909. ADDQ $0x20, R11
  4910. CMPQ R8, R11
  4911. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4912. MOVOU X0, (AX)
  4913. MOVOU X1, 16(AX)
  4914. MOVOU X2, -32(AX)(R8*1)
  4915. MOVOU X3, -16(AX)(R8*1)
  4916. MOVQ DI, AX
  4917. emit_literal_done_match_emit_encodeBlockAsm8B:
  4918. match_nolit_loop_encodeBlockAsm8B:
  4919. MOVL CX, SI
  4920. SUBL BX, SI
  4921. MOVL SI, 16(SP)
  4922. ADDL $0x04, CX
  4923. ADDL $0x04, BX
  4924. MOVQ src_len+32(FP), SI
  4925. SUBL CX, SI
  4926. LEAQ (DX)(CX*1), DI
  4927. LEAQ (DX)(BX*1), BX
  4928. // matchLen
  4929. XORL R9, R9
  4930. matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
  4931. CMPL SI, $0x10
  4932. JB matchlen_match8_match_nolit_encodeBlockAsm8B
  4933. MOVQ (DI)(R9*1), R8
  4934. MOVQ 8(DI)(R9*1), R10
  4935. XORQ (BX)(R9*1), R8
  4936. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
  4937. XORQ 8(BX)(R9*1), R10
  4938. JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B
  4939. LEAL -16(SI), SI
  4940. LEAL 16(R9), R9
  4941. JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B
  4942. matchlen_bsf_16match_nolit_encodeBlockAsm8B:
  4943. #ifdef GOAMD64_v3
  4944. TZCNTQ R10, R10
  4945. #else
  4946. BSFQ R10, R10
  4947. #endif
  4948. SARQ $0x03, R10
  4949. LEAL 8(R9)(R10*1), R9
  4950. JMP match_nolit_end_encodeBlockAsm8B
  4951. matchlen_match8_match_nolit_encodeBlockAsm8B:
  4952. CMPL SI, $0x08
  4953. JB matchlen_match4_match_nolit_encodeBlockAsm8B
  4954. MOVQ (DI)(R9*1), R8
  4955. XORQ (BX)(R9*1), R8
  4956. JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
  4957. LEAL -8(SI), SI
  4958. LEAL 8(R9), R9
  4959. JMP matchlen_match4_match_nolit_encodeBlockAsm8B
  4960. matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
  4961. #ifdef GOAMD64_v3
  4962. TZCNTQ R8, R8
  4963. #else
  4964. BSFQ R8, R8
  4965. #endif
  4966. SARQ $0x03, R8
  4967. LEAL (R9)(R8*1), R9
  4968. JMP match_nolit_end_encodeBlockAsm8B
  4969. matchlen_match4_match_nolit_encodeBlockAsm8B:
  4970. CMPL SI, $0x04
  4971. JB matchlen_match2_match_nolit_encodeBlockAsm8B
  4972. MOVL (DI)(R9*1), R8
  4973. CMPL (BX)(R9*1), R8
  4974. JNE matchlen_match2_match_nolit_encodeBlockAsm8B
  4975. LEAL -4(SI), SI
  4976. LEAL 4(R9), R9
  4977. matchlen_match2_match_nolit_encodeBlockAsm8B:
  4978. CMPL SI, $0x01
  4979. JE matchlen_match1_match_nolit_encodeBlockAsm8B
  4980. JB match_nolit_end_encodeBlockAsm8B
  4981. MOVW (DI)(R9*1), R8
  4982. CMPW (BX)(R9*1), R8
  4983. JNE matchlen_match1_match_nolit_encodeBlockAsm8B
  4984. LEAL 2(R9), R9
  4985. SUBL $0x02, SI
  4986. JZ match_nolit_end_encodeBlockAsm8B
  4987. matchlen_match1_match_nolit_encodeBlockAsm8B:
  4988. MOVB (DI)(R9*1), R8
  4989. CMPB (BX)(R9*1), R8
  4990. JNE match_nolit_end_encodeBlockAsm8B
  4991. LEAL 1(R9), R9
  4992. match_nolit_end_encodeBlockAsm8B:
  4993. ADDL R9, CX
  4994. MOVL 16(SP), BX
  4995. ADDL $0x04, R9
  4996. MOVL CX, 12(SP)
  4997. // emitCopy
  4998. CMPL R9, $0x40
  4999. JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B
  5000. CMPL BX, $0x00000800
  5001. JAE long_offset_short_match_nolit_encodeBlockAsm8B
  5002. MOVL $0x00000001, SI
  5003. LEAL 16(SI), SI
  5004. MOVB BL, 1(AX)
  5005. SHRL $0x08, BX
  5006. SHLL $0x05, BX
  5007. ORL BX, SI
  5008. MOVB SI, (AX)
  5009. ADDQ $0x02, AX
  5010. SUBL $0x08, R9
  5011. // emitRepeat
  5012. LEAL -4(R9), R9
  5013. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  5014. MOVL R9, BX
  5015. LEAL -4(R9), R9
  5016. CMPL BX, $0x08
  5017. JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  5018. CMPL BX, $0x0c
  5019. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  5020. cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  5021. CMPL R9, $0x00000104
  5022. JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  5023. LEAL -256(R9), R9
  5024. MOVW $0x0019, (AX)
  5025. MOVW R9, 2(AX)
  5026. ADDQ $0x04, AX
  5027. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5028. repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  5029. LEAL -4(R9), R9
  5030. MOVW $0x0015, (AX)
  5031. MOVB R9, 2(AX)
  5032. ADDQ $0x03, AX
  5033. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5034. repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  5035. SHLL $0x02, R9
  5036. ORL $0x01, R9
  5037. MOVW R9, (AX)
  5038. ADDQ $0x02, AX
  5039. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5040. XORQ SI, SI
  5041. LEAL 1(SI)(R9*4), R9
  5042. MOVB BL, 1(AX)
  5043. SARL $0x08, BX
  5044. SHLL $0x05, BX
  5045. ORL BX, R9
  5046. MOVB R9, (AX)
  5047. ADDQ $0x02, AX
  5048. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5049. long_offset_short_match_nolit_encodeBlockAsm8B:
  5050. MOVB $0xee, (AX)
  5051. MOVW BX, 1(AX)
  5052. LEAL -60(R9), R9
  5053. ADDQ $0x03, AX
  5054. // emitRepeat
  5055. MOVL R9, BX
  5056. LEAL -4(R9), R9
  5057. CMPL BX, $0x08
  5058. JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
  5059. CMPL BX, $0x0c
  5060. JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
  5061. cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
  5062. CMPL R9, $0x00000104
  5063. JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
  5064. LEAL -256(R9), R9
  5065. MOVW $0x0019, (AX)
  5066. MOVW R9, 2(AX)
  5067. ADDQ $0x04, AX
  5068. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5069. repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
  5070. LEAL -4(R9), R9
  5071. MOVW $0x0015, (AX)
  5072. MOVB R9, 2(AX)
  5073. ADDQ $0x03, AX
  5074. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5075. repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
  5076. SHLL $0x02, R9
  5077. ORL $0x01, R9
  5078. MOVW R9, (AX)
  5079. ADDQ $0x02, AX
  5080. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5081. XORQ SI, SI
  5082. LEAL 1(SI)(R9*4), R9
  5083. MOVB BL, 1(AX)
  5084. SARL $0x08, BX
  5085. SHLL $0x05, BX
  5086. ORL BX, R9
  5087. MOVB R9, (AX)
  5088. ADDQ $0x02, AX
  5089. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5090. two_byte_offset_short_match_nolit_encodeBlockAsm8B:
  5091. MOVL R9, SI
  5092. SHLL $0x02, SI
  5093. CMPL R9, $0x0c
  5094. JAE emit_copy_three_match_nolit_encodeBlockAsm8B
  5095. LEAL -15(SI), SI
  5096. MOVB BL, 1(AX)
  5097. SHRL $0x08, BX
  5098. SHLL $0x05, BX
  5099. ORL BX, SI
  5100. MOVB SI, (AX)
  5101. ADDQ $0x02, AX
  5102. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  5103. emit_copy_three_match_nolit_encodeBlockAsm8B:
  5104. LEAL -2(SI), SI
  5105. MOVB SI, (AX)
  5106. MOVW BX, 1(AX)
  5107. ADDQ $0x03, AX
  5108. match_nolit_emitcopy_end_encodeBlockAsm8B:
  5109. CMPL CX, 8(SP)
  5110. JAE emit_remainder_encodeBlockAsm8B
  5111. MOVQ -2(DX)(CX*1), SI
  5112. CMPQ AX, (SP)
  5113. JB match_nolit_dst_ok_encodeBlockAsm8B
  5114. MOVQ $0x00000000, ret+48(FP)
  5115. RET
  5116. match_nolit_dst_ok_encodeBlockAsm8B:
  5117. MOVQ $0x9e3779b1, R8
  5118. MOVQ SI, DI
  5119. SHRQ $0x10, SI
  5120. MOVQ SI, BX
  5121. SHLQ $0x20, DI
  5122. IMULQ R8, DI
  5123. SHRQ $0x38, DI
  5124. SHLQ $0x20, BX
  5125. IMULQ R8, BX
  5126. SHRQ $0x38, BX
  5127. LEAL -2(CX), R8
  5128. LEAQ 24(SP)(BX*4), R9
  5129. MOVL (R9), BX
  5130. MOVL R8, 24(SP)(DI*4)
  5131. MOVL CX, (R9)
  5132. CMPL (DX)(BX*1), SI
  5133. JEQ match_nolit_loop_encodeBlockAsm8B
  5134. INCL CX
  5135. JMP search_loop_encodeBlockAsm8B
  5136. emit_remainder_encodeBlockAsm8B:
  5137. MOVQ src_len+32(FP), CX
  5138. SUBL 12(SP), CX
  5139. LEAQ 3(AX)(CX*1), CX
  5140. CMPQ CX, (SP)
  5141. JB emit_remainder_ok_encodeBlockAsm8B
  5142. MOVQ $0x00000000, ret+48(FP)
  5143. RET
  5144. emit_remainder_ok_encodeBlockAsm8B:
  5145. MOVQ src_len+32(FP), CX
  5146. MOVL 12(SP), BX
  5147. CMPL BX, CX
  5148. JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
  5149. MOVL CX, SI
  5150. MOVL CX, 12(SP)
  5151. LEAQ (DX)(BX*1), CX
  5152. SUBL BX, SI
  5153. LEAL -1(SI), DX
  5154. CMPL DX, $0x3c
  5155. JB one_byte_emit_remainder_encodeBlockAsm8B
  5156. CMPL DX, $0x00000100
  5157. JB two_bytes_emit_remainder_encodeBlockAsm8B
  5158. JB three_bytes_emit_remainder_encodeBlockAsm8B
  5159. three_bytes_emit_remainder_encodeBlockAsm8B:
  5160. MOVB $0xf4, (AX)
  5161. MOVW DX, 1(AX)
  5162. ADDQ $0x03, AX
  5163. JMP memmove_long_emit_remainder_encodeBlockAsm8B
  5164. two_bytes_emit_remainder_encodeBlockAsm8B:
  5165. MOVB $0xf0, (AX)
  5166. MOVB DL, 1(AX)
  5167. ADDQ $0x02, AX
  5168. CMPL DX, $0x40
  5169. JB memmove_emit_remainder_encodeBlockAsm8B
  5170. JMP memmove_long_emit_remainder_encodeBlockAsm8B
  5171. one_byte_emit_remainder_encodeBlockAsm8B:
  5172. SHLB $0x02, DL
  5173. MOVB DL, (AX)
  5174. ADDQ $0x01, AX
  5175. memmove_emit_remainder_encodeBlockAsm8B:
  5176. LEAQ (AX)(SI*1), DX
  5177. MOVL SI, BX
  5178. // genMemMoveShort
  5179. CMPQ BX, $0x03
  5180. JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
  5181. JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
  5182. CMPQ BX, $0x08
  5183. JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
  5184. CMPQ BX, $0x10
  5185. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
  5186. CMPQ BX, $0x20
  5187. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
  5188. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
  5189. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
  5190. MOVB (CX), SI
  5191. MOVB -1(CX)(BX*1), CL
  5192. MOVB SI, (AX)
  5193. MOVB CL, -1(AX)(BX*1)
  5194. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5195. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
  5196. MOVW (CX), SI
  5197. MOVB 2(CX), CL
  5198. MOVW SI, (AX)
  5199. MOVB CL, 2(AX)
  5200. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5201. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
  5202. MOVL (CX), SI
  5203. MOVL -4(CX)(BX*1), CX
  5204. MOVL SI, (AX)
  5205. MOVL CX, -4(AX)(BX*1)
  5206. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5207. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
  5208. MOVQ (CX), SI
  5209. MOVQ -8(CX)(BX*1), CX
  5210. MOVQ SI, (AX)
  5211. MOVQ CX, -8(AX)(BX*1)
  5212. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5213. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
  5214. MOVOU (CX), X0
  5215. MOVOU -16(CX)(BX*1), X1
  5216. MOVOU X0, (AX)
  5217. MOVOU X1, -16(AX)(BX*1)
  5218. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5219. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
  5220. MOVOU (CX), X0
  5221. MOVOU 16(CX), X1
  5222. MOVOU -32(CX)(BX*1), X2
  5223. MOVOU -16(CX)(BX*1), X3
  5224. MOVOU X0, (AX)
  5225. MOVOU X1, 16(AX)
  5226. MOVOU X2, -32(AX)(BX*1)
  5227. MOVOU X3, -16(AX)(BX*1)
  5228. memmove_end_copy_emit_remainder_encodeBlockAsm8B:
  5229. MOVQ DX, AX
  5230. JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
  5231. memmove_long_emit_remainder_encodeBlockAsm8B:
  5232. LEAQ (AX)(SI*1), DX
  5233. MOVL SI, BX
  5234. // genMemMoveLong
  5235. MOVOU (CX), X0
  5236. MOVOU 16(CX), X1
  5237. MOVOU -32(CX)(BX*1), X2
  5238. MOVOU -16(CX)(BX*1), X3
  5239. MOVQ BX, DI
  5240. SHRQ $0x05, DI
  5241. MOVQ AX, SI
  5242. ANDL $0x0000001f, SI
  5243. MOVQ $0x00000040, R8
  5244. SUBQ SI, R8
  5245. DECQ DI
  5246. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
  5247. LEAQ -32(CX)(R8*1), SI
  5248. LEAQ -32(AX)(R8*1), R9
  5249. emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
  5250. MOVOU (SI), X4
  5251. MOVOU 16(SI), X5
  5252. MOVOA X4, (R9)
  5253. MOVOA X5, 16(R9)
  5254. ADDQ $0x20, R9
  5255. ADDQ $0x20, SI
  5256. ADDQ $0x20, R8
  5257. DECQ DI
  5258. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
  5259. emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
  5260. MOVOU -32(CX)(R8*1), X4
  5261. MOVOU -16(CX)(R8*1), X5
  5262. MOVOA X4, -32(AX)(R8*1)
  5263. MOVOA X5, -16(AX)(R8*1)
  5264. ADDQ $0x20, R8
  5265. CMPQ BX, R8
  5266. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
  5267. MOVOU X0, (AX)
  5268. MOVOU X1, 16(AX)
  5269. MOVOU X2, -32(AX)(BX*1)
  5270. MOVOU X3, -16(AX)(BX*1)
  5271. MOVQ DX, AX
  5272. emit_literal_done_emit_remainder_encodeBlockAsm8B:
  5273. MOVQ dst_base+0(FP), CX
  5274. SUBQ CX, AX
  5275. MOVQ AX, ret+48(FP)
  5276. RET
  5277. // func encodeBetterBlockAsm(dst []byte, src []byte) int
  5278. // Requires: BMI, SSE2
  5279. TEXT ·encodeBetterBlockAsm(SB), $589848-56
  5280. MOVQ dst_base+0(FP), AX
  5281. MOVQ $0x00001200, CX
  5282. LEAQ 24(SP), DX
  5283. PXOR X0, X0
  5284. zero_loop_encodeBetterBlockAsm:
  5285. MOVOU X0, (DX)
  5286. MOVOU X0, 16(DX)
  5287. MOVOU X0, 32(DX)
  5288. MOVOU X0, 48(DX)
  5289. MOVOU X0, 64(DX)
  5290. MOVOU X0, 80(DX)
  5291. MOVOU X0, 96(DX)
  5292. MOVOU X0, 112(DX)
  5293. ADDQ $0x80, DX
  5294. DECQ CX
  5295. JNZ zero_loop_encodeBetterBlockAsm
  5296. MOVL $0x00000000, 12(SP)
  5297. MOVQ src_len+32(FP), CX
  5298. LEAQ -6(CX), DX
  5299. LEAQ -8(CX), BX
  5300. MOVL BX, 8(SP)
  5301. SHRQ $0x05, CX
  5302. SUBL CX, DX
  5303. LEAQ (AX)(DX*1), DX
  5304. MOVQ DX, (SP)
  5305. MOVL $0x00000001, CX
  5306. MOVL $0x00000000, 16(SP)
  5307. MOVQ src_base+24(FP), DX
  5308. search_loop_encodeBetterBlockAsm:
  5309. MOVL CX, BX
  5310. SUBL 12(SP), BX
  5311. SHRL $0x07, BX
  5312. CMPL BX, $0x63
  5313. JBE check_maxskip_ok_encodeBetterBlockAsm
  5314. LEAL 100(CX), BX
  5315. JMP check_maxskip_cont_encodeBetterBlockAsm
  5316. check_maxskip_ok_encodeBetterBlockAsm:
  5317. LEAL 1(CX)(BX*1), BX
  5318. check_maxskip_cont_encodeBetterBlockAsm:
  5319. CMPL BX, 8(SP)
  5320. JAE emit_remainder_encodeBetterBlockAsm
  5321. MOVQ (DX)(CX*1), SI
  5322. MOVL BX, 20(SP)
  5323. MOVQ $0x00cf1bbcdcbfa563, R8
  5324. MOVQ $0x9e3779b1, BX
  5325. MOVQ SI, R9
  5326. MOVQ SI, R10
  5327. SHLQ $0x08, R9
  5328. IMULQ R8, R9
  5329. SHRQ $0x2f, R9
  5330. SHLQ $0x20, R10
  5331. IMULQ BX, R10
  5332. SHRQ $0x32, R10
  5333. MOVL 24(SP)(R9*4), BX
  5334. MOVL 524312(SP)(R10*4), DI
  5335. MOVL CX, 24(SP)(R9*4)
  5336. MOVL CX, 524312(SP)(R10*4)
  5337. MOVQ (DX)(BX*1), R9
  5338. MOVQ (DX)(DI*1), R10
  5339. CMPQ R9, SI
  5340. JEQ candidate_match_encodeBetterBlockAsm
  5341. CMPQ R10, SI
  5342. JNE no_short_found_encodeBetterBlockAsm
  5343. MOVL DI, BX
  5344. JMP candidate_match_encodeBetterBlockAsm
  5345. no_short_found_encodeBetterBlockAsm:
  5346. CMPL R9, SI
  5347. JEQ candidate_match_encodeBetterBlockAsm
  5348. CMPL R10, SI
  5349. JEQ candidateS_match_encodeBetterBlockAsm
  5350. MOVL 20(SP), CX
  5351. JMP search_loop_encodeBetterBlockAsm
  5352. candidateS_match_encodeBetterBlockAsm:
  5353. SHRQ $0x08, SI
  5354. MOVQ SI, R9
  5355. SHLQ $0x08, R9
  5356. IMULQ R8, R9
  5357. SHRQ $0x2f, R9
  5358. MOVL 24(SP)(R9*4), BX
  5359. INCL CX
  5360. MOVL CX, 24(SP)(R9*4)
  5361. CMPL (DX)(BX*1), SI
  5362. JEQ candidate_match_encodeBetterBlockAsm
  5363. DECL CX
  5364. MOVL DI, BX
  5365. candidate_match_encodeBetterBlockAsm:
  5366. MOVL 12(SP), SI
  5367. TESTL BX, BX
  5368. JZ match_extend_back_end_encodeBetterBlockAsm
  5369. match_extend_back_loop_encodeBetterBlockAsm:
  5370. CMPL CX, SI
  5371. JBE match_extend_back_end_encodeBetterBlockAsm
  5372. MOVB -1(DX)(BX*1), DI
  5373. MOVB -1(DX)(CX*1), R8
  5374. CMPB DI, R8
  5375. JNE match_extend_back_end_encodeBetterBlockAsm
  5376. LEAL -1(CX), CX
  5377. DECL BX
  5378. JZ match_extend_back_end_encodeBetterBlockAsm
  5379. JMP match_extend_back_loop_encodeBetterBlockAsm
  5380. match_extend_back_end_encodeBetterBlockAsm:
  5381. MOVL CX, SI
  5382. SUBL 12(SP), SI
  5383. LEAQ 5(AX)(SI*1), SI
  5384. CMPQ SI, (SP)
  5385. JB match_dst_size_check_encodeBetterBlockAsm
  5386. MOVQ $0x00000000, ret+48(FP)
  5387. RET
  5388. match_dst_size_check_encodeBetterBlockAsm:
  5389. MOVL CX, SI
  5390. ADDL $0x04, CX
  5391. ADDL $0x04, BX
  5392. MOVQ src_len+32(FP), DI
  5393. SUBL CX, DI
  5394. LEAQ (DX)(CX*1), R8
  5395. LEAQ (DX)(BX*1), R9
  5396. // matchLen
  5397. XORL R11, R11
  5398. matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
  5399. CMPL DI, $0x10
  5400. JB matchlen_match8_match_nolit_encodeBetterBlockAsm
  5401. MOVQ (R8)(R11*1), R10
  5402. MOVQ 8(R8)(R11*1), R12
  5403. XORQ (R9)(R11*1), R10
  5404. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
  5405. XORQ 8(R9)(R11*1), R12
  5406. JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
  5407. LEAL -16(DI), DI
  5408. LEAL 16(R11), R11
  5409. JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
  5410. matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
  5411. #ifdef GOAMD64_v3
  5412. TZCNTQ R12, R12
  5413. #else
  5414. BSFQ R12, R12
  5415. #endif
  5416. SARQ $0x03, R12
  5417. LEAL 8(R11)(R12*1), R11
  5418. JMP match_nolit_end_encodeBetterBlockAsm
  5419. matchlen_match8_match_nolit_encodeBetterBlockAsm:
  5420. CMPL DI, $0x08
  5421. JB matchlen_match4_match_nolit_encodeBetterBlockAsm
  5422. MOVQ (R8)(R11*1), R10
  5423. XORQ (R9)(R11*1), R10
  5424. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
  5425. LEAL -8(DI), DI
  5426. LEAL 8(R11), R11
  5427. JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
  5428. matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
  5429. #ifdef GOAMD64_v3
  5430. TZCNTQ R10, R10
  5431. #else
  5432. BSFQ R10, R10
  5433. #endif
  5434. SARQ $0x03, R10
  5435. LEAL (R11)(R10*1), R11
  5436. JMP match_nolit_end_encodeBetterBlockAsm
  5437. matchlen_match4_match_nolit_encodeBetterBlockAsm:
  5438. CMPL DI, $0x04
  5439. JB matchlen_match2_match_nolit_encodeBetterBlockAsm
  5440. MOVL (R8)(R11*1), R10
  5441. CMPL (R9)(R11*1), R10
  5442. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
  5443. LEAL -4(DI), DI
  5444. LEAL 4(R11), R11
  5445. matchlen_match2_match_nolit_encodeBetterBlockAsm:
  5446. CMPL DI, $0x01
  5447. JE matchlen_match1_match_nolit_encodeBetterBlockAsm
  5448. JB match_nolit_end_encodeBetterBlockAsm
  5449. MOVW (R8)(R11*1), R10
  5450. CMPW (R9)(R11*1), R10
  5451. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
  5452. LEAL 2(R11), R11
  5453. SUBL $0x02, DI
  5454. JZ match_nolit_end_encodeBetterBlockAsm
  5455. matchlen_match1_match_nolit_encodeBetterBlockAsm:
  5456. MOVB (R8)(R11*1), R10
  5457. CMPB (R9)(R11*1), R10
  5458. JNE match_nolit_end_encodeBetterBlockAsm
  5459. LEAL 1(R11), R11
  5460. match_nolit_end_encodeBetterBlockAsm:
  5461. MOVL CX, DI
  5462. SUBL BX, DI
  5463. // Check if repeat
  5464. CMPL 16(SP), DI
  5465. JEQ match_is_repeat_encodeBetterBlockAsm
  5466. CMPL R11, $0x01
  5467. JA match_length_ok_encodeBetterBlockAsm
  5468. CMPL DI, $0x0000ffff
  5469. JBE match_length_ok_encodeBetterBlockAsm
  5470. MOVL 20(SP), CX
  5471. INCL CX
  5472. JMP search_loop_encodeBetterBlockAsm
  5473. match_length_ok_encodeBetterBlockAsm:
  5474. MOVL DI, 16(SP)
  5475. MOVL 12(SP), BX
  5476. CMPL BX, SI
  5477. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
  5478. MOVL SI, R8
  5479. MOVL SI, 12(SP)
  5480. LEAQ (DX)(BX*1), R9
  5481. SUBL BX, R8
  5482. LEAL -1(R8), BX
  5483. CMPL BX, $0x3c
  5484. JB one_byte_match_emit_encodeBetterBlockAsm
  5485. CMPL BX, $0x00000100
  5486. JB two_bytes_match_emit_encodeBetterBlockAsm
  5487. CMPL BX, $0x00010000
  5488. JB three_bytes_match_emit_encodeBetterBlockAsm
  5489. CMPL BX, $0x01000000
  5490. JB four_bytes_match_emit_encodeBetterBlockAsm
  5491. MOVB $0xfc, (AX)
  5492. MOVL BX, 1(AX)
  5493. ADDQ $0x05, AX
  5494. JMP memmove_long_match_emit_encodeBetterBlockAsm
  5495. four_bytes_match_emit_encodeBetterBlockAsm:
  5496. MOVL BX, R10
  5497. SHRL $0x10, R10
  5498. MOVB $0xf8, (AX)
  5499. MOVW BX, 1(AX)
  5500. MOVB R10, 3(AX)
  5501. ADDQ $0x04, AX
  5502. JMP memmove_long_match_emit_encodeBetterBlockAsm
  5503. three_bytes_match_emit_encodeBetterBlockAsm:
  5504. MOVB $0xf4, (AX)
  5505. MOVW BX, 1(AX)
  5506. ADDQ $0x03, AX
  5507. JMP memmove_long_match_emit_encodeBetterBlockAsm
  5508. two_bytes_match_emit_encodeBetterBlockAsm:
  5509. MOVB $0xf0, (AX)
  5510. MOVB BL, 1(AX)
  5511. ADDQ $0x02, AX
  5512. CMPL BX, $0x40
  5513. JB memmove_match_emit_encodeBetterBlockAsm
  5514. JMP memmove_long_match_emit_encodeBetterBlockAsm
  5515. one_byte_match_emit_encodeBetterBlockAsm:
  5516. SHLB $0x02, BL
  5517. MOVB BL, (AX)
  5518. ADDQ $0x01, AX
  5519. memmove_match_emit_encodeBetterBlockAsm:
  5520. LEAQ (AX)(R8*1), BX
  5521. // genMemMoveShort
  5522. CMPQ R8, $0x04
  5523. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
  5524. CMPQ R8, $0x08
  5525. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
  5526. CMPQ R8, $0x10
  5527. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
  5528. CMPQ R8, $0x20
  5529. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
  5530. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
  5531. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
  5532. MOVL (R9), R10
  5533. MOVL R10, (AX)
  5534. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5535. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
  5536. MOVL (R9), R10
  5537. MOVL -4(R9)(R8*1), R9
  5538. MOVL R10, (AX)
  5539. MOVL R9, -4(AX)(R8*1)
  5540. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5541. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
  5542. MOVQ (R9), R10
  5543. MOVQ -8(R9)(R8*1), R9
  5544. MOVQ R10, (AX)
  5545. MOVQ R9, -8(AX)(R8*1)
  5546. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5547. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
  5548. MOVOU (R9), X0
  5549. MOVOU -16(R9)(R8*1), X1
  5550. MOVOU X0, (AX)
  5551. MOVOU X1, -16(AX)(R8*1)
  5552. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5553. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
  5554. MOVOU (R9), X0
  5555. MOVOU 16(R9), X1
  5556. MOVOU -32(R9)(R8*1), X2
  5557. MOVOU -16(R9)(R8*1), X3
  5558. MOVOU X0, (AX)
  5559. MOVOU X1, 16(AX)
  5560. MOVOU X2, -32(AX)(R8*1)
  5561. MOVOU X3, -16(AX)(R8*1)
  5562. memmove_end_copy_match_emit_encodeBetterBlockAsm:
  5563. MOVQ BX, AX
  5564. JMP emit_literal_done_match_emit_encodeBetterBlockAsm
  5565. memmove_long_match_emit_encodeBetterBlockAsm:
  5566. LEAQ (AX)(R8*1), BX
  5567. // genMemMoveLong
  5568. MOVOU (R9), X0
  5569. MOVOU 16(R9), X1
  5570. MOVOU -32(R9)(R8*1), X2
  5571. MOVOU -16(R9)(R8*1), X3
  5572. MOVQ R8, R12
  5573. SHRQ $0x05, R12
  5574. MOVQ AX, R10
  5575. ANDL $0x0000001f, R10
  5576. MOVQ $0x00000040, R13
  5577. SUBQ R10, R13
  5578. DECQ R12
  5579. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5580. LEAQ -32(R9)(R13*1), R10
  5581. LEAQ -32(AX)(R13*1), R14
  5582. emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
  5583. MOVOU (R10), X4
  5584. MOVOU 16(R10), X5
  5585. MOVOA X4, (R14)
  5586. MOVOA X5, 16(R14)
  5587. ADDQ $0x20, R14
  5588. ADDQ $0x20, R10
  5589. ADDQ $0x20, R13
  5590. DECQ R12
  5591. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
  5592. emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  5593. MOVOU -32(R9)(R13*1), X4
  5594. MOVOU -16(R9)(R13*1), X5
  5595. MOVOA X4, -32(AX)(R13*1)
  5596. MOVOA X5, -16(AX)(R13*1)
  5597. ADDQ $0x20, R13
  5598. CMPQ R8, R13
  5599. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5600. MOVOU X0, (AX)
  5601. MOVOU X1, 16(AX)
  5602. MOVOU X2, -32(AX)(R8*1)
  5603. MOVOU X3, -16(AX)(R8*1)
  5604. MOVQ BX, AX
  5605. emit_literal_done_match_emit_encodeBetterBlockAsm:
  5606. ADDL R11, CX
  5607. ADDL $0x04, R11
  5608. MOVL CX, 12(SP)
  5609. // emitCopy
  5610. CMPL DI, $0x00010000
  5611. JB two_byte_offset_match_nolit_encodeBetterBlockAsm
  5612. CMPL R11, $0x40
  5613. JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm
  5614. MOVB $0xff, (AX)
  5615. MOVL DI, 1(AX)
  5616. LEAL -64(R11), R11
  5617. ADDQ $0x05, AX
  5618. CMPL R11, $0x04
  5619. JB four_bytes_remain_match_nolit_encodeBetterBlockAsm
  5620. // emitRepeat
  5621. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
  5622. MOVL R11, BX
  5623. LEAL -4(R11), R11
  5624. CMPL BX, $0x08
  5625. JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
  5626. CMPL BX, $0x0c
  5627. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
  5628. CMPL DI, $0x00000800
  5629. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
  5630. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
  5631. CMPL R11, $0x00000104
  5632. JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
  5633. CMPL R11, $0x00010100
  5634. JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
  5635. CMPL R11, $0x0100ffff
  5636. JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
  5637. LEAL -16842747(R11), R11
  5638. MOVL $0xfffb001d, (AX)
  5639. MOVB $0xff, 4(AX)
  5640. ADDQ $0x05, AX
  5641. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
  5642. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
  5643. LEAL -65536(R11), R11
  5644. MOVL R11, DI
  5645. MOVW $0x001d, (AX)
  5646. MOVW R11, 2(AX)
  5647. SARL $0x10, DI
  5648. MOVB DI, 4(AX)
  5649. ADDQ $0x05, AX
  5650. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5651. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
  5652. LEAL -256(R11), R11
  5653. MOVW $0x0019, (AX)
  5654. MOVW R11, 2(AX)
  5655. ADDQ $0x04, AX
  5656. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5657. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
  5658. LEAL -4(R11), R11
  5659. MOVW $0x0015, (AX)
  5660. MOVB R11, 2(AX)
  5661. ADDQ $0x03, AX
  5662. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5663. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
  5664. SHLL $0x02, R11
  5665. ORL $0x01, R11
  5666. MOVW R11, (AX)
  5667. ADDQ $0x02, AX
  5668. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5669. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
  5670. XORQ BX, BX
  5671. LEAL 1(BX)(R11*4), R11
  5672. MOVB DI, 1(AX)
  5673. SARL $0x08, DI
  5674. SHLL $0x05, DI
  5675. ORL DI, R11
  5676. MOVB R11, (AX)
  5677. ADDQ $0x02, AX
  5678. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5679. four_bytes_remain_match_nolit_encodeBetterBlockAsm:
  5680. TESTL R11, R11
  5681. JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
  5682. XORL BX, BX
  5683. LEAL -1(BX)(R11*4), R11
  5684. MOVB R11, (AX)
  5685. MOVL DI, 1(AX)
  5686. ADDQ $0x05, AX
  5687. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5688. two_byte_offset_match_nolit_encodeBetterBlockAsm:
  5689. CMPL R11, $0x40
  5690. JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
  5691. CMPL DI, $0x00000800
  5692. JAE long_offset_short_match_nolit_encodeBetterBlockAsm
  5693. MOVL $0x00000001, BX
  5694. LEAL 16(BX), BX
  5695. MOVB DI, 1(AX)
  5696. MOVL DI, R8
  5697. SHRL $0x08, R8
  5698. SHLL $0x05, R8
  5699. ORL R8, BX
  5700. MOVB BL, (AX)
  5701. ADDQ $0x02, AX
  5702. SUBL $0x08, R11
  5703. // emitRepeat
  5704. LEAL -4(R11), R11
  5705. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5706. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5707. MOVL R11, BX
  5708. LEAL -4(R11), R11
  5709. CMPL BX, $0x08
  5710. JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5711. CMPL BX, $0x0c
  5712. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5713. CMPL DI, $0x00000800
  5714. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5715. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5716. CMPL R11, $0x00000104
  5717. JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5718. CMPL R11, $0x00010100
  5719. JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5720. CMPL R11, $0x0100ffff
  5721. JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5722. LEAL -16842747(R11), R11
  5723. MOVL $0xfffb001d, (AX)
  5724. MOVB $0xff, 4(AX)
  5725. ADDQ $0x05, AX
  5726. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5727. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5728. LEAL -65536(R11), R11
  5729. MOVL R11, DI
  5730. MOVW $0x001d, (AX)
  5731. MOVW R11, 2(AX)
  5732. SARL $0x10, DI
  5733. MOVB DI, 4(AX)
  5734. ADDQ $0x05, AX
  5735. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5736. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5737. LEAL -256(R11), R11
  5738. MOVW $0x0019, (AX)
  5739. MOVW R11, 2(AX)
  5740. ADDQ $0x04, AX
  5741. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5742. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5743. LEAL -4(R11), R11
  5744. MOVW $0x0015, (AX)
  5745. MOVB R11, 2(AX)
  5746. ADDQ $0x03, AX
  5747. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5748. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5749. SHLL $0x02, R11
  5750. ORL $0x01, R11
  5751. MOVW R11, (AX)
  5752. ADDQ $0x02, AX
  5753. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5754. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5755. XORQ BX, BX
  5756. LEAL 1(BX)(R11*4), R11
  5757. MOVB DI, 1(AX)
  5758. SARL $0x08, DI
  5759. SHLL $0x05, DI
  5760. ORL DI, R11
  5761. MOVB R11, (AX)
  5762. ADDQ $0x02, AX
  5763. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5764. long_offset_short_match_nolit_encodeBetterBlockAsm:
  5765. MOVB $0xee, (AX)
  5766. MOVW DI, 1(AX)
  5767. LEAL -60(R11), R11
  5768. ADDQ $0x03, AX
  5769. // emitRepeat
  5770. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5771. MOVL R11, BX
  5772. LEAL -4(R11), R11
  5773. CMPL BX, $0x08
  5774. JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5775. CMPL BX, $0x0c
  5776. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5777. CMPL DI, $0x00000800
  5778. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5779. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5780. CMPL R11, $0x00000104
  5781. JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5782. CMPL R11, $0x00010100
  5783. JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5784. CMPL R11, $0x0100ffff
  5785. JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5786. LEAL -16842747(R11), R11
  5787. MOVL $0xfffb001d, (AX)
  5788. MOVB $0xff, 4(AX)
  5789. ADDQ $0x05, AX
  5790. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5791. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5792. LEAL -65536(R11), R11
  5793. MOVL R11, DI
  5794. MOVW $0x001d, (AX)
  5795. MOVW R11, 2(AX)
  5796. SARL $0x10, DI
  5797. MOVB DI, 4(AX)
  5798. ADDQ $0x05, AX
  5799. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5800. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5801. LEAL -256(R11), R11
  5802. MOVW $0x0019, (AX)
  5803. MOVW R11, 2(AX)
  5804. ADDQ $0x04, AX
  5805. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5806. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5807. LEAL -4(R11), R11
  5808. MOVW $0x0015, (AX)
  5809. MOVB R11, 2(AX)
  5810. ADDQ $0x03, AX
  5811. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5812. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5813. SHLL $0x02, R11
  5814. ORL $0x01, R11
  5815. MOVW R11, (AX)
  5816. ADDQ $0x02, AX
  5817. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5818. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5819. XORQ BX, BX
  5820. LEAL 1(BX)(R11*4), R11
  5821. MOVB DI, 1(AX)
  5822. SARL $0x08, DI
  5823. SHLL $0x05, DI
  5824. ORL DI, R11
  5825. MOVB R11, (AX)
  5826. ADDQ $0x02, AX
  5827. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5828. two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
  5829. MOVL R11, BX
  5830. SHLL $0x02, BX
  5831. CMPL R11, $0x0c
  5832. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
  5833. CMPL DI, $0x00000800
  5834. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
  5835. LEAL -15(BX), BX
  5836. MOVB DI, 1(AX)
  5837. SHRL $0x08, DI
  5838. SHLL $0x05, DI
  5839. ORL DI, BX
  5840. MOVB BL, (AX)
  5841. ADDQ $0x02, AX
  5842. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5843. emit_copy_three_match_nolit_encodeBetterBlockAsm:
  5844. LEAL -2(BX), BX
  5845. MOVB BL, (AX)
  5846. MOVW DI, 1(AX)
  5847. ADDQ $0x03, AX
  5848. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5849. match_is_repeat_encodeBetterBlockAsm:
  5850. MOVL 12(SP), BX
  5851. CMPL BX, SI
  5852. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
  5853. MOVL SI, R8
  5854. MOVL SI, 12(SP)
  5855. LEAQ (DX)(BX*1), R9
  5856. SUBL BX, R8
  5857. LEAL -1(R8), BX
  5858. CMPL BX, $0x3c
  5859. JB one_byte_match_emit_repeat_encodeBetterBlockAsm
  5860. CMPL BX, $0x00000100
  5861. JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
  5862. CMPL BX, $0x00010000
  5863. JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
  5864. CMPL BX, $0x01000000
  5865. JB four_bytes_match_emit_repeat_encodeBetterBlockAsm
  5866. MOVB $0xfc, (AX)
  5867. MOVL BX, 1(AX)
  5868. ADDQ $0x05, AX
  5869. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5870. four_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5871. MOVL BX, R10
  5872. SHRL $0x10, R10
  5873. MOVB $0xf8, (AX)
  5874. MOVW BX, 1(AX)
  5875. MOVB R10, 3(AX)
  5876. ADDQ $0x04, AX
  5877. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5878. three_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5879. MOVB $0xf4, (AX)
  5880. MOVW BX, 1(AX)
  5881. ADDQ $0x03, AX
  5882. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5883. two_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5884. MOVB $0xf0, (AX)
  5885. MOVB BL, 1(AX)
  5886. ADDQ $0x02, AX
  5887. CMPL BX, $0x40
  5888. JB memmove_match_emit_repeat_encodeBetterBlockAsm
  5889. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5890. one_byte_match_emit_repeat_encodeBetterBlockAsm:
  5891. SHLB $0x02, BL
  5892. MOVB BL, (AX)
  5893. ADDQ $0x01, AX
  5894. memmove_match_emit_repeat_encodeBetterBlockAsm:
  5895. LEAQ (AX)(R8*1), BX
  5896. // genMemMoveShort
  5897. CMPQ R8, $0x04
  5898. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
  5899. CMPQ R8, $0x08
  5900. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
  5901. CMPQ R8, $0x10
  5902. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
  5903. CMPQ R8, $0x20
  5904. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
  5905. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
  5906. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
  5907. MOVL (R9), R10
  5908. MOVL R10, (AX)
  5909. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5910. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
  5911. MOVL (R9), R10
  5912. MOVL -4(R9)(R8*1), R9
  5913. MOVL R10, (AX)
  5914. MOVL R9, -4(AX)(R8*1)
  5915. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5916. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
  5917. MOVQ (R9), R10
  5918. MOVQ -8(R9)(R8*1), R9
  5919. MOVQ R10, (AX)
  5920. MOVQ R9, -8(AX)(R8*1)
  5921. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5922. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
  5923. MOVOU (R9), X0
  5924. MOVOU -16(R9)(R8*1), X1
  5925. MOVOU X0, (AX)
  5926. MOVOU X1, -16(AX)(R8*1)
  5927. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5928. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
  5929. MOVOU (R9), X0
  5930. MOVOU 16(R9), X1
  5931. MOVOU -32(R9)(R8*1), X2
  5932. MOVOU -16(R9)(R8*1), X3
  5933. MOVOU X0, (AX)
  5934. MOVOU X1, 16(AX)
  5935. MOVOU X2, -32(AX)(R8*1)
  5936. MOVOU X3, -16(AX)(R8*1)
  5937. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
  5938. MOVQ BX, AX
  5939. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
  5940. memmove_long_match_emit_repeat_encodeBetterBlockAsm:
  5941. LEAQ (AX)(R8*1), BX
  5942. // genMemMoveLong
  5943. MOVOU (R9), X0
  5944. MOVOU 16(R9), X1
  5945. MOVOU -32(R9)(R8*1), X2
  5946. MOVOU -16(R9)(R8*1), X3
  5947. MOVQ R8, R12
  5948. SHRQ $0x05, R12
  5949. MOVQ AX, R10
  5950. ANDL $0x0000001f, R10
  5951. MOVQ $0x00000040, R13
  5952. SUBQ R10, R13
  5953. DECQ R12
  5954. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5955. LEAQ -32(R9)(R13*1), R10
  5956. LEAQ -32(AX)(R13*1), R14
  5957. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
  5958. MOVOU (R10), X4
  5959. MOVOU 16(R10), X5
  5960. MOVOA X4, (R14)
  5961. MOVOA X5, 16(R14)
  5962. ADDQ $0x20, R14
  5963. ADDQ $0x20, R10
  5964. ADDQ $0x20, R13
  5965. DECQ R12
  5966. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
  5967. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  5968. MOVOU -32(R9)(R13*1), X4
  5969. MOVOU -16(R9)(R13*1), X5
  5970. MOVOA X4, -32(AX)(R13*1)
  5971. MOVOA X5, -16(AX)(R13*1)
  5972. ADDQ $0x20, R13
  5973. CMPQ R8, R13
  5974. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5975. MOVOU X0, (AX)
  5976. MOVOU X1, 16(AX)
  5977. MOVOU X2, -32(AX)(R8*1)
  5978. MOVOU X3, -16(AX)(R8*1)
  5979. MOVQ BX, AX
  5980. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
  5981. ADDL R11, CX
  5982. ADDL $0x04, R11
  5983. MOVL CX, 12(SP)
  5984. // emitRepeat
  // Writes a repeat code for the length held in R11 at AX and advances AX past
  // the bytes written. BX is scratch. DI is read as the offset by the
  // two-byte-with-offset form and is overwritten by the five-byte form.
  // Every form except the overflow case leaves via
  // match_nolit_emitcopy_end_encodeBetterBlockAsm (the last one by fallthrough).
  // NOTE(review): R11 and DI are set before this region; their exact meaning
  // (match length / match offset) is taken from how they are used here.
  5985. emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
  5986. MOVL R11, BX // BX = length as passed in
  5987. LEAL -4(R11), R11 // R11 = length - 4 (LEAL does not touch flags)
  5988. CMPL BX, $0x08
  5989. JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm // length <= 8: two-byte form
  5990. CMPL BX, $0x0c
  5991. JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm // length >= 12: skip the offset form
  5992. CMPL DI, $0x00000800
  5993. JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm // length 9..11 and DI < 2048: two-byte form carrying DI
  5994. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
  // From here on the comparisons use R11 (= length - 4), not BX.
  5995. CMPL R11, $0x00000104
  5996. JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm // three-byte form
  5997. CMPL R11, $0x00010100
  5998. JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm // four-byte form
  5999. CMPL R11, $0x0100ffff
  6000. JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm // five-byte form
  // Too long for a single code: write the five bytes 1d 00 fb ff ff, take
  // 16842747 (0x0100fffb) off R11 and restart with the reduced value as length.
  6001. LEAL -16842747(R11), R11
  6002. MOVL $0xfffb001d, (AX) // little-endian store: 1d 00 fb ff
  6003. MOVB $0xff, 4(AX)
  6004. ADDQ $0x05, AX
  6005. JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
  6006. repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
  // Five bytes: 1d 00, then the low 24 bits of (R11 - 65536), low byte first.
  6007. LEAL -65536(R11), R11
  6008. MOVL R11, DI // DI is reused as scratch; the offset is gone after this
  6009. MOVW $0x001d, (AX)
  6010. MOVW R11, 2(AX) // bits 0..15
  6011. SARL $0x10, DI
  6012. MOVB DI, 4(AX) // bits 16..23
  6013. ADDQ $0x05, AX
  6014. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  6015. repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
  // Four bytes: 19 00, then the low 16 bits of (R11 - 256).
  6016. LEAL -256(R11), R11
  6017. MOVW $0x0019, (AX)
  6018. MOVW R11, 2(AX)
  6019. ADDQ $0x04, AX
  6020. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  6021. repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
  // Three bytes: 15 00, then the low byte of (R11 - 4).
  6022. LEAL -4(R11), R11
  6023. MOVW $0x0015, (AX)
  6024. MOVB R11, 2(AX)
  6025. ADDQ $0x03, AX
  6026. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  6027. repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
  // Two bytes: the 16-bit value (R11 << 2) | 1. For lengths 4..8 that value is
  // at most 17, so the second byte stored is zero.
  6028. SHLL $0x02, R11
  6029. ORL $0x01, R11
  6030. MOVW R11, (AX)
  6031. ADDQ $0x02, AX
  6032. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  6033. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
  // Two bytes carrying the offset: byte 0 = ((DI >> 8) << 5) | (R11 << 2) | 1,
  // byte 1 = low byte of DI.
  6034. XORQ BX, BX // BX = 0 so the LEAL below computes 1 + R11*4
  6035. LEAL 1(BX)(R11*4), R11
  6036. MOVB DI, 1(AX)
  6037. SARL $0x08, DI
  6038. SHLL $0x05, DI
  6039. ORL DI, R11
  6040. MOVB R11, (AX)
  6041. ADDQ $0x02, AX
  6042. match_nolit_emitcopy_end_encodeBetterBlockAsm:
  // Common exit of the copy/repeat emitters: check the src and dst limits,
  // then record positions around the match just emitted in both hash tables
  // before going back to the search loop.
  // NOTE(review): (SP) and 8(SP) are written by this function's prologue,
  // which is outside this chunk; they are compared here as a dst pointer
  // limit and a src position limit respectively.
  6043. CMPL CX, 8(SP)
  6044. JAE emit_remainder_encodeBetterBlockAsm // src position CX reached the limit: flush the tail
  6045. CMPQ AX, (SP)
  6046. JB match_nolit_dst_ok_encodeBetterBlockAsm
  6047. MOVQ $0x00000000, ret+48(FP) // output pointer at or past the limit: return 0
  6048. RET
  6049. match_nolit_dst_ok_encodeBetterBlockAsm:
  // Two tables are updated. The one at 24(SP) is indexed by the top 17 bits
  // (SHRQ $0x2f) of a hash over 7 bytes (SHLQ $0x08 discards the eighth). The
  // one at 524312(SP) is indexed by the top 14 bits (SHRQ $0x32) of a hash
  // over 4 bytes (SHLQ $0x20 discards the upper four). 524312 - 24 = 4 << 17,
  // i.e. the second table starts right after the first.
  6050. MOVQ $0x00cf1bbcdcbfa563, BX // multiplier for the 7-byte hash
  6051. MOVQ $0x9e3779b1, DI // multiplier for the 4-byte hash
  6052. LEAQ 1(SI), SI // SI = SI + 1 (presumably one past the match start; SI is set outside this region)
  6053. LEAQ -2(CX), R8 // R8 = CX - 2
  6054. MOVQ (DX)(SI*1), R9 // 8 bytes at position SI
  6055. MOVQ 1(DX)(SI*1), R10 // 8 bytes at position SI + 1
  6056. MOVQ (DX)(R8*1), R11 // 8 bytes at position R8
  6057. MOVQ 1(DX)(R8*1), R12 // 8 bytes at position R8 + 1
  6058. SHLQ $0x08, R9
  6059. IMULQ BX, R9
  6060. SHRQ $0x2f, R9 // R9 = 7-byte hash at SI
  6061. SHLQ $0x20, R10
  6062. IMULQ DI, R10
  6063. SHRQ $0x32, R10 // R10 = 4-byte hash at SI + 1
  6064. SHLQ $0x08, R11
  6065. IMULQ BX, R11
  6066. SHRQ $0x2f, R11 // R11 = 7-byte hash at R8
  6067. SHLQ $0x20, R12
  6068. IMULQ DI, R12
  6069. SHRQ $0x32, R12 // R12 = 4-byte hash at R8 + 1
  6070. LEAQ 1(SI), DI // DI = SI + 1 (the multiplier in DI is no longer needed)
  6071. LEAQ 1(R8), R13 // R13 = R8 + 1
  6072. MOVL SI, 24(SP)(R9*4)
  6073. MOVL R8, 24(SP)(R11*4)
  6074. MOVL DI, 524312(SP)(R10*4)
  6075. MOVL R13, 524312(SP)(R12*4)
  // Index the span in between from two cursors: SI walks up from the front,
  // DI walks up from the midpoint; both advance by 2 per iteration.
  6076. LEAQ 1(R8)(SI*1), DI
  6077. SHRQ $0x01, DI // DI = (SI + R8 + 1) / 2
  6078. ADDQ $0x01, SI
  6079. SUBQ $0x01, R8
  6080. index_loop_encodeBetterBlockAsm:
  6081. CMPQ DI, R8
  6082. JAE search_loop_encodeBetterBlockAsm // upper cursor reached R8: done indexing
  6083. MOVQ (DX)(SI*1), R9
  6084. MOVQ (DX)(DI*1), R10
  6085. SHLQ $0x08, R9
  6086. IMULQ BX, R9
  6087. SHRQ $0x2f, R9
  6088. SHLQ $0x08, R10
  6089. IMULQ BX, R10
  6090. SHRQ $0x2f, R10
  6091. MOVL SI, 24(SP)(R9*4) // only the 7-byte-hash table is updated in this loop
  6092. MOVL DI, 24(SP)(R10*4)
  6093. ADDQ $0x02, SI
  6094. ADDQ $0x02, DI
  6095. JMP index_loop_encodeBetterBlockAsm
  6096. emit_remainder_encodeBetterBlockAsm:
  // Final stage of encodeBetterBlockAsm: emit whatever lies between the
  // position stored in 12(SP) and src_len as one literal run, then return the
  // number of bytes written (AX - dst_base). Returns 0 instead if 5 header
  // bytes plus the remaining input would reach the dst limit in (SP).
  // NOTE(review): (SP), 12(SP), AX and DX are initialised before this chunk;
  // DX is used as the src base pointer and AX as the dst write pointer here.
  6097. MOVQ src_len+32(FP), CX
  6098. SUBL 12(SP), CX // CX = bytes still to emit
  6099. LEAQ 5(AX)(CX*1), CX // worst case end of output: AX + 5 + remaining
  6100. CMPQ CX, (SP)
  6101. JB emit_remainder_ok_encodeBetterBlockAsm
  6102. MOVQ $0x00000000, ret+48(FP)
  6103. RET
  6104. emit_remainder_ok_encodeBetterBlockAsm:
  6105. MOVQ src_len+32(FP), CX
  6106. MOVL 12(SP), BX
  6107. CMPL BX, CX
  6108. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm // nothing pending
  6109. MOVL CX, SI
  6110. MOVL CX, 12(SP)
  6111. LEAQ (DX)(BX*1), CX // CX = pointer to the first pending src byte
  6112. SUBL BX, SI // SI = n, the literal length
  6113. LEAL -1(SI), DX // DX = n - 1; the src base in DX is dropped from here on
  // Literal header, chosen by the size of n - 1:
  //   < 60      one byte   (n-1) << 2
  //   < 1 << 8  f0 + 1 length byte
  //   < 1 << 16 f4 + 2 length bytes
  //   < 1 << 24 f8 + 3 length bytes
  //   otherwise fc + 4 length bytes
  6114. CMPL DX, $0x3c
  6115. JB one_byte_emit_remainder_encodeBetterBlockAsm
  6116. CMPL DX, $0x00000100
  6117. JB two_bytes_emit_remainder_encodeBetterBlockAsm
  6118. CMPL DX, $0x00010000
  6119. JB three_bytes_emit_remainder_encodeBetterBlockAsm
  6120. CMPL DX, $0x01000000
  6121. JB four_bytes_emit_remainder_encodeBetterBlockAsm
  6122. MOVB $0xfc, (AX)
  6123. MOVL DX, 1(AX)
  6124. ADDQ $0x05, AX
  6125. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  6126. four_bytes_emit_remainder_encodeBetterBlockAsm:
  6127. MOVL DX, BX
  6128. SHRL $0x10, BX // BL = bits 16..23 of n - 1
  6129. MOVB $0xf8, (AX)
  6130. MOVW DX, 1(AX)
  6131. MOVB BL, 3(AX)
  6132. ADDQ $0x04, AX
  6133. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  6134. three_bytes_emit_remainder_encodeBetterBlockAsm:
  6135. MOVB $0xf4, (AX)
  6136. MOVW DX, 1(AX)
  6137. ADDQ $0x03, AX
  6138. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  6139. two_bytes_emit_remainder_encodeBetterBlockAsm:
  6140. MOVB $0xf0, (AX)
  6141. MOVB DL, 1(AX)
  6142. ADDQ $0x02, AX
  6143. CMPL DX, $0x40
  6144. JB memmove_emit_remainder_encodeBetterBlockAsm // n <= 64 still fits the short copy
  6145. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  6146. one_byte_emit_remainder_encodeBetterBlockAsm:
  6147. SHLB $0x02, DL
  6148. MOVB DL, (AX)
  6149. ADDQ $0x01, AX
  6150. memmove_emit_remainder_encodeBetterBlockAsm:
  6151. LEAQ (AX)(SI*1), DX // DX = dst pointer after the copy
  6152. MOVL SI, BX // BX = n
  6153. // genMemMoveShort
  // Copies BX bytes (at most 64 on the paths leading here) from CX to AX.
  // Each size class loads from the start and from the end of the range, so the
  // two halves overlap instead of looping; all loads happen before the stores.
  6154. CMPQ BX, $0x03
  6155. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
  6156. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
  6157. CMPQ BX, $0x08
  6158. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
  6159. CMPQ BX, $0x10
  6160. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
  6161. CMPQ BX, $0x20
  6162. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
  6163. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
  6164. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
  6165. MOVB (CX), SI // first byte
  6166. MOVB -1(CX)(BX*1), CL // last byte (the same byte when BX is 1)
  6167. MOVB SI, (AX)
  6168. MOVB CL, -1(AX)(BX*1)
  6169. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6170. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
  6171. MOVW (CX), SI
  6172. MOVB 2(CX), CL
  6173. MOVW SI, (AX)
  6174. MOVB CL, 2(AX)
  6175. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6176. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
  6177. MOVL (CX), SI
  6178. MOVL -4(CX)(BX*1), CX
  6179. MOVL SI, (AX)
  6180. MOVL CX, -4(AX)(BX*1)
  6181. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6182. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
  6183. MOVQ (CX), SI
  6184. MOVQ -8(CX)(BX*1), CX
  6185. MOVQ SI, (AX)
  6186. MOVQ CX, -8(AX)(BX*1)
  6187. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6188. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
  6189. MOVOU (CX), X0
  6190. MOVOU -16(CX)(BX*1), X1
  6191. MOVOU X0, (AX)
  6192. MOVOU X1, -16(AX)(BX*1)
  6193. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6194. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
  6195. MOVOU (CX), X0
  6196. MOVOU 16(CX), X1
  6197. MOVOU -32(CX)(BX*1), X2
  6198. MOVOU -16(CX)(BX*1), X3
  6199. MOVOU X0, (AX)
  6200. MOVOU X1, 16(AX)
  6201. MOVOU X2, -32(AX)(BX*1)
  6202. MOVOU X3, -16(AX)(BX*1)
  6203. memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
  6204. MOVQ DX, AX // advance the dst pointer past the copied literal
  6205. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
  6206. memmove_long_emit_remainder_encodeBetterBlockAsm:
  6207. LEAQ (AX)(SI*1), DX // DX = dst pointer after the copy
  6208. MOVL SI, BX // BX = n (more than 64 on the paths leading here)
  6209. // genMemMoveLong
  // The first and last 32 source bytes are loaded into X0..X3 up front and
  // stored last with unaligned moves. The middle is copied 32 bytes at a time
  // with aligned stores (MOVOA), starting at the first 32-byte boundary above AX.
  6210. MOVOU (CX), X0
  6211. MOVOU 16(CX), X1
  6212. MOVOU -32(CX)(BX*1), X2
  6213. MOVOU -16(CX)(BX*1), X3
  6214. MOVQ BX, DI
  6215. SHRQ $0x05, DI // DI = n / 32
  6216. MOVQ AX, SI
  6217. ANDL $0x0000001f, SI // SI = AX mod 32
  6218. MOVQ $0x00000040, R8
  6219. SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX + R8 - 32 is 32-byte aligned
  // DECQ does not write CF, so JA/JNA below combine ZF from DECQ with the CF
  // left by the preceding SUBQ/ADDQ.
  // NOTE(review): confirm the intended trip counts against the generator.
  6220. DECQ DI
  6221. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
  6222. LEAQ -32(CX)(R8*1), SI // running src pointer for the loop below
  6223. LEAQ -32(AX)(R8*1), R9 // running dst pointer (32-byte aligned)
  6224. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
  6225. MOVOU (SI), X4
  6226. MOVOU 16(SI), X5
  6227. MOVOA X4, (R9)
  6228. MOVOA X5, 16(R9)
  6229. ADDQ $0x20, R9
  6230. ADDQ $0x20, SI
  6231. ADDQ $0x20, R8 // keep the offset in step for the loop that follows
  6232. DECQ DI
  6233. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
  6234. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  // Copy bytes [R8-32, R8) and step R8 by 32 while R8 <= n.
  6235. MOVOU -32(CX)(R8*1), X4
  6236. MOVOU -16(CX)(R8*1), X5
  6237. MOVOA X4, -32(AX)(R8*1)
  6238. MOVOA X5, -16(AX)(R8*1)
  6239. ADDQ $0x20, R8
  6240. CMPQ BX, R8
  6241. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
  // Head and tail saved earlier cover the unaligned edges.
  6242. MOVOU X0, (AX)
  6243. MOVOU X1, 16(AX)
  6244. MOVOU X2, -32(AX)(BX*1)
  6245. MOVOU X3, -16(AX)(BX*1)
  6246. MOVQ DX, AX
  6247. emit_literal_done_emit_remainder_encodeBetterBlockAsm:
  6248. MOVQ dst_base+0(FP), CX
  6249. SUBQ CX, AX // AX = number of bytes written to dst
  6250. MOVQ AX, ret+48(FP)
  6251. RET
  6252. // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
  6253. // Requires: BMI, SSE2
  //
  // Prologue. Stack slots as set up or used by the visible code:
  //   0(SP)       dst limit: dst_base + (src_len - 6 - src_len>>5)
  //   8(SP)       src position limit: src_len - 8
  //   12(SP)      src position of the first byte not yet emitted, starts at 0
  //   16(SP)      last match offset (compared under "Check if repeat"), starts at 0
  //   20(SP)      next search position, written by the search loop
  //   24(SP)      hash table of 1<<17 32-bit entries (7-byte hash)
  //   524312(SP)  hash table of 1<<14 32-bit entries (4-byte hash)
  // 24 + 4<<17 + 4<<14 = 589848, the frame size declared below.
  // Registers on leaving the prologue: AX = dst write pointer, CX = current
  // src position (1), DX = src base pointer.
  6254. TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
  6255. MOVQ dst_base+0(FP), AX
  6256. MOVQ $0x00001200, CX // 0x1200 iterations * 128 bytes = 589824 bytes = both tables
  6257. LEAQ 24(SP), DX
  6258. PXOR X0, X0
  6259. zero_loop_encodeBetterBlockAsm4MB:
  // Clear both hash tables, 128 bytes per iteration.
  6260. MOVOU X0, (DX)
  6261. MOVOU X0, 16(DX)
  6262. MOVOU X0, 32(DX)
  6263. MOVOU X0, 48(DX)
  6264. MOVOU X0, 64(DX)
  6265. MOVOU X0, 80(DX)
  6266. MOVOU X0, 96(DX)
  6267. MOVOU X0, 112(DX)
  6268. ADDQ $0x80, DX
  6269. DECQ CX
  6270. JNZ zero_loop_encodeBetterBlockAsm4MB
  6271. MOVL $0x00000000, 12(SP)
  6272. MOVQ src_len+32(FP), CX
  6273. LEAQ -6(CX), DX // DX = src_len - 6
  6274. LEAQ -8(CX), BX
  6275. MOVL BX, 8(SP) // src limit = src_len - 8
  6276. SHRQ $0x05, CX
  6277. SUBL CX, DX // DX = src_len - 6 - src_len/32 (32-bit arithmetic)
  6278. LEAQ (AX)(DX*1), DX
  6279. MOVQ DX, (SP) // dst limit; reaching it makes the encoder return 0
  6280. MOVL $0x00000001, CX // searching starts at src position 1
  6281. MOVL $0x00000000, 16(SP)
  6282. MOVQ src_base+24(FP), DX
  6283. search_loop_encodeBetterBlockAsm4MB:
  // One probe of the match search at src position CX. Computes the next
  // position to try (stored in 20(SP)), hashes the 8 bytes at CX into both
  // tables, and compares the two previous occupants against the current
  // bytes. On success falls into candidate_match with BX = candidate position.
  // Step to the next position: 1 + (CX - 12(SP)) >> 7, capped at 100.
  6284. MOVL CX, BX
  6285. SUBL 12(SP), BX
  6286. SHRL $0x07, BX
  6287. CMPL BX, $0x63
  6288. JBE check_maxskip_ok_encodeBetterBlockAsm4MB
  6289. LEAL 100(CX), BX
  6290. JMP check_maxskip_cont_encodeBetterBlockAsm4MB
  6291. check_maxskip_ok_encodeBetterBlockAsm4MB:
  6292. LEAL 1(CX)(BX*1), BX
  6293. check_maxskip_cont_encodeBetterBlockAsm4MB:
  6294. CMPL BX, 8(SP)
  6295. JAE emit_remainder_encodeBetterBlockAsm4MB // next position would reach the src limit
  6296. MOVQ (DX)(CX*1), SI // SI = 8 bytes at the current position
  6297. MOVL BX, 20(SP)
  6298. MOVQ $0x00cf1bbcdcbfa563, R8 // multiplier for the 7-byte hash
  6299. MOVQ $0x9e3779b1, BX // multiplier for the 4-byte hash
  6300. MOVQ SI, R9
  6301. MOVQ SI, R10
  6302. SHLQ $0x08, R9 // discard the eighth byte
  6303. IMULQ R8, R9
  6304. SHRQ $0x2f, R9 // 17-bit index into the table at 24(SP)
  6305. SHLQ $0x20, R10 // keep the low four bytes
  6306. IMULQ BX, R10
  6307. SHRQ $0x32, R10 // 14-bit index into the table at 524312(SP)
  6308. MOVL 24(SP)(R9*4), BX // BX = candidate from the 7-byte-hash table
  6309. MOVL 524312(SP)(R10*4), DI // DI = candidate from the 4-byte-hash table
  6310. MOVL CX, 24(SP)(R9*4) // both slots now point at the current position
  6311. MOVL CX, 524312(SP)(R10*4)
  6312. MOVQ (DX)(BX*1), R9
  6313. MOVQ (DX)(DI*1), R10
  // Prefer a full 8-byte agreement, first with BX, then with DI.
  6314. CMPQ R9, SI
  6315. JEQ candidate_match_encodeBetterBlockAsm4MB
  6316. CMPQ R10, SI
  6317. JNE no_short_found_encodeBetterBlockAsm4MB
  6318. MOVL DI, BX
  6319. JMP candidate_match_encodeBetterBlockAsm4MB
  6320. no_short_found_encodeBetterBlockAsm4MB:
  // Otherwise accept a 4-byte agreement.
  6321. CMPL R9, SI
  6322. JEQ candidate_match_encodeBetterBlockAsm4MB
  6323. CMPL R10, SI
  6324. JEQ candidateS_match_encodeBetterBlockAsm4MB
  6325. MOVL 20(SP), CX // no match: move on to the precomputed next position
  6326. JMP search_loop_encodeBetterBlockAsm4MB
  6327. candidateS_match_encodeBetterBlockAsm4MB:
  // Only the 4-byte-hash candidate matched. Before using it, look up the
  // 7-byte-hash table for position CX + 1 and take that entry if its first
  // four bytes agree; otherwise restore CX and use DI.
  6328. SHRQ $0x08, SI // SI = bytes starting at CX + 1
  6329. MOVQ SI, R9
  6330. SHLQ $0x08, R9
  6331. IMULQ R8, R9
  6332. SHRQ $0x2f, R9
  6333. MOVL 24(SP)(R9*4), BX
  6334. INCL CX
  6335. MOVL CX, 24(SP)(R9*4)
  6336. CMPL (DX)(BX*1), SI
  6337. JEQ candidate_match_encodeBetterBlockAsm4MB
  6338. DECL CX
  6339. MOVL DI, BX
  6340. candidate_match_encodeBetterBlockAsm4MB:
  // Extend the match backwards: while the byte before the current position CX
  // equals the byte before the candidate BX, step both back by one. Stops at
  // the position stored in 12(SP) or when BX reaches 0. Then checks that the
  // pending literals plus 4 bytes stay below the dst limit in (SP).
  6341. MOVL 12(SP), SI // SI = lower bound for CX
  6342. TESTL BX, BX
  6343. JZ match_extend_back_end_encodeBetterBlockAsm4MB // candidate at position 0: nothing before it
  6344. match_extend_back_loop_encodeBetterBlockAsm4MB:
  6345. CMPL CX, SI
  6346. JBE match_extend_back_end_encodeBetterBlockAsm4MB
  6347. MOVB -1(DX)(BX*1), DI
  6348. MOVB -1(DX)(CX*1), R8
  6349. CMPB DI, R8
  6350. JNE match_extend_back_end_encodeBetterBlockAsm4MB
  6351. LEAL -1(CX), CX // LEAL keeps flags untouched; the DECL below sets ZF for the JZ
  6352. DECL BX
  6353. JZ match_extend_back_end_encodeBetterBlockAsm4MB
  6354. JMP match_extend_back_loop_encodeBetterBlockAsm4MB
  6355. match_extend_back_end_encodeBetterBlockAsm4MB:
  6356. MOVL CX, SI
  6357. SUBL 12(SP), SI // SI = number of literal bytes before the match
  6358. LEAQ 4(AX)(SI*1), SI // SI = AX + literals + 4
  6359. CMPQ SI, (SP)
  6360. JB match_dst_size_check_encodeBetterBlockAsm4MB
  6361. MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
  6362. RET
  6363. match_dst_size_check_encodeBetterBlockAsm4MB:
  // Measure how far the match extends past its first four bytes.
  // In:  CX = current position, BX = candidate position, DX = src base.
  // Out: SI = match start (CX on entry), CX and BX advanced by 4,
  //      R11 = number of further matching bytes.
  // Clobbers DI, R8, R9, R10, R12.
  6364. MOVL CX, SI
  6365. ADDL $0x04, CX
  6366. ADDL $0x04, BX
  6367. MOVQ src_len+32(FP), DI
  6368. SUBL CX, DI // DI = bytes left in src after CX
  6369. LEAQ (DX)(CX*1), R8 // R8 = pointer at the current side
  6370. LEAQ (DX)(BX*1), R9 // R9 = pointer at the candidate side
  6371. // matchLen
  6372. XORL R11, R11
  6373. matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
  // Compare 16 bytes per iteration as two 8-byte words.
  6374. CMPL DI, $0x10
  6375. JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
  6376. MOVQ (R8)(R11*1), R10
  6377. MOVQ 8(R8)(R11*1), R12
  6378. XORQ (R9)(R11*1), R10 // non-zero bits mark differing positions
  6379. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
  6380. XORQ 8(R9)(R11*1), R12
  6381. JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
  6382. LEAL -16(DI), DI
  6383. LEAL 16(R11), R11
  6384. JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB
  6385. matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
  // Mismatch in the second word: index of the lowest set bit / 8 = equal
  // bytes in that word; add the 8 bytes of the first word.
  6386. #ifdef GOAMD64_v3
  6387. TZCNTQ R12, R12
  6388. #else
  6389. BSFQ R12, R12
  6390. #endif
  6391. SARQ $0x03, R12
  6392. LEAL 8(R11)(R12*1), R11
  6393. JMP match_nolit_end_encodeBetterBlockAsm4MB
  6394. matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
  6395. CMPL DI, $0x08
  6396. JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
  6397. MOVQ (R8)(R11*1), R10
  6398. XORQ (R9)(R11*1), R10
  6399. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
  6400. LEAL -8(DI), DI
  6401. LEAL 8(R11), R11
  6402. JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
  6403. matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
  // Mismatch inside the word in R10: add the count of equal low-order bytes.
  6404. #ifdef GOAMD64_v3
  6405. TZCNTQ R10, R10
  6406. #else
  6407. BSFQ R10, R10
  6408. #endif
  6409. SARQ $0x03, R10
  6410. LEAL (R11)(R10*1), R11
  6411. JMP match_nolit_end_encodeBetterBlockAsm4MB
  6412. matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
  6413. CMPL DI, $0x04
  6414. JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
  6415. MOVL (R8)(R11*1), R10
  6416. CMPL (R9)(R11*1), R10
  6417. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB // fall back to the 2- and 1-byte checks
  6418. LEAL -4(DI), DI
  6419. LEAL 4(R11), R11
  6420. matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
  6421. CMPL DI, $0x01
  6422. JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB // exactly one byte left
  6423. JB match_nolit_end_encodeBetterBlockAsm4MB // nothing left
  6424. MOVW (R8)(R11*1), R10
  6425. CMPW (R9)(R11*1), R10
  6426. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
  6427. LEAL 2(R11), R11
  6428. SUBL $0x02, DI
  6429. JZ match_nolit_end_encodeBetterBlockAsm4MB
  6430. matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
  6431. MOVB (R8)(R11*1), R10
  6432. CMPB (R9)(R11*1), R10
  6433. JNE match_nolit_end_encodeBetterBlockAsm4MB
  6434. LEAL 1(R11), R11
  6435. match_nolit_end_encodeBetterBlockAsm4MB:
  // Decide what to do with the measured match. DI = CX - BX is the match
  // offset; R11 is the number of matching bytes beyond the first four.
  6436. MOVL CX, DI
  6437. SUBL BX, DI
  6438. // Check if repeat
  6439. CMPL 16(SP), DI
  6440. JEQ match_is_repeat_encodeBetterBlockAsm4MB // same offset as the one stored in 16(SP)
  // Reject the match only when it is both short (R11 <= 1) and far
  // (offset > 65535); searching then resumes one past the position in 20(SP).
  6441. CMPL R11, $0x01
  6442. JA match_length_ok_encodeBetterBlockAsm4MB
  6443. CMPL DI, $0x0000ffff
  6444. JBE match_length_ok_encodeBetterBlockAsm4MB
  6445. MOVL 20(SP), CX
  6446. INCL CX
  6447. JMP search_loop_encodeBetterBlockAsm4MB
  6448. match_length_ok_encodeBetterBlockAsm4MB:
  6449. MOVL DI, 16(SP)
  6450. MOVL 12(SP), BX
  6451. CMPL BX, SI
  6452. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
  6453. MOVL SI, R8
  6454. MOVL SI, 12(SP)
  6455. LEAQ (DX)(BX*1), R9
  6456. SUBL BX, R8
  6457. LEAL -1(R8), BX
  6458. CMPL BX, $0x3c
  6459. JB one_byte_match_emit_encodeBetterBlockAsm4MB
  6460. CMPL BX, $0x00000100
  6461. JB two_bytes_match_emit_encodeBetterBlockAsm4MB
  6462. CMPL BX, $0x00010000
  6463. JB three_bytes_match_emit_encodeBetterBlockAsm4MB
  6464. MOVL BX, R10
  6465. SHRL $0x10, R10
  6466. MOVB $0xf8, (AX)
  6467. MOVW BX, 1(AX)
  6468. MOVB R10, 3(AX)
  6469. ADDQ $0x04, AX
  6470. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6471. three_bytes_match_emit_encodeBetterBlockAsm4MB:
  6472. MOVB $0xf4, (AX)
  6473. MOVW BX, 1(AX)
  6474. ADDQ $0x03, AX
  6475. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6476. two_bytes_match_emit_encodeBetterBlockAsm4MB:
  6477. MOVB $0xf0, (AX)
  6478. MOVB BL, 1(AX)
  6479. ADDQ $0x02, AX
  6480. CMPL BX, $0x40
  6481. JB memmove_match_emit_encodeBetterBlockAsm4MB
  6482. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6483. one_byte_match_emit_encodeBetterBlockAsm4MB:
  6484. SHLB $0x02, BL
  6485. MOVB BL, (AX)
  6486. ADDQ $0x01, AX
  6487. memmove_match_emit_encodeBetterBlockAsm4MB:
  6488. LEAQ (AX)(R8*1), BX
  6489. // genMemMoveShort
  6490. CMPQ R8, $0x04
  6491. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
  6492. CMPQ R8, $0x08
  6493. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
  6494. CMPQ R8, $0x10
  6495. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
  6496. CMPQ R8, $0x20
  6497. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
  6498. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
  6499. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
  6500. MOVL (R9), R10
  6501. MOVL R10, (AX)
  6502. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6503. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
  6504. MOVL (R9), R10
  6505. MOVL -4(R9)(R8*1), R9
  6506. MOVL R10, (AX)
  6507. MOVL R9, -4(AX)(R8*1)
  6508. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6509. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
  6510. MOVQ (R9), R10
  6511. MOVQ -8(R9)(R8*1), R9
  6512. MOVQ R10, (AX)
  6513. MOVQ R9, -8(AX)(R8*1)
  6514. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6515. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
  6516. MOVOU (R9), X0
  6517. MOVOU -16(R9)(R8*1), X1
  6518. MOVOU X0, (AX)
  6519. MOVOU X1, -16(AX)(R8*1)
  6520. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6521. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
  6522. MOVOU (R9), X0
  6523. MOVOU 16(R9), X1
  6524. MOVOU -32(R9)(R8*1), X2
  6525. MOVOU -16(R9)(R8*1), X3
  6526. MOVOU X0, (AX)
  6527. MOVOU X1, 16(AX)
  6528. MOVOU X2, -32(AX)(R8*1)
  6529. MOVOU X3, -16(AX)(R8*1)
  6530. memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
  6531. MOVQ BX, AX
  6532. JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
  6533. memmove_long_match_emit_encodeBetterBlockAsm4MB:
  6534. LEAQ (AX)(R8*1), BX
  6535. // genMemMoveLong
  6536. MOVOU (R9), X0
  6537. MOVOU 16(R9), X1
  6538. MOVOU -32(R9)(R8*1), X2
  6539. MOVOU -16(R9)(R8*1), X3
  6540. MOVQ R8, R12
  6541. SHRQ $0x05, R12
  6542. MOVQ AX, R10
  6543. ANDL $0x0000001f, R10
  6544. MOVQ $0x00000040, R13
  6545. SUBQ R10, R13
  6546. DECQ R12
  6547. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6548. LEAQ -32(R9)(R13*1), R10
  6549. LEAQ -32(AX)(R13*1), R14
  6550. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
  6551. MOVOU (R10), X4
  6552. MOVOU 16(R10), X5
  6553. MOVOA X4, (R14)
  6554. MOVOA X5, 16(R14)
  6555. ADDQ $0x20, R14
  6556. ADDQ $0x20, R10
  6557. ADDQ $0x20, R13
  6558. DECQ R12
  6559. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
  6560. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  6561. MOVOU -32(R9)(R13*1), X4
  6562. MOVOU -16(R9)(R13*1), X5
  6563. MOVOA X4, -32(AX)(R13*1)
  6564. MOVOA X5, -16(AX)(R13*1)
  6565. ADDQ $0x20, R13
  6566. CMPQ R8, R13
  6567. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6568. MOVOU X0, (AX)
  6569. MOVOU X1, 16(AX)
  6570. MOVOU X2, -32(AX)(R8*1)
  6571. MOVOU X3, -16(AX)(R8*1)
  6572. MOVQ BX, AX
  6573. emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
  6574. ADDL R11, CX
  6575. ADDL $0x04, R11
  6576. MOVL CX, 12(SP)
  6577. // emitCopy
  6578. CMPL DI, $0x00010000
  6579. JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
  6580. CMPL R11, $0x40
  6581. JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
  6582. MOVB $0xff, (AX)
  6583. MOVL DI, 1(AX)
  6584. LEAL -64(R11), R11
  6585. ADDQ $0x05, AX
  6586. CMPL R11, $0x04
  6587. JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
  6588. // emitRepeat
  6589. MOVL R11, BX
  6590. LEAL -4(R11), R11
  6591. CMPL BX, $0x08
  6592. JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6593. CMPL BX, $0x0c
  6594. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6595. CMPL DI, $0x00000800
  6596. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6597. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6598. CMPL R11, $0x00000104
  6599. JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6600. CMPL R11, $0x00010100
  6601. JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6602. LEAL -65536(R11), R11
  6603. MOVL R11, DI
  6604. MOVW $0x001d, (AX)
  6605. MOVW R11, 2(AX)
  6606. SARL $0x10, DI
  6607. MOVB DI, 4(AX)
  6608. ADDQ $0x05, AX
  6609. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6610. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6611. LEAL -256(R11), R11
  6612. MOVW $0x0019, (AX)
  6613. MOVW R11, 2(AX)
  6614. ADDQ $0x04, AX
  6615. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6616. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6617. LEAL -4(R11), R11
  6618. MOVW $0x0015, (AX)
  6619. MOVB R11, 2(AX)
  6620. ADDQ $0x03, AX
  6621. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6622. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6623. SHLL $0x02, R11
  6624. ORL $0x01, R11
  6625. MOVW R11, (AX)
  6626. ADDQ $0x02, AX
  6627. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6628. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6629. XORQ BX, BX
  6630. LEAL 1(BX)(R11*4), R11
  6631. MOVB DI, 1(AX)
  6632. SARL $0x08, DI
  6633. SHLL $0x05, DI
  6634. ORL DI, R11
  6635. MOVB R11, (AX)
  6636. ADDQ $0x02, AX
  6637. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6638. four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
  6639. TESTL R11, R11
  6640. JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6641. XORL BX, BX
  6642. LEAL -1(BX)(R11*4), R11
  6643. MOVB R11, (AX)
  6644. MOVL DI, 1(AX)
  6645. ADDQ $0x05, AX
  6646. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6647. two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
  6648. CMPL R11, $0x40
  6649. JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
  6650. CMPL DI, $0x00000800
  6651. JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
  6652. MOVL $0x00000001, BX
  6653. LEAL 16(BX), BX
  6654. MOVB DI, 1(AX)
  6655. SHRL $0x08, DI
  6656. SHLL $0x05, DI
  6657. ORL DI, BX
  6658. MOVB BL, (AX)
  6659. ADDQ $0x02, AX
  6660. SUBL $0x08, R11
  6661. // emitRepeat
  6662. LEAL -4(R11), R11
  6663. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6664. MOVL R11, BX
  6665. LEAL -4(R11), R11
  6666. CMPL BX, $0x08
  6667. JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6668. CMPL BX, $0x0c
  6669. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6670. CMPL DI, $0x00000800
  6671. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6672. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6673. CMPL R11, $0x00000104
  6674. JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6675. CMPL R11, $0x00010100
  6676. JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6677. LEAL -65536(R11), R11
  6678. MOVL R11, DI
  6679. MOVW $0x001d, (AX)
  6680. MOVW R11, 2(AX)
  6681. SARL $0x10, DI
  6682. MOVB DI, 4(AX)
  6683. ADDQ $0x05, AX
  6684. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6685. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6686. LEAL -256(R11), R11
  6687. MOVW $0x0019, (AX)
  6688. MOVW R11, 2(AX)
  6689. ADDQ $0x04, AX
  6690. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6691. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6692. LEAL -4(R11), R11
  6693. MOVW $0x0015, (AX)
  6694. MOVB R11, 2(AX)
  6695. ADDQ $0x03, AX
  6696. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6697. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6698. SHLL $0x02, R11
  6699. ORL $0x01, R11
  6700. MOVW R11, (AX)
  6701. ADDQ $0x02, AX
  6702. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6703. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6704. XORQ BX, BX
  6705. LEAL 1(BX)(R11*4), R11
  6706. MOVB DI, 1(AX)
  6707. SARL $0x08, DI
  6708. SHLL $0x05, DI
  6709. ORL DI, R11
  6710. MOVB R11, (AX)
  6711. ADDQ $0x02, AX
  6712. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6713. long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
  6714. MOVB $0xee, (AX)
  6715. MOVW DI, 1(AX)
  6716. LEAL -60(R11), R11
  6717. ADDQ $0x03, AX
  6718. // emitRepeat
  6719. MOVL R11, BX
  6720. LEAL -4(R11), R11
  6721. CMPL BX, $0x08
  6722. JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6723. CMPL BX, $0x0c
  6724. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6725. CMPL DI, $0x00000800
  6726. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6727. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6728. CMPL R11, $0x00000104
  6729. JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6730. CMPL R11, $0x00010100
  6731. JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6732. LEAL -65536(R11), R11
  6733. MOVL R11, DI
  6734. MOVW $0x001d, (AX)
  6735. MOVW R11, 2(AX)
  6736. SARL $0x10, DI
  6737. MOVB DI, 4(AX)
  6738. ADDQ $0x05, AX
  6739. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6740. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6741. LEAL -256(R11), R11
  6742. MOVW $0x0019, (AX)
  6743. MOVW R11, 2(AX)
  6744. ADDQ $0x04, AX
  6745. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6746. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6747. LEAL -4(R11), R11
  6748. MOVW $0x0015, (AX)
  6749. MOVB R11, 2(AX)
  6750. ADDQ $0x03, AX
  6751. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6752. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6753. SHLL $0x02, R11
  6754. ORL $0x01, R11
  6755. MOVW R11, (AX)
  6756. ADDQ $0x02, AX
  6757. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6758. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6759. XORQ BX, BX
  6760. LEAL 1(BX)(R11*4), R11
  6761. MOVB DI, 1(AX)
  6762. SARL $0x08, DI
  6763. SHLL $0x05, DI
  6764. ORL DI, R11
  6765. MOVB R11, (AX)
  6766. ADDQ $0x02, AX
  6767. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6768. two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
  6769. MOVL R11, BX
  6770. SHLL $0x02, BX
  6771. CMPL R11, $0x0c
  6772. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
  6773. CMPL DI, $0x00000800
  6774. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
  6775. LEAL -15(BX), BX
  6776. MOVB DI, 1(AX)
  6777. SHRL $0x08, DI
  6778. SHLL $0x05, DI
  6779. ORL DI, BX
  6780. MOVB BL, (AX)
  6781. ADDQ $0x02, AX
  6782. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6783. emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
  6784. LEAL -2(BX), BX
  6785. MOVB BL, (AX)
  6786. MOVW DI, 1(AX)
  6787. ADDQ $0x03, AX
  6788. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6789. match_is_repeat_encodeBetterBlockAsm4MB:
  6790. MOVL 12(SP), BX
  6791. CMPL BX, SI
  6792. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
  6793. MOVL SI, R8
  6794. MOVL SI, 12(SP)
  6795. LEAQ (DX)(BX*1), R9
  6796. SUBL BX, R8
  6797. LEAL -1(R8), BX
  6798. CMPL BX, $0x3c
  6799. JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
  6800. CMPL BX, $0x00000100
  6801. JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
  6802. CMPL BX, $0x00010000
  6803. JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
  6804. MOVL BX, R10
  6805. SHRL $0x10, R10
  6806. MOVB $0xf8, (AX)
  6807. MOVW BX, 1(AX)
  6808. MOVB R10, 3(AX)
  6809. ADDQ $0x04, AX
  6810. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6811. three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
  6812. MOVB $0xf4, (AX)
  6813. MOVW BX, 1(AX)
  6814. ADDQ $0x03, AX
  6815. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6816. two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
  6817. MOVB $0xf0, (AX)
  6818. MOVB BL, 1(AX)
  6819. ADDQ $0x02, AX
  6820. CMPL BX, $0x40
  6821. JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
  6822. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6823. one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
  6824. SHLB $0x02, BL
  6825. MOVB BL, (AX)
  6826. ADDQ $0x01, AX
  6827. memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
  6828. LEAQ (AX)(R8*1), BX
  6829. // genMemMoveShort
  6830. CMPQ R8, $0x04
  6831. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
  6832. CMPQ R8, $0x08
  6833. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
  6834. CMPQ R8, $0x10
  6835. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
  6836. CMPQ R8, $0x20
  6837. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
  6838. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
  6839. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
  6840. MOVL (R9), R10
  6841. MOVL R10, (AX)
  6842. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6843. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
  6844. MOVL (R9), R10
  6845. MOVL -4(R9)(R8*1), R9
  6846. MOVL R10, (AX)
  6847. MOVL R9, -4(AX)(R8*1)
  6848. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6849. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
  6850. MOVQ (R9), R10
  6851. MOVQ -8(R9)(R8*1), R9
  6852. MOVQ R10, (AX)
  6853. MOVQ R9, -8(AX)(R8*1)
  6854. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6855. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
  6856. MOVOU (R9), X0
  6857. MOVOU -16(R9)(R8*1), X1
  6858. MOVOU X0, (AX)
  6859. MOVOU X1, -16(AX)(R8*1)
  6860. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6861. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
  6862. MOVOU (R9), X0
  6863. MOVOU 16(R9), X1
  6864. MOVOU -32(R9)(R8*1), X2
  6865. MOVOU -16(R9)(R8*1), X3
  6866. MOVOU X0, (AX)
  6867. MOVOU X1, 16(AX)
  6868. MOVOU X2, -32(AX)(R8*1)
  6869. MOVOU X3, -16(AX)(R8*1)
  6870. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
  6871. MOVQ BX, AX
  6872. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
  6873. memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
  6874. LEAQ (AX)(R8*1), BX
  6875. // genMemMoveLong
  6876. MOVOU (R9), X0
  6877. MOVOU 16(R9), X1
  6878. MOVOU -32(R9)(R8*1), X2
  6879. MOVOU -16(R9)(R8*1), X3
  6880. MOVQ R8, R12
  6881. SHRQ $0x05, R12
  6882. MOVQ AX, R10
  6883. ANDL $0x0000001f, R10
  6884. MOVQ $0x00000040, R13
  6885. SUBQ R10, R13
  6886. DECQ R12
  6887. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6888. LEAQ -32(R9)(R13*1), R10
  6889. LEAQ -32(AX)(R13*1), R14
  6890. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
  6891. MOVOU (R10), X4
  6892. MOVOU 16(R10), X5
  6893. MOVOA X4, (R14)
  6894. MOVOA X5, 16(R14)
  6895. ADDQ $0x20, R14
  6896. ADDQ $0x20, R10
  6897. ADDQ $0x20, R13
  6898. DECQ R12
  6899. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
  6900. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  6901. MOVOU -32(R9)(R13*1), X4
  6902. MOVOU -16(R9)(R13*1), X5
  6903. MOVOA X4, -32(AX)(R13*1)
  6904. MOVOA X5, -16(AX)(R13*1)
  6905. ADDQ $0x20, R13
  6906. CMPQ R8, R13
  6907. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6908. MOVOU X0, (AX)
  6909. MOVOU X1, 16(AX)
  6910. MOVOU X2, -32(AX)(R8*1)
  6911. MOVOU X3, -16(AX)(R8*1)
  6912. MOVQ BX, AX
  6913. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
  6914. ADDL R11, CX
  6915. ADDL $0x04, R11
  6916. MOVL CX, 12(SP)
  6917. // emitRepeat
  6918. MOVL R11, BX
  6919. LEAL -4(R11), R11
  6920. CMPL BX, $0x08
  6921. JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
  6922. CMPL BX, $0x0c
  6923. JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
  6924. CMPL DI, $0x00000800
  6925. JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
  6926. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6927. CMPL R11, $0x00000104
  6928. JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
  6929. CMPL R11, $0x00010100
  6930. JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
  6931. LEAL -65536(R11), R11
  6932. MOVL R11, DI
  6933. MOVW $0x001d, (AX)
  6934. MOVW R11, 2(AX)
  6935. SARL $0x10, DI
  6936. MOVB DI, 4(AX)
  6937. ADDQ $0x05, AX
  6938. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6939. repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6940. LEAL -256(R11), R11
  6941. MOVW $0x0019, (AX)
  6942. MOVW R11, 2(AX)
  6943. ADDQ $0x04, AX
  6944. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6945. repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6946. LEAL -4(R11), R11
  6947. MOVW $0x0015, (AX)
  6948. MOVB R11, 2(AX)
  6949. ADDQ $0x03, AX
  6950. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6951. repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6952. SHLL $0x02, R11
  6953. ORL $0x01, R11
  6954. MOVW R11, (AX)
  6955. ADDQ $0x02, AX
  6956. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6957. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6958. XORQ BX, BX
  6959. LEAL 1(BX)(R11*4), R11
  6960. MOVB DI, 1(AX)
  6961. SARL $0x08, DI
  6962. SHLL $0x05, DI
  6963. ORL DI, R11
  6964. MOVB R11, (AX)
  6965. ADDQ $0x02, AX
  6966. match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
  6967. CMPL CX, 8(SP)
  6968. JAE emit_remainder_encodeBetterBlockAsm4MB
  6969. CMPQ AX, (SP)
  6970. JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
  6971. MOVQ $0x00000000, ret+48(FP)
  6972. RET
  6973. match_nolit_dst_ok_encodeBetterBlockAsm4MB:
  6974. MOVQ $0x00cf1bbcdcbfa563, BX
  6975. MOVQ $0x9e3779b1, DI
  6976. LEAQ 1(SI), SI
  6977. LEAQ -2(CX), R8
  6978. MOVQ (DX)(SI*1), R9
  6979. MOVQ 1(DX)(SI*1), R10
  6980. MOVQ (DX)(R8*1), R11
  6981. MOVQ 1(DX)(R8*1), R12
  6982. SHLQ $0x08, R9
  6983. IMULQ BX, R9
  6984. SHRQ $0x2f, R9
  6985. SHLQ $0x20, R10
  6986. IMULQ DI, R10
  6987. SHRQ $0x32, R10
  6988. SHLQ $0x08, R11
  6989. IMULQ BX, R11
  6990. SHRQ $0x2f, R11
  6991. SHLQ $0x20, R12
  6992. IMULQ DI, R12
  6993. SHRQ $0x32, R12
  6994. LEAQ 1(SI), DI
  6995. LEAQ 1(R8), R13
  6996. MOVL SI, 24(SP)(R9*4)
  6997. MOVL R8, 24(SP)(R11*4)
  6998. MOVL DI, 524312(SP)(R10*4)
  6999. MOVL R13, 524312(SP)(R12*4)
  7000. LEAQ 1(R8)(SI*1), DI
  7001. SHRQ $0x01, DI
  7002. ADDQ $0x01, SI
  7003. SUBQ $0x01, R8
  7004. index_loop_encodeBetterBlockAsm4MB:
  7005. CMPQ DI, R8
  7006. JAE search_loop_encodeBetterBlockAsm4MB
  7007. MOVQ (DX)(SI*1), R9
  7008. MOVQ (DX)(DI*1), R10
  7009. SHLQ $0x08, R9
  7010. IMULQ BX, R9
  7011. SHRQ $0x2f, R9
  7012. SHLQ $0x08, R10
  7013. IMULQ BX, R10
  7014. SHRQ $0x2f, R10
  7015. MOVL SI, 24(SP)(R9*4)
  7016. MOVL DI, 24(SP)(R10*4)
  7017. ADDQ $0x02, SI
  7018. ADDQ $0x02, DI
  7019. JMP index_loop_encodeBetterBlockAsm4MB
  7020. emit_remainder_encodeBetterBlockAsm4MB:
  7021. MOVQ src_len+32(FP), CX
  7022. SUBL 12(SP), CX
  7023. LEAQ 4(AX)(CX*1), CX
  7024. CMPQ CX, (SP)
  7025. JB emit_remainder_ok_encodeBetterBlockAsm4MB
  7026. MOVQ $0x00000000, ret+48(FP)
  7027. RET
  7028. emit_remainder_ok_encodeBetterBlockAsm4MB:
  7029. MOVQ src_len+32(FP), CX
  7030. MOVL 12(SP), BX
  7031. CMPL BX, CX
  7032. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
  7033. MOVL CX, SI
  7034. MOVL CX, 12(SP)
  7035. LEAQ (DX)(BX*1), CX
  7036. SUBL BX, SI
  7037. LEAL -1(SI), DX
  7038. CMPL DX, $0x3c
  7039. JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
  7040. CMPL DX, $0x00000100
  7041. JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
  7042. CMPL DX, $0x00010000
  7043. JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
  7044. MOVL DX, BX
  7045. SHRL $0x10, BX
  7046. MOVB $0xf8, (AX)
  7047. MOVW DX, 1(AX)
  7048. MOVB BL, 3(AX)
  7049. ADDQ $0x04, AX
  7050. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  7051. three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
  7052. MOVB $0xf4, (AX)
  7053. MOVW DX, 1(AX)
  7054. ADDQ $0x03, AX
  7055. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  7056. two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
  7057. MOVB $0xf0, (AX)
  7058. MOVB DL, 1(AX)
  7059. ADDQ $0x02, AX
  7060. CMPL DX, $0x40
  7061. JB memmove_emit_remainder_encodeBetterBlockAsm4MB
  7062. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  7063. one_byte_emit_remainder_encodeBetterBlockAsm4MB:
  7064. SHLB $0x02, DL
  7065. MOVB DL, (AX)
  7066. ADDQ $0x01, AX
  7067. memmove_emit_remainder_encodeBetterBlockAsm4MB:
  7068. LEAQ (AX)(SI*1), DX
  7069. MOVL SI, BX
  7070. // genMemMoveShort
  7071. CMPQ BX, $0x03
  7072. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
  7073. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
  7074. CMPQ BX, $0x08
  7075. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
  7076. CMPQ BX, $0x10
  7077. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
  7078. CMPQ BX, $0x20
  7079. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
  7080. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
  7081. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
  7082. MOVB (CX), SI
  7083. MOVB -1(CX)(BX*1), CL
  7084. MOVB SI, (AX)
  7085. MOVB CL, -1(AX)(BX*1)
  7086. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  7087. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
  7088. MOVW (CX), SI
  7089. MOVB 2(CX), CL
  7090. MOVW SI, (AX)
  7091. MOVB CL, 2(AX)
  7092. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  7093. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
  7094. MOVL (CX), SI
  7095. MOVL -4(CX)(BX*1), CX
  7096. MOVL SI, (AX)
  7097. MOVL CX, -4(AX)(BX*1)
  7098. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  7099. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
  7100. MOVQ (CX), SI
  7101. MOVQ -8(CX)(BX*1), CX
  7102. MOVQ SI, (AX)
  7103. MOVQ CX, -8(AX)(BX*1)
  7104. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  7105. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
  7106. MOVOU (CX), X0
  7107. MOVOU -16(CX)(BX*1), X1
  7108. MOVOU X0, (AX)
  7109. MOVOU X1, -16(AX)(BX*1)
  7110. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  7111. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
  7112. MOVOU (CX), X0
  7113. MOVOU 16(CX), X1
  7114. MOVOU -32(CX)(BX*1), X2
  7115. MOVOU -16(CX)(BX*1), X3
  7116. MOVOU X0, (AX)
  7117. MOVOU X1, 16(AX)
  7118. MOVOU X2, -32(AX)(BX*1)
  7119. MOVOU X3, -16(AX)(BX*1)
  7120. memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
  7121. MOVQ DX, AX
  7122. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
  7123. memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
  7124. LEAQ (AX)(SI*1), DX
  7125. MOVL SI, BX
  7126. // genMemMoveLong
  7127. MOVOU (CX), X0
  7128. MOVOU 16(CX), X1
  7129. MOVOU -32(CX)(BX*1), X2
  7130. MOVOU -16(CX)(BX*1), X3
  7131. MOVQ BX, DI
  7132. SHRQ $0x05, DI
  7133. MOVQ AX, SI
  7134. ANDL $0x0000001f, SI
  7135. MOVQ $0x00000040, R8
  7136. SUBQ SI, R8
  7137. DECQ DI
  7138. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  7139. LEAQ -32(CX)(R8*1), SI
  7140. LEAQ -32(AX)(R8*1), R9
  7141. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
  7142. MOVOU (SI), X4
  7143. MOVOU 16(SI), X5
  7144. MOVOA X4, (R9)
  7145. MOVOA X5, 16(R9)
  7146. ADDQ $0x20, R9
  7147. ADDQ $0x20, SI
  7148. ADDQ $0x20, R8
  7149. DECQ DI
  7150. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
  7151. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  7152. MOVOU -32(CX)(R8*1), X4
  7153. MOVOU -16(CX)(R8*1), X5
  7154. MOVOA X4, -32(AX)(R8*1)
  7155. MOVOA X5, -16(AX)(R8*1)
  7156. ADDQ $0x20, R8
  7157. CMPQ BX, R8
  7158. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  7159. MOVOU X0, (AX)
  7160. MOVOU X1, 16(AX)
  7161. MOVOU X2, -32(AX)(BX*1)
  7162. MOVOU X3, -16(AX)(BX*1)
  7163. MOVQ DX, AX
  7164. emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
  7165. MOVQ dst_base+0(FP), CX
  7166. SUBQ CX, AX
  7167. MOVQ AX, ret+48(FP)
  7168. RET
  7169. // func encodeBetterBlockAsm12B(dst []byte, src []byte) int
  7170. // Requires: BMI, SSE2
  7171. TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
  7172. MOVQ dst_base+0(FP), AX
  7173. MOVQ $0x00000280, CX
  7174. LEAQ 24(SP), DX
  7175. PXOR X0, X0
  7176. zero_loop_encodeBetterBlockAsm12B:
  7177. MOVOU X0, (DX)
  7178. MOVOU X0, 16(DX)
  7179. MOVOU X0, 32(DX)
  7180. MOVOU X0, 48(DX)
  7181. MOVOU X0, 64(DX)
  7182. MOVOU X0, 80(DX)
  7183. MOVOU X0, 96(DX)
  7184. MOVOU X0, 112(DX)
  7185. ADDQ $0x80, DX
  7186. DECQ CX
  7187. JNZ zero_loop_encodeBetterBlockAsm12B
  7188. MOVL $0x00000000, 12(SP)
  7189. MOVQ src_len+32(FP), CX
  7190. LEAQ -6(CX), DX
  7191. LEAQ -8(CX), BX
  7192. MOVL BX, 8(SP)
  7193. SHRQ $0x05, CX
  7194. SUBL CX, DX
  7195. LEAQ (AX)(DX*1), DX
  7196. MOVQ DX, (SP)
  7197. MOVL $0x00000001, CX
  7198. MOVL $0x00000000, 16(SP)
  7199. MOVQ src_base+24(FP), DX
  7200. search_loop_encodeBetterBlockAsm12B:
  7201. MOVL CX, BX
  7202. SUBL 12(SP), BX
  7203. SHRL $0x06, BX
  7204. LEAL 1(CX)(BX*1), BX
  7205. CMPL BX, 8(SP)
  7206. JAE emit_remainder_encodeBetterBlockAsm12B
  7207. MOVQ (DX)(CX*1), SI
  7208. MOVL BX, 20(SP)
  7209. MOVQ $0x0000cf1bbcdcbf9b, R8
  7210. MOVQ $0x9e3779b1, BX
  7211. MOVQ SI, R9
  7212. MOVQ SI, R10
  7213. SHLQ $0x10, R9
  7214. IMULQ R8, R9
  7215. SHRQ $0x32, R9
  7216. SHLQ $0x20, R10
  7217. IMULQ BX, R10
  7218. SHRQ $0x34, R10
  7219. MOVL 24(SP)(R9*4), BX
  7220. MOVL 65560(SP)(R10*4), DI
  7221. MOVL CX, 24(SP)(R9*4)
  7222. MOVL CX, 65560(SP)(R10*4)
  7223. MOVQ (DX)(BX*1), R9
  7224. MOVQ (DX)(DI*1), R10
  7225. CMPQ R9, SI
  7226. JEQ candidate_match_encodeBetterBlockAsm12B
  7227. CMPQ R10, SI
  7228. JNE no_short_found_encodeBetterBlockAsm12B
  7229. MOVL DI, BX
  7230. JMP candidate_match_encodeBetterBlockAsm12B
  7231. no_short_found_encodeBetterBlockAsm12B:
  7232. CMPL R9, SI
  7233. JEQ candidate_match_encodeBetterBlockAsm12B
  7234. CMPL R10, SI
  7235. JEQ candidateS_match_encodeBetterBlockAsm12B
  7236. MOVL 20(SP), CX
  7237. JMP search_loop_encodeBetterBlockAsm12B
  7238. candidateS_match_encodeBetterBlockAsm12B:
  7239. SHRQ $0x08, SI
  7240. MOVQ SI, R9
  7241. SHLQ $0x10, R9
  7242. IMULQ R8, R9
  7243. SHRQ $0x32, R9
  7244. MOVL 24(SP)(R9*4), BX
  7245. INCL CX
  7246. MOVL CX, 24(SP)(R9*4)
  7247. CMPL (DX)(BX*1), SI
  7248. JEQ candidate_match_encodeBetterBlockAsm12B
  7249. DECL CX
  7250. MOVL DI, BX
  7251. candidate_match_encodeBetterBlockAsm12B:
  7252. MOVL 12(SP), SI
  7253. TESTL BX, BX
  7254. JZ match_extend_back_end_encodeBetterBlockAsm12B
  7255. match_extend_back_loop_encodeBetterBlockAsm12B:
  7256. CMPL CX, SI
  7257. JBE match_extend_back_end_encodeBetterBlockAsm12B
  7258. MOVB -1(DX)(BX*1), DI
  7259. MOVB -1(DX)(CX*1), R8
  7260. CMPB DI, R8
  7261. JNE match_extend_back_end_encodeBetterBlockAsm12B
  7262. LEAL -1(CX), CX
  7263. DECL BX
  7264. JZ match_extend_back_end_encodeBetterBlockAsm12B
  7265. JMP match_extend_back_loop_encodeBetterBlockAsm12B
  7266. match_extend_back_end_encodeBetterBlockAsm12B:
  7267. MOVL CX, SI
  7268. SUBL 12(SP), SI
  7269. LEAQ 3(AX)(SI*1), SI
  7270. CMPQ SI, (SP)
  7271. JB match_dst_size_check_encodeBetterBlockAsm12B
  7272. MOVQ $0x00000000, ret+48(FP)
  7273. RET
  7274. match_dst_size_check_encodeBetterBlockAsm12B:
  7275. MOVL CX, SI
  7276. ADDL $0x04, CX
  7277. ADDL $0x04, BX
  7278. MOVQ src_len+32(FP), DI
  7279. SUBL CX, DI
  7280. LEAQ (DX)(CX*1), R8
  7281. LEAQ (DX)(BX*1), R9
  7282. // matchLen
  7283. XORL R11, R11
  7284. matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
  7285. CMPL DI, $0x10
  7286. JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B
  7287. MOVQ (R8)(R11*1), R10
  7288. MOVQ 8(R8)(R11*1), R12
  7289. XORQ (R9)(R11*1), R10
  7290. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
  7291. XORQ 8(R9)(R11*1), R12
  7292. JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
  7293. LEAL -16(DI), DI
  7294. LEAL 16(R11), R11
  7295. JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B
  7296. matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
  7297. #ifdef GOAMD64_v3
  7298. TZCNTQ R12, R12
  7299. #else
  7300. BSFQ R12, R12
  7301. #endif
  7302. SARQ $0x03, R12
  7303. LEAL 8(R11)(R12*1), R11
  7304. JMP match_nolit_end_encodeBetterBlockAsm12B
  7305. matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
  7306. CMPL DI, $0x08
  7307. JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B
  7308. MOVQ (R8)(R11*1), R10
  7309. XORQ (R9)(R11*1), R10
  7310. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
  7311. LEAL -8(DI), DI
  7312. LEAL 8(R11), R11
  7313. JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B
  7314. matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
  7315. #ifdef GOAMD64_v3
  7316. TZCNTQ R10, R10
  7317. #else
  7318. BSFQ R10, R10
  7319. #endif
  7320. SARQ $0x03, R10
  7321. LEAL (R11)(R10*1), R11
  7322. JMP match_nolit_end_encodeBetterBlockAsm12B
  7323. matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
  7324. CMPL DI, $0x04
  7325. JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B
  7326. MOVL (R8)(R11*1), R10
  7327. CMPL (R9)(R11*1), R10
  7328. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
  7329. LEAL -4(DI), DI
  7330. LEAL 4(R11), R11
  7331. matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
  7332. CMPL DI, $0x01
  7333. JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
  7334. JB match_nolit_end_encodeBetterBlockAsm12B
  7335. MOVW (R8)(R11*1), R10
  7336. CMPW (R9)(R11*1), R10
  7337. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
  7338. LEAL 2(R11), R11
  7339. SUBL $0x02, DI
  7340. JZ match_nolit_end_encodeBetterBlockAsm12B
  7341. matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
  7342. MOVB (R8)(R11*1), R10
  7343. CMPB (R9)(R11*1), R10
  7344. JNE match_nolit_end_encodeBetterBlockAsm12B
  7345. LEAL 1(R11), R11
  7346. match_nolit_end_encodeBetterBlockAsm12B:
  7347. MOVL CX, DI
  7348. SUBL BX, DI
  7349. // Check if repeat
  7350. CMPL 16(SP), DI
  7351. JEQ match_is_repeat_encodeBetterBlockAsm12B
  7352. MOVL DI, 16(SP)
  7353. MOVL 12(SP), BX
  7354. CMPL BX, SI
  7355. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
  7356. MOVL SI, R8
  7357. MOVL SI, 12(SP)
  7358. LEAQ (DX)(BX*1), R9
  7359. SUBL BX, R8
  7360. LEAL -1(R8), BX
  7361. CMPL BX, $0x3c
  7362. JB one_byte_match_emit_encodeBetterBlockAsm12B
  7363. CMPL BX, $0x00000100
  7364. JB two_bytes_match_emit_encodeBetterBlockAsm12B
  7365. JB three_bytes_match_emit_encodeBetterBlockAsm12B
  7366. three_bytes_match_emit_encodeBetterBlockAsm12B:
  7367. MOVB $0xf4, (AX)
  7368. MOVW BX, 1(AX)
  7369. ADDQ $0x03, AX
  7370. JMP memmove_long_match_emit_encodeBetterBlockAsm12B
  7371. two_bytes_match_emit_encodeBetterBlockAsm12B:
  7372. MOVB $0xf0, (AX)
  7373. MOVB BL, 1(AX)
  7374. ADDQ $0x02, AX
  7375. CMPL BX, $0x40
  7376. JB memmove_match_emit_encodeBetterBlockAsm12B
  7377. JMP memmove_long_match_emit_encodeBetterBlockAsm12B
  7378. one_byte_match_emit_encodeBetterBlockAsm12B:
  7379. SHLB $0x02, BL
  7380. MOVB BL, (AX)
  7381. ADDQ $0x01, AX
  7382. memmove_match_emit_encodeBetterBlockAsm12B:
  7383. LEAQ (AX)(R8*1), BX
  7384. // genMemMoveShort
  7385. CMPQ R8, $0x04
  7386. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
  7387. CMPQ R8, $0x08
  7388. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
  7389. CMPQ R8, $0x10
  7390. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
  7391. CMPQ R8, $0x20
  7392. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
  7393. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
  7394. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
  7395. MOVL (R9), R10
  7396. MOVL R10, (AX)
  7397. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7398. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
  7399. MOVL (R9), R10
  7400. MOVL -4(R9)(R8*1), R9
  7401. MOVL R10, (AX)
  7402. MOVL R9, -4(AX)(R8*1)
  7403. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7404. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
  7405. MOVQ (R9), R10
  7406. MOVQ -8(R9)(R8*1), R9
  7407. MOVQ R10, (AX)
  7408. MOVQ R9, -8(AX)(R8*1)
  7409. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7410. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
  7411. MOVOU (R9), X0
  7412. MOVOU -16(R9)(R8*1), X1
  7413. MOVOU X0, (AX)
  7414. MOVOU X1, -16(AX)(R8*1)
  7415. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7416. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
  7417. MOVOU (R9), X0
  7418. MOVOU 16(R9), X1
  7419. MOVOU -32(R9)(R8*1), X2
  7420. MOVOU -16(R9)(R8*1), X3
  7421. MOVOU X0, (AX)
  7422. MOVOU X1, 16(AX)
  7423. MOVOU X2, -32(AX)(R8*1)
  7424. MOVOU X3, -16(AX)(R8*1)
  7425. memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
  7426. MOVQ BX, AX
  7427. JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
  7428. memmove_long_match_emit_encodeBetterBlockAsm12B:
  7429. LEAQ (AX)(R8*1), BX
  7430. // genMemMoveLong
  7431. MOVOU (R9), X0
  7432. MOVOU 16(R9), X1
  7433. MOVOU -32(R9)(R8*1), X2
  7434. MOVOU -16(R9)(R8*1), X3
  7435. MOVQ R8, R12
  7436. SHRQ $0x05, R12
  7437. MOVQ AX, R10
  7438. ANDL $0x0000001f, R10
  7439. MOVQ $0x00000040, R13
  7440. SUBQ R10, R13
  7441. DECQ R12
  7442. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7443. LEAQ -32(R9)(R13*1), R10
  7444. LEAQ -32(AX)(R13*1), R14
  7445. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
  7446. MOVOU (R10), X4
  7447. MOVOU 16(R10), X5
  7448. MOVOA X4, (R14)
  7449. MOVOA X5, 16(R14)
  7450. ADDQ $0x20, R14
  7451. ADDQ $0x20, R10
  7452. ADDQ $0x20, R13
  7453. DECQ R12
  7454. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
  7455. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7456. MOVOU -32(R9)(R13*1), X4
  7457. MOVOU -16(R9)(R13*1), X5
  7458. MOVOA X4, -32(AX)(R13*1)
  7459. MOVOA X5, -16(AX)(R13*1)
  7460. ADDQ $0x20, R13
  7461. CMPQ R8, R13
  7462. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7463. MOVOU X0, (AX)
  7464. MOVOU X1, 16(AX)
  7465. MOVOU X2, -32(AX)(R8*1)
  7466. MOVOU X3, -16(AX)(R8*1)
  7467. MOVQ BX, AX
  7468. emit_literal_done_match_emit_encodeBetterBlockAsm12B:
  7469. ADDL R11, CX
  7470. ADDL $0x04, R11
  7471. MOVL CX, 12(SP)
  7472. // emitCopy
  7473. CMPL R11, $0x40
  7474. JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
  7475. CMPL DI, $0x00000800
  7476. JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
  7477. MOVL $0x00000001, BX
  7478. LEAL 16(BX), BX
  7479. MOVB DI, 1(AX)
  7480. SHRL $0x08, DI
  7481. SHLL $0x05, DI
  7482. ORL DI, BX
  7483. MOVB BL, (AX)
  7484. ADDQ $0x02, AX
  7485. SUBL $0x08, R11
  7486. // emitRepeat
  7487. LEAL -4(R11), R11
  7488. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7489. MOVL R11, BX
  7490. LEAL -4(R11), R11
  7491. CMPL BX, $0x08
  7492. JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7493. CMPL BX, $0x0c
  7494. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7495. CMPL DI, $0x00000800
  7496. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7497. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7498. CMPL R11, $0x00000104
  7499. JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7500. LEAL -256(R11), R11
  7501. MOVW $0x0019, (AX)
  7502. MOVW R11, 2(AX)
  7503. ADDQ $0x04, AX
  7504. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7505. repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7506. LEAL -4(R11), R11
  7507. MOVW $0x0015, (AX)
  7508. MOVB R11, 2(AX)
  7509. ADDQ $0x03, AX
  7510. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7511. repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7512. SHLL $0x02, R11
  7513. ORL $0x01, R11
  7514. MOVW R11, (AX)
  7515. ADDQ $0x02, AX
  7516. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7517. repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7518. XORQ BX, BX
  7519. LEAL 1(BX)(R11*4), R11
  7520. MOVB DI, 1(AX)
  7521. SARL $0x08, DI
  7522. SHLL $0x05, DI
  7523. ORL DI, R11
  7524. MOVB R11, (AX)
  7525. ADDQ $0x02, AX
  7526. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7527. long_offset_short_match_nolit_encodeBetterBlockAsm12B:
  7528. MOVB $0xee, (AX)
  7529. MOVW DI, 1(AX)
  7530. LEAL -60(R11), R11
  7531. ADDQ $0x03, AX
  7532. // emitRepeat
  7533. MOVL R11, BX
  7534. LEAL -4(R11), R11
  7535. CMPL BX, $0x08
  7536. JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7537. CMPL BX, $0x0c
  7538. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7539. CMPL DI, $0x00000800
  7540. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7541. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7542. CMPL R11, $0x00000104
  7543. JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7544. LEAL -256(R11), R11
  7545. MOVW $0x0019, (AX)
  7546. MOVW R11, 2(AX)
  7547. ADDQ $0x04, AX
  7548. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7549. repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7550. LEAL -4(R11), R11
  7551. MOVW $0x0015, (AX)
  7552. MOVB R11, 2(AX)
  7553. ADDQ $0x03, AX
  7554. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7555. repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7556. SHLL $0x02, R11
  7557. ORL $0x01, R11
  7558. MOVW R11, (AX)
  7559. ADDQ $0x02, AX
  7560. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7561. repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7562. XORQ BX, BX
  7563. LEAL 1(BX)(R11*4), R11
  7564. MOVB DI, 1(AX)
  7565. SARL $0x08, DI
  7566. SHLL $0x05, DI
  7567. ORL DI, R11
  7568. MOVB R11, (AX)
  7569. ADDQ $0x02, AX
  7570. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7571. two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
  7572. MOVL R11, BX
  7573. SHLL $0x02, BX
  7574. CMPL R11, $0x0c
  7575. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
  7576. CMPL DI, $0x00000800
  7577. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
  7578. LEAL -15(BX), BX
  7579. MOVB DI, 1(AX)
  7580. SHRL $0x08, DI
  7581. SHLL $0x05, DI
  7582. ORL DI, BX
  7583. MOVB BL, (AX)
  7584. ADDQ $0x02, AX
  7585. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7586. emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
  7587. LEAL -2(BX), BX
  7588. MOVB BL, (AX)
  7589. MOVW DI, 1(AX)
  7590. ADDQ $0x03, AX
  7591. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7592. match_is_repeat_encodeBetterBlockAsm12B:
  7593. MOVL 12(SP), BX
  7594. CMPL BX, SI
  7595. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
  7596. MOVL SI, R8
  7597. MOVL SI, 12(SP)
  7598. LEAQ (DX)(BX*1), R9
  7599. SUBL BX, R8
  7600. LEAL -1(R8), BX
  7601. CMPL BX, $0x3c
  7602. JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B
  7603. CMPL BX, $0x00000100
  7604. JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
  7605. JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B
  7606. three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
  7607. MOVB $0xf4, (AX)
  7608. MOVW BX, 1(AX)
  7609. ADDQ $0x03, AX
  7610. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
  7611. two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
  7612. MOVB $0xf0, (AX)
  7613. MOVB BL, 1(AX)
  7614. ADDQ $0x02, AX
  7615. CMPL BX, $0x40
  7616. JB memmove_match_emit_repeat_encodeBetterBlockAsm12B
  7617. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
  7618. one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
  7619. SHLB $0x02, BL
  7620. MOVB BL, (AX)
  7621. ADDQ $0x01, AX
  7622. memmove_match_emit_repeat_encodeBetterBlockAsm12B:
  7623. LEAQ (AX)(R8*1), BX
  7624. // genMemMoveShort
  7625. CMPQ R8, $0x04
  7626. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
  7627. CMPQ R8, $0x08
  7628. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
  7629. CMPQ R8, $0x10
  7630. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
  7631. CMPQ R8, $0x20
  7632. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
  7633. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
  7634. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
  7635. MOVL (R9), R10
  7636. MOVL R10, (AX)
  7637. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7638. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
  7639. MOVL (R9), R10
  7640. MOVL -4(R9)(R8*1), R9
  7641. MOVL R10, (AX)
  7642. MOVL R9, -4(AX)(R8*1)
  7643. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7644. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
  7645. MOVQ (R9), R10
  7646. MOVQ -8(R9)(R8*1), R9
  7647. MOVQ R10, (AX)
  7648. MOVQ R9, -8(AX)(R8*1)
  7649. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7650. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
  7651. MOVOU (R9), X0
  7652. MOVOU -16(R9)(R8*1), X1
  7653. MOVOU X0, (AX)
  7654. MOVOU X1, -16(AX)(R8*1)
  7655. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7656. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
  7657. MOVOU (R9), X0
  7658. MOVOU 16(R9), X1
  7659. MOVOU -32(R9)(R8*1), X2
  7660. MOVOU -16(R9)(R8*1), X3
  7661. MOVOU X0, (AX)
  7662. MOVOU X1, 16(AX)
  7663. MOVOU X2, -32(AX)(R8*1)
  7664. MOVOU X3, -16(AX)(R8*1)
  7665. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
  7666. MOVQ BX, AX
  7667. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
  7668. memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
  7669. LEAQ (AX)(R8*1), BX
  7670. // genMemMoveLong
  7671. MOVOU (R9), X0
  7672. MOVOU 16(R9), X1
  7673. MOVOU -32(R9)(R8*1), X2
  7674. MOVOU -16(R9)(R8*1), X3
  7675. MOVQ R8, R12
  7676. SHRQ $0x05, R12
  7677. MOVQ AX, R10
  7678. ANDL $0x0000001f, R10
  7679. MOVQ $0x00000040, R13
  7680. SUBQ R10, R13
  7681. DECQ R12
  7682. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7683. LEAQ -32(R9)(R13*1), R10
  7684. LEAQ -32(AX)(R13*1), R14
  7685. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
  7686. MOVOU (R10), X4
  7687. MOVOU 16(R10), X5
  7688. MOVOA X4, (R14)
  7689. MOVOA X5, 16(R14)
  7690. ADDQ $0x20, R14
  7691. ADDQ $0x20, R10
  7692. ADDQ $0x20, R13
  7693. DECQ R12
  7694. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
  7695. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7696. MOVOU -32(R9)(R13*1), X4
  7697. MOVOU -16(R9)(R13*1), X5
  7698. MOVOA X4, -32(AX)(R13*1)
  7699. MOVOA X5, -16(AX)(R13*1)
  7700. ADDQ $0x20, R13
  7701. CMPQ R8, R13
  7702. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7703. MOVOU X0, (AX)
  7704. MOVOU X1, 16(AX)
  7705. MOVOU X2, -32(AX)(R8*1)
  7706. MOVOU X3, -16(AX)(R8*1)
  7707. MOVQ BX, AX
  7708. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
  7709. ADDL R11, CX
  7710. ADDL $0x04, R11
  7711. MOVL CX, 12(SP)
  7712. // emitRepeat
  7713. MOVL R11, BX
  7714. LEAL -4(R11), R11
  7715. CMPL BX, $0x08
  7716. JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
  7717. CMPL BX, $0x0c
  7718. JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
  7719. CMPL DI, $0x00000800
  7720. JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
  7721. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
  7722. CMPL R11, $0x00000104
  7723. JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
  7724. LEAL -256(R11), R11
  7725. MOVW $0x0019, (AX)
  7726. MOVW R11, 2(AX)
  7727. ADDQ $0x04, AX
  7728. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7729. repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
  7730. LEAL -4(R11), R11
  7731. MOVW $0x0015, (AX)
  7732. MOVB R11, 2(AX)
  7733. ADDQ $0x03, AX
  7734. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7735. repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
  7736. SHLL $0x02, R11
  7737. ORL $0x01, R11
  7738. MOVW R11, (AX)
  7739. ADDQ $0x02, AX
  7740. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7741. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
  7742. XORQ BX, BX
  7743. LEAL 1(BX)(R11*4), R11
  7744. MOVB DI, 1(AX)
  7745. SARL $0x08, DI
  7746. SHLL $0x05, DI
  7747. ORL DI, R11
  7748. MOVB R11, (AX)
  7749. ADDQ $0x02, AX
  7750. match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
  7751. CMPL CX, 8(SP)
  7752. JAE emit_remainder_encodeBetterBlockAsm12B
  7753. CMPQ AX, (SP)
  7754. JB match_nolit_dst_ok_encodeBetterBlockAsm12B
  7755. MOVQ $0x00000000, ret+48(FP)
  7756. RET
  7757. match_nolit_dst_ok_encodeBetterBlockAsm12B:
  7758. MOVQ $0x0000cf1bbcdcbf9b, BX
  7759. MOVQ $0x9e3779b1, DI
  7760. LEAQ 1(SI), SI
  7761. LEAQ -2(CX), R8
  7762. MOVQ (DX)(SI*1), R9
  7763. MOVQ 1(DX)(SI*1), R10
  7764. MOVQ (DX)(R8*1), R11
  7765. MOVQ 1(DX)(R8*1), R12
  7766. SHLQ $0x10, R9
  7767. IMULQ BX, R9
  7768. SHRQ $0x32, R9
  7769. SHLQ $0x20, R10
  7770. IMULQ DI, R10
  7771. SHRQ $0x34, R10
  7772. SHLQ $0x10, R11
  7773. IMULQ BX, R11
  7774. SHRQ $0x32, R11
  7775. SHLQ $0x20, R12
  7776. IMULQ DI, R12
  7777. SHRQ $0x34, R12
  7778. LEAQ 1(SI), DI
  7779. LEAQ 1(R8), R13
  7780. MOVL SI, 24(SP)(R9*4)
  7781. MOVL R8, 24(SP)(R11*4)
  7782. MOVL DI, 65560(SP)(R10*4)
  7783. MOVL R13, 65560(SP)(R12*4)
  7784. LEAQ 1(R8)(SI*1), DI
  7785. SHRQ $0x01, DI
  7786. ADDQ $0x01, SI
  7787. SUBQ $0x01, R8
  7788. index_loop_encodeBetterBlockAsm12B:
  7789. CMPQ DI, R8
  7790. JAE search_loop_encodeBetterBlockAsm12B
  7791. MOVQ (DX)(SI*1), R9
  7792. MOVQ (DX)(DI*1), R10
  7793. SHLQ $0x10, R9
  7794. IMULQ BX, R9
  7795. SHRQ $0x32, R9
  7796. SHLQ $0x10, R10
  7797. IMULQ BX, R10
  7798. SHRQ $0x32, R10
  7799. MOVL SI, 24(SP)(R9*4)
  7800. MOVL DI, 24(SP)(R10*4)
  7801. ADDQ $0x02, SI
  7802. ADDQ $0x02, DI
  7803. JMP index_loop_encodeBetterBlockAsm12B
  7804. emit_remainder_encodeBetterBlockAsm12B:
  7805. MOVQ src_len+32(FP), CX
  7806. SUBL 12(SP), CX
  7807. LEAQ 3(AX)(CX*1), CX
  7808. CMPQ CX, (SP)
  7809. JB emit_remainder_ok_encodeBetterBlockAsm12B
  7810. MOVQ $0x00000000, ret+48(FP)
  7811. RET
  7812. emit_remainder_ok_encodeBetterBlockAsm12B:
  7813. MOVQ src_len+32(FP), CX
  7814. MOVL 12(SP), BX
  7815. CMPL BX, CX
  7816. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
  7817. MOVL CX, SI
  7818. MOVL CX, 12(SP)
  7819. LEAQ (DX)(BX*1), CX
  7820. SUBL BX, SI
  7821. LEAL -1(SI), DX
  7822. CMPL DX, $0x3c
  7823. JB one_byte_emit_remainder_encodeBetterBlockAsm12B
  7824. CMPL DX, $0x00000100
  7825. JB two_bytes_emit_remainder_encodeBetterBlockAsm12B
  7826. JB three_bytes_emit_remainder_encodeBetterBlockAsm12B
  7827. three_bytes_emit_remainder_encodeBetterBlockAsm12B:
  7828. MOVB $0xf4, (AX)
  7829. MOVW DX, 1(AX)
  7830. ADDQ $0x03, AX
  7831. JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
  7832. two_bytes_emit_remainder_encodeBetterBlockAsm12B:
  7833. MOVB $0xf0, (AX)
  7834. MOVB DL, 1(AX)
  7835. ADDQ $0x02, AX
  7836. CMPL DX, $0x40
  7837. JB memmove_emit_remainder_encodeBetterBlockAsm12B
  7838. JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
  7839. one_byte_emit_remainder_encodeBetterBlockAsm12B:
  7840. SHLB $0x02, DL
  7841. MOVB DL, (AX)
  7842. ADDQ $0x01, AX
  7843. memmove_emit_remainder_encodeBetterBlockAsm12B:
  7844. LEAQ (AX)(SI*1), DX
  7845. MOVL SI, BX
  7846. // genMemMoveShort
  7847. CMPQ BX, $0x03
  7848. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
  7849. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
  7850. CMPQ BX, $0x08
  7851. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
  7852. CMPQ BX, $0x10
  7853. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
  7854. CMPQ BX, $0x20
  7855. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
  7856. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
  7857. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
  7858. MOVB (CX), SI
  7859. MOVB -1(CX)(BX*1), CL
  7860. MOVB SI, (AX)
  7861. MOVB CL, -1(AX)(BX*1)
  7862. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7863. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
  7864. MOVW (CX), SI
  7865. MOVB 2(CX), CL
  7866. MOVW SI, (AX)
  7867. MOVB CL, 2(AX)
  7868. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7869. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
  7870. MOVL (CX), SI
  7871. MOVL -4(CX)(BX*1), CX
  7872. MOVL SI, (AX)
  7873. MOVL CX, -4(AX)(BX*1)
  7874. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7875. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
  7876. MOVQ (CX), SI
  7877. MOVQ -8(CX)(BX*1), CX
  7878. MOVQ SI, (AX)
  7879. MOVQ CX, -8(AX)(BX*1)
  7880. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7881. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
  7882. MOVOU (CX), X0
  7883. MOVOU -16(CX)(BX*1), X1
  7884. MOVOU X0, (AX)
  7885. MOVOU X1, -16(AX)(BX*1)
  7886. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7887. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
  7888. MOVOU (CX), X0
  7889. MOVOU 16(CX), X1
  7890. MOVOU -32(CX)(BX*1), X2
  7891. MOVOU -16(CX)(BX*1), X3
  7892. MOVOU X0, (AX)
  7893. MOVOU X1, 16(AX)
  7894. MOVOU X2, -32(AX)(BX*1)
  7895. MOVOU X3, -16(AX)(BX*1)
  7896. memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
  7897. MOVQ DX, AX
  7898. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
  7899. memmove_long_emit_remainder_encodeBetterBlockAsm12B:
  7900. LEAQ (AX)(SI*1), DX
  7901. MOVL SI, BX
  7902. // genMemMoveLong
  7903. MOVOU (CX), X0
  7904. MOVOU 16(CX), X1
  7905. MOVOU -32(CX)(BX*1), X2
  7906. MOVOU -16(CX)(BX*1), X3
  7907. MOVQ BX, DI
  7908. SHRQ $0x05, DI
  7909. MOVQ AX, SI
  7910. ANDL $0x0000001f, SI
  7911. MOVQ $0x00000040, R8
  7912. SUBQ SI, R8
  7913. DECQ DI
  7914. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7915. LEAQ -32(CX)(R8*1), SI
  7916. LEAQ -32(AX)(R8*1), R9
  7917. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
  7918. MOVOU (SI), X4
  7919. MOVOU 16(SI), X5
  7920. MOVOA X4, (R9)
  7921. MOVOA X5, 16(R9)
  7922. ADDQ $0x20, R9
  7923. ADDQ $0x20, SI
  7924. ADDQ $0x20, R8
  7925. DECQ DI
  7926. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
  7927. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7928. MOVOU -32(CX)(R8*1), X4
  7929. MOVOU -16(CX)(R8*1), X5
  7930. MOVOA X4, -32(AX)(R8*1)
  7931. MOVOA X5, -16(AX)(R8*1)
  7932. ADDQ $0x20, R8
  7933. CMPQ BX, R8
  7934. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7935. MOVOU X0, (AX)
  7936. MOVOU X1, 16(AX)
  7937. MOVOU X2, -32(AX)(BX*1)
  7938. MOVOU X3, -16(AX)(BX*1)
  7939. MOVQ DX, AX
  7940. emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
  7941. MOVQ dst_base+0(FP), CX
  7942. SUBQ CX, AX
  7943. MOVQ AX, ret+48(FP)
  7944. RET
  7945. // func encodeBetterBlockAsm10B(dst []byte, src []byte) int
  7946. // Requires: BMI, SSE2
  7947. TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
  7948. MOVQ dst_base+0(FP), AX
  7949. MOVQ $0x000000a0, CX
  7950. LEAQ 24(SP), DX
  7951. PXOR X0, X0
  7952. zero_loop_encodeBetterBlockAsm10B:
  7953. MOVOU X0, (DX)
  7954. MOVOU X0, 16(DX)
  7955. MOVOU X0, 32(DX)
  7956. MOVOU X0, 48(DX)
  7957. MOVOU X0, 64(DX)
  7958. MOVOU X0, 80(DX)
  7959. MOVOU X0, 96(DX)
  7960. MOVOU X0, 112(DX)
  7961. ADDQ $0x80, DX
  7962. DECQ CX
  7963. JNZ zero_loop_encodeBetterBlockAsm10B
  7964. MOVL $0x00000000, 12(SP)
  7965. MOVQ src_len+32(FP), CX
  7966. LEAQ -6(CX), DX
  7967. LEAQ -8(CX), BX
  7968. MOVL BX, 8(SP)
  7969. SHRQ $0x05, CX
  7970. SUBL CX, DX
  7971. LEAQ (AX)(DX*1), DX
  7972. MOVQ DX, (SP)
  7973. MOVL $0x00000001, CX
  7974. MOVL $0x00000000, 16(SP)
  7975. MOVQ src_base+24(FP), DX
  7976. search_loop_encodeBetterBlockAsm10B:
  7977. MOVL CX, BX
  7978. SUBL 12(SP), BX
  7979. SHRL $0x05, BX
  7980. LEAL 1(CX)(BX*1), BX
  7981. CMPL BX, 8(SP)
  7982. JAE emit_remainder_encodeBetterBlockAsm10B
  7983. MOVQ (DX)(CX*1), SI
  7984. MOVL BX, 20(SP)
  7985. MOVQ $0x0000cf1bbcdcbf9b, R8
  7986. MOVQ $0x9e3779b1, BX
  7987. MOVQ SI, R9
  7988. MOVQ SI, R10
  7989. SHLQ $0x10, R9
  7990. IMULQ R8, R9
  7991. SHRQ $0x34, R9
  7992. SHLQ $0x20, R10
  7993. IMULQ BX, R10
  7994. SHRQ $0x36, R10
  7995. MOVL 24(SP)(R9*4), BX
  7996. MOVL 16408(SP)(R10*4), DI
  7997. MOVL CX, 24(SP)(R9*4)
  7998. MOVL CX, 16408(SP)(R10*4)
  7999. MOVQ (DX)(BX*1), R9
  8000. MOVQ (DX)(DI*1), R10
  8001. CMPQ R9, SI
  8002. JEQ candidate_match_encodeBetterBlockAsm10B
  8003. CMPQ R10, SI
  8004. JNE no_short_found_encodeBetterBlockAsm10B
  8005. MOVL DI, BX
  8006. JMP candidate_match_encodeBetterBlockAsm10B
  8007. no_short_found_encodeBetterBlockAsm10B:
  8008. CMPL R9, SI
  8009. JEQ candidate_match_encodeBetterBlockAsm10B
  8010. CMPL R10, SI
  8011. JEQ candidateS_match_encodeBetterBlockAsm10B
  8012. MOVL 20(SP), CX
  8013. JMP search_loop_encodeBetterBlockAsm10B
  8014. candidateS_match_encodeBetterBlockAsm10B:
  8015. SHRQ $0x08, SI
  8016. MOVQ SI, R9
  8017. SHLQ $0x10, R9
  8018. IMULQ R8, R9
  8019. SHRQ $0x34, R9
  8020. MOVL 24(SP)(R9*4), BX
  8021. INCL CX
  8022. MOVL CX, 24(SP)(R9*4)
  8023. CMPL (DX)(BX*1), SI
  8024. JEQ candidate_match_encodeBetterBlockAsm10B
  8025. DECL CX
  8026. MOVL DI, BX
  8027. candidate_match_encodeBetterBlockAsm10B:
  8028. MOVL 12(SP), SI
  8029. TESTL BX, BX
  8030. JZ match_extend_back_end_encodeBetterBlockAsm10B
  8031. match_extend_back_loop_encodeBetterBlockAsm10B:
  8032. CMPL CX, SI
  8033. JBE match_extend_back_end_encodeBetterBlockAsm10B
  8034. MOVB -1(DX)(BX*1), DI
  8035. MOVB -1(DX)(CX*1), R8
  8036. CMPB DI, R8
  8037. JNE match_extend_back_end_encodeBetterBlockAsm10B
  8038. LEAL -1(CX), CX
  8039. DECL BX
  8040. JZ match_extend_back_end_encodeBetterBlockAsm10B
  8041. JMP match_extend_back_loop_encodeBetterBlockAsm10B
  8042. match_extend_back_end_encodeBetterBlockAsm10B:
  8043. MOVL CX, SI
  8044. SUBL 12(SP), SI
  8045. LEAQ 3(AX)(SI*1), SI
  8046. CMPQ SI, (SP)
  8047. JB match_dst_size_check_encodeBetterBlockAsm10B
  8048. MOVQ $0x00000000, ret+48(FP)
  8049. RET
  8050. match_dst_size_check_encodeBetterBlockAsm10B:
  8051. MOVL CX, SI
  8052. ADDL $0x04, CX
  8053. ADDL $0x04, BX
  8054. MOVQ src_len+32(FP), DI
  8055. SUBL CX, DI
  8056. LEAQ (DX)(CX*1), R8
  8057. LEAQ (DX)(BX*1), R9
  8058. // matchLen
  8059. XORL R11, R11
  8060. matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:
  8061. CMPL DI, $0x10
  8062. JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B
  8063. MOVQ (R8)(R11*1), R10
  8064. MOVQ 8(R8)(R11*1), R12
  8065. XORQ (R9)(R11*1), R10
  8066. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
  8067. XORQ 8(R9)(R11*1), R12
  8068. JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
  8069. LEAL -16(DI), DI
  8070. LEAL 16(R11), R11
  8071. JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B
  8072. matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:
  8073. #ifdef GOAMD64_v3
  8074. TZCNTQ R12, R12
  8075. #else
  8076. BSFQ R12, R12
  8077. #endif
  8078. SARQ $0x03, R12
  8079. LEAL 8(R11)(R12*1), R11
  8080. JMP match_nolit_end_encodeBetterBlockAsm10B
  8081. matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
  8082. CMPL DI, $0x08
  8083. JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B
  8084. MOVQ (R8)(R11*1), R10
  8085. XORQ (R9)(R11*1), R10
  8086. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
  8087. LEAL -8(DI), DI
  8088. LEAL 8(R11), R11
  8089. JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B
  8090. matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:
  8091. #ifdef GOAMD64_v3
  8092. TZCNTQ R10, R10
  8093. #else
  8094. BSFQ R10, R10
  8095. #endif
  8096. SARQ $0x03, R10
  8097. LEAL (R11)(R10*1), R11
  8098. JMP match_nolit_end_encodeBetterBlockAsm10B
  8099. matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
  8100. CMPL DI, $0x04
  8101. JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B
  8102. MOVL (R8)(R11*1), R10
  8103. CMPL (R9)(R11*1), R10
  8104. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
  8105. LEAL -4(DI), DI
  8106. LEAL 4(R11), R11
  8107. matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
  8108. CMPL DI, $0x01
  8109. JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
  8110. JB match_nolit_end_encodeBetterBlockAsm10B
  8111. MOVW (R8)(R11*1), R10
  8112. CMPW (R9)(R11*1), R10
  8113. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
  8114. LEAL 2(R11), R11
  8115. SUBL $0x02, DI
  8116. JZ match_nolit_end_encodeBetterBlockAsm10B
  8117. matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
  8118. MOVB (R8)(R11*1), R10
  8119. CMPB (R9)(R11*1), R10
  8120. JNE match_nolit_end_encodeBetterBlockAsm10B
  8121. LEAL 1(R11), R11
  8122. match_nolit_end_encodeBetterBlockAsm10B:
  8123. MOVL CX, DI
  8124. SUBL BX, DI
  8125. // Check if repeat
  8126. CMPL 16(SP), DI
  8127. JEQ match_is_repeat_encodeBetterBlockAsm10B
  8128. MOVL DI, 16(SP)
  8129. MOVL 12(SP), BX
  8130. CMPL BX, SI
  8131. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
  8132. MOVL SI, R8
  8133. MOVL SI, 12(SP)
  8134. LEAQ (DX)(BX*1), R9
  8135. SUBL BX, R8
  8136. LEAL -1(R8), BX
  8137. CMPL BX, $0x3c
  8138. JB one_byte_match_emit_encodeBetterBlockAsm10B
  8139. CMPL BX, $0x00000100
  8140. JB two_bytes_match_emit_encodeBetterBlockAsm10B
  8141. JB three_bytes_match_emit_encodeBetterBlockAsm10B
  8142. three_bytes_match_emit_encodeBetterBlockAsm10B:
  8143. MOVB $0xf4, (AX)
  8144. MOVW BX, 1(AX)
  8145. ADDQ $0x03, AX
  8146. JMP memmove_long_match_emit_encodeBetterBlockAsm10B
  8147. two_bytes_match_emit_encodeBetterBlockAsm10B:
  8148. MOVB $0xf0, (AX)
  8149. MOVB BL, 1(AX)
  8150. ADDQ $0x02, AX
  8151. CMPL BX, $0x40
  8152. JB memmove_match_emit_encodeBetterBlockAsm10B
  8153. JMP memmove_long_match_emit_encodeBetterBlockAsm10B
  8154. one_byte_match_emit_encodeBetterBlockAsm10B:
  8155. SHLB $0x02, BL
  8156. MOVB BL, (AX)
  8157. ADDQ $0x01, AX
  8158. memmove_match_emit_encodeBetterBlockAsm10B:
  8159. LEAQ (AX)(R8*1), BX
  8160. // genMemMoveShort
  8161. CMPQ R8, $0x04
  8162. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
  8163. CMPQ R8, $0x08
  8164. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
  8165. CMPQ R8, $0x10
  8166. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
  8167. CMPQ R8, $0x20
  8168. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
  8169. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
  8170. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
  8171. MOVL (R9), R10
  8172. MOVL R10, (AX)
  8173. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  8174. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
  8175. MOVL (R9), R10
  8176. MOVL -4(R9)(R8*1), R9
  8177. MOVL R10, (AX)
  8178. MOVL R9, -4(AX)(R8*1)
  8179. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  8180. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
  8181. MOVQ (R9), R10
  8182. MOVQ -8(R9)(R8*1), R9
  8183. MOVQ R10, (AX)
  8184. MOVQ R9, -8(AX)(R8*1)
  8185. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  8186. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
  8187. MOVOU (R9), X0
  8188. MOVOU -16(R9)(R8*1), X1
  8189. MOVOU X0, (AX)
  8190. MOVOU X1, -16(AX)(R8*1)
  8191. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  8192. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
  8193. MOVOU (R9), X0
  8194. MOVOU 16(R9), X1
  8195. MOVOU -32(R9)(R8*1), X2
  8196. MOVOU -16(R9)(R8*1), X3
  8197. MOVOU X0, (AX)
  8198. MOVOU X1, 16(AX)
  8199. MOVOU X2, -32(AX)(R8*1)
  8200. MOVOU X3, -16(AX)(R8*1)
  8201. memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
  8202. MOVQ BX, AX
  8203. JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
  8204. memmove_long_match_emit_encodeBetterBlockAsm10B:
  8205. LEAQ (AX)(R8*1), BX
  8206. // genMemMoveLong
  8207. MOVOU (R9), X0
  8208. MOVOU 16(R9), X1
  8209. MOVOU -32(R9)(R8*1), X2
  8210. MOVOU -16(R9)(R8*1), X3
  8211. MOVQ R8, R12
  8212. SHRQ $0x05, R12
  8213. MOVQ AX, R10
  8214. ANDL $0x0000001f, R10
  8215. MOVQ $0x00000040, R13
  8216. SUBQ R10, R13
  8217. DECQ R12
  8218. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8219. LEAQ -32(R9)(R13*1), R10
  8220. LEAQ -32(AX)(R13*1), R14
  8221. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
  8222. MOVOU (R10), X4
  8223. MOVOU 16(R10), X5
  8224. MOVOA X4, (R14)
  8225. MOVOA X5, 16(R14)
  8226. ADDQ $0x20, R14
  8227. ADDQ $0x20, R10
  8228. ADDQ $0x20, R13
  8229. DECQ R12
  8230. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
  8231. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  8232. MOVOU -32(R9)(R13*1), X4
  8233. MOVOU -16(R9)(R13*1), X5
  8234. MOVOA X4, -32(AX)(R13*1)
  8235. MOVOA X5, -16(AX)(R13*1)
  8236. ADDQ $0x20, R13
  8237. CMPQ R8, R13
  8238. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8239. MOVOU X0, (AX)
  8240. MOVOU X1, 16(AX)
  8241. MOVOU X2, -32(AX)(R8*1)
  8242. MOVOU X3, -16(AX)(R8*1)
  8243. MOVQ BX, AX
  8244. emit_literal_done_match_emit_encodeBetterBlockAsm10B:
  8245. ADDL R11, CX
  8246. ADDL $0x04, R11
  8247. MOVL CX, 12(SP)
  8248. // emitCopy
  8249. CMPL R11, $0x40
  8250. JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
  8251. CMPL DI, $0x00000800
  8252. JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
  8253. MOVL $0x00000001, BX
  8254. LEAL 16(BX), BX
  8255. MOVB DI, 1(AX)
  8256. SHRL $0x08, DI
  8257. SHLL $0x05, DI
  8258. ORL DI, BX
  8259. MOVB BL, (AX)
  8260. ADDQ $0x02, AX
  8261. SUBL $0x08, R11
  8262. // emitRepeat
  8263. LEAL -4(R11), R11
  8264. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8265. MOVL R11, BX
  8266. LEAL -4(R11), R11
  8267. CMPL BX, $0x08
  8268. JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8269. CMPL BX, $0x0c
  8270. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8271. CMPL DI, $0x00000800
  8272. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8273. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8274. CMPL R11, $0x00000104
  8275. JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8276. LEAL -256(R11), R11
  8277. MOVW $0x0019, (AX)
  8278. MOVW R11, 2(AX)
  8279. ADDQ $0x04, AX
  8280. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8281. repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8282. LEAL -4(R11), R11
  8283. MOVW $0x0015, (AX)
  8284. MOVB R11, 2(AX)
  8285. ADDQ $0x03, AX
  8286. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8287. repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8288. SHLL $0x02, R11
  8289. ORL $0x01, R11
  8290. MOVW R11, (AX)
  8291. ADDQ $0x02, AX
  8292. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8293. repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8294. XORQ BX, BX
  8295. LEAL 1(BX)(R11*4), R11
  8296. MOVB DI, 1(AX)
  8297. SARL $0x08, DI
  8298. SHLL $0x05, DI
  8299. ORL DI, R11
  8300. MOVB R11, (AX)
  8301. ADDQ $0x02, AX
  8302. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8303. long_offset_short_match_nolit_encodeBetterBlockAsm10B:
  8304. MOVB $0xee, (AX)
  8305. MOVW DI, 1(AX)
  8306. LEAL -60(R11), R11
  8307. ADDQ $0x03, AX
  8308. // emitRepeat
  8309. MOVL R11, BX
  8310. LEAL -4(R11), R11
  8311. CMPL BX, $0x08
  8312. JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8313. CMPL BX, $0x0c
  8314. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8315. CMPL DI, $0x00000800
  8316. JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8317. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8318. CMPL R11, $0x00000104
  8319. JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8320. LEAL -256(R11), R11
  8321. MOVW $0x0019, (AX)
  8322. MOVW R11, 2(AX)
  8323. ADDQ $0x04, AX
  8324. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8325. repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8326. LEAL -4(R11), R11
  8327. MOVW $0x0015, (AX)
  8328. MOVB R11, 2(AX)
  8329. ADDQ $0x03, AX
  8330. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8331. repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8332. SHLL $0x02, R11
  8333. ORL $0x01, R11
  8334. MOVW R11, (AX)
  8335. ADDQ $0x02, AX
  8336. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8337. repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8338. XORQ BX, BX
  8339. LEAL 1(BX)(R11*4), R11
  8340. MOVB DI, 1(AX)
  8341. SARL $0x08, DI
  8342. SHLL $0x05, DI
  8343. ORL DI, R11
  8344. MOVB R11, (AX)
  8345. ADDQ $0x02, AX
  8346. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8347. two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
  8348. MOVL R11, BX
  8349. SHLL $0x02, BX
  8350. CMPL R11, $0x0c
  8351. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
  8352. CMPL DI, $0x00000800
  8353. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
  8354. LEAL -15(BX), BX
  8355. MOVB DI, 1(AX)
  8356. SHRL $0x08, DI
  8357. SHLL $0x05, DI
  8358. ORL DI, BX
  8359. MOVB BL, (AX)
  8360. ADDQ $0x02, AX
  8361. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8362. emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
  8363. LEAL -2(BX), BX
  8364. MOVB BL, (AX)
  8365. MOVW DI, 1(AX)
  8366. ADDQ $0x03, AX
  8367. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8368. match_is_repeat_encodeBetterBlockAsm10B:
  8369. MOVL 12(SP), BX
  8370. CMPL BX, SI
  8371. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
  8372. MOVL SI, R8
  8373. MOVL SI, 12(SP)
  8374. LEAQ (DX)(BX*1), R9
  8375. SUBL BX, R8
  8376. LEAL -1(R8), BX
  8377. CMPL BX, $0x3c
  8378. JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B
  8379. CMPL BX, $0x00000100
  8380. JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
  8381. JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B
  8382. three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
  8383. MOVB $0xf4, (AX)
  8384. MOVW BX, 1(AX)
  8385. ADDQ $0x03, AX
  8386. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
  8387. two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
  8388. MOVB $0xf0, (AX)
  8389. MOVB BL, 1(AX)
  8390. ADDQ $0x02, AX
  8391. CMPL BX, $0x40
  8392. JB memmove_match_emit_repeat_encodeBetterBlockAsm10B
  8393. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
  8394. one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
  8395. SHLB $0x02, BL
  8396. MOVB BL, (AX)
  8397. ADDQ $0x01, AX
  8398. memmove_match_emit_repeat_encodeBetterBlockAsm10B:
  8399. LEAQ (AX)(R8*1), BX
  8400. // genMemMoveShort
  8401. CMPQ R8, $0x04
  8402. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
  8403. CMPQ R8, $0x08
  8404. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
  8405. CMPQ R8, $0x10
  8406. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
  8407. CMPQ R8, $0x20
  8408. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
  8409. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
  8410. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
  8411. MOVL (R9), R10
  8412. MOVL R10, (AX)
  8413. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8414. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
  8415. MOVL (R9), R10
  8416. MOVL -4(R9)(R8*1), R9
  8417. MOVL R10, (AX)
  8418. MOVL R9, -4(AX)(R8*1)
  8419. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8420. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
  8421. MOVQ (R9), R10
  8422. MOVQ -8(R9)(R8*1), R9
  8423. MOVQ R10, (AX)
  8424. MOVQ R9, -8(AX)(R8*1)
  8425. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8426. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
  8427. MOVOU (R9), X0
  8428. MOVOU -16(R9)(R8*1), X1
  8429. MOVOU X0, (AX)
  8430. MOVOU X1, -16(AX)(R8*1)
  8431. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8432. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
  8433. MOVOU (R9), X0
  8434. MOVOU 16(R9), X1
  8435. MOVOU -32(R9)(R8*1), X2
  8436. MOVOU -16(R9)(R8*1), X3
  8437. MOVOU X0, (AX)
  8438. MOVOU X1, 16(AX)
  8439. MOVOU X2, -32(AX)(R8*1)
  8440. MOVOU X3, -16(AX)(R8*1)
  8441. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
  8442. MOVQ BX, AX
  8443. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
  8444. memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
  8445. LEAQ (AX)(R8*1), BX
  8446. // genMemMoveLong
  8447. MOVOU (R9), X0
  8448. MOVOU 16(R9), X1
  8449. MOVOU -32(R9)(R8*1), X2
  8450. MOVOU -16(R9)(R8*1), X3
  8451. MOVQ R8, R12
  8452. SHRQ $0x05, R12
  8453. MOVQ AX, R10
  8454. ANDL $0x0000001f, R10
  8455. MOVQ $0x00000040, R13
  8456. SUBQ R10, R13
  8457. DECQ R12
  8458. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8459. LEAQ -32(R9)(R13*1), R10
  8460. LEAQ -32(AX)(R13*1), R14
  8461. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
  8462. MOVOU (R10), X4
  8463. MOVOU 16(R10), X5
  8464. MOVOA X4, (R14)
  8465. MOVOA X5, 16(R14)
  8466. ADDQ $0x20, R14
  8467. ADDQ $0x20, R10
  8468. ADDQ $0x20, R13
  8469. DECQ R12
  8470. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
  8471. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  8472. MOVOU -32(R9)(R13*1), X4
  8473. MOVOU -16(R9)(R13*1), X5
  8474. MOVOA X4, -32(AX)(R13*1)
  8475. MOVOA X5, -16(AX)(R13*1)
  8476. ADDQ $0x20, R13
  8477. CMPQ R8, R13
  8478. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8479. MOVOU X0, (AX)
  8480. MOVOU X1, 16(AX)
  8481. MOVOU X2, -32(AX)(R8*1)
  8482. MOVOU X3, -16(AX)(R8*1)
  8483. MOVQ BX, AX
  8484. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
  8485. ADDL R11, CX
  8486. ADDL $0x04, R11
  8487. MOVL CX, 12(SP)
  8488. // emitRepeat
  8489. MOVL R11, BX
  8490. LEAL -4(R11), R11
  8491. CMPL BX, $0x08
  8492. JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
  8493. CMPL BX, $0x0c
  8494. JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
  8495. CMPL DI, $0x00000800
  8496. JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
  8497. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
  8498. CMPL R11, $0x00000104
  8499. JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
  8500. LEAL -256(R11), R11
  8501. MOVW $0x0019, (AX)
  8502. MOVW R11, 2(AX)
  8503. ADDQ $0x04, AX
  8504. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8505. repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
  8506. LEAL -4(R11), R11
  8507. MOVW $0x0015, (AX)
  8508. MOVB R11, 2(AX)
  8509. ADDQ $0x03, AX
  8510. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8511. repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
  8512. SHLL $0x02, R11
  8513. ORL $0x01, R11
  8514. MOVW R11, (AX)
  8515. ADDQ $0x02, AX
  8516. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8517. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
  8518. XORQ BX, BX
  8519. LEAL 1(BX)(R11*4), R11
  8520. MOVB DI, 1(AX)
  8521. SARL $0x08, DI
  8522. SHLL $0x05, DI
  8523. ORL DI, R11
  8524. MOVB R11, (AX)
  8525. ADDQ $0x02, AX
  8526. match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
  8527. CMPL CX, 8(SP)
  8528. JAE emit_remainder_encodeBetterBlockAsm10B
  8529. CMPQ AX, (SP)
  8530. JB match_nolit_dst_ok_encodeBetterBlockAsm10B
  8531. MOVQ $0x00000000, ret+48(FP)
  8532. RET
  8533. match_nolit_dst_ok_encodeBetterBlockAsm10B:
  8534. MOVQ $0x0000cf1bbcdcbf9b, BX
  8535. MOVQ $0x9e3779b1, DI
  8536. LEAQ 1(SI), SI
  8537. LEAQ -2(CX), R8
  8538. MOVQ (DX)(SI*1), R9
  8539. MOVQ 1(DX)(SI*1), R10
  8540. MOVQ (DX)(R8*1), R11
  8541. MOVQ 1(DX)(R8*1), R12
  8542. SHLQ $0x10, R9
  8543. IMULQ BX, R9
  8544. SHRQ $0x34, R9
  8545. SHLQ $0x20, R10
  8546. IMULQ DI, R10
  8547. SHRQ $0x36, R10
  8548. SHLQ $0x10, R11
  8549. IMULQ BX, R11
  8550. SHRQ $0x34, R11
  8551. SHLQ $0x20, R12
  8552. IMULQ DI, R12
  8553. SHRQ $0x36, R12
  8554. LEAQ 1(SI), DI
  8555. LEAQ 1(R8), R13
  8556. MOVL SI, 24(SP)(R9*4)
  8557. MOVL R8, 24(SP)(R11*4)
  8558. MOVL DI, 16408(SP)(R10*4)
  8559. MOVL R13, 16408(SP)(R12*4)
  8560. LEAQ 1(R8)(SI*1), DI
  8561. SHRQ $0x01, DI
  8562. ADDQ $0x01, SI
  8563. SUBQ $0x01, R8
  8564. index_loop_encodeBetterBlockAsm10B:
  8565. CMPQ DI, R8
  8566. JAE search_loop_encodeBetterBlockAsm10B
  8567. MOVQ (DX)(SI*1), R9
  8568. MOVQ (DX)(DI*1), R10
  8569. SHLQ $0x10, R9
  8570. IMULQ BX, R9
  8571. SHRQ $0x34, R9
  8572. SHLQ $0x10, R10
  8573. IMULQ BX, R10
  8574. SHRQ $0x34, R10
  8575. MOVL SI, 24(SP)(R9*4)
  8576. MOVL DI, 24(SP)(R10*4)
  8577. ADDQ $0x02, SI
  8578. ADDQ $0x02, DI
  8579. JMP index_loop_encodeBetterBlockAsm10B
  8580. emit_remainder_encodeBetterBlockAsm10B:
  8581. MOVQ src_len+32(FP), CX
  8582. SUBL 12(SP), CX
  8583. LEAQ 3(AX)(CX*1), CX
  8584. CMPQ CX, (SP)
  8585. JB emit_remainder_ok_encodeBetterBlockAsm10B
  8586. MOVQ $0x00000000, ret+48(FP)
  8587. RET
  8588. emit_remainder_ok_encodeBetterBlockAsm10B:
  8589. MOVQ src_len+32(FP), CX
  8590. MOVL 12(SP), BX
  8591. CMPL BX, CX
  8592. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
  8593. MOVL CX, SI
  8594. MOVL CX, 12(SP)
  8595. LEAQ (DX)(BX*1), CX
  8596. SUBL BX, SI
  8597. LEAL -1(SI), DX
  8598. CMPL DX, $0x3c
  8599. JB one_byte_emit_remainder_encodeBetterBlockAsm10B
  8600. CMPL DX, $0x00000100
  8601. JB two_bytes_emit_remainder_encodeBetterBlockAsm10B
  8602. JB three_bytes_emit_remainder_encodeBetterBlockAsm10B
  8603. three_bytes_emit_remainder_encodeBetterBlockAsm10B:
  8604. MOVB $0xf4, (AX)
  8605. MOVW DX, 1(AX)
  8606. ADDQ $0x03, AX
  8607. JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
  8608. two_bytes_emit_remainder_encodeBetterBlockAsm10B:
  8609. MOVB $0xf0, (AX)
  8610. MOVB DL, 1(AX)
  8611. ADDQ $0x02, AX
  8612. CMPL DX, $0x40
  8613. JB memmove_emit_remainder_encodeBetterBlockAsm10B
  8614. JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
  8615. one_byte_emit_remainder_encodeBetterBlockAsm10B:
  8616. SHLB $0x02, DL
  8617. MOVB DL, (AX)
  8618. ADDQ $0x01, AX
  8619. memmove_emit_remainder_encodeBetterBlockAsm10B:
  8620. LEAQ (AX)(SI*1), DX
  8621. MOVL SI, BX
  8622. // genMemMoveShort
  8623. CMPQ BX, $0x03
  8624. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
  8625. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
  8626. CMPQ BX, $0x08
  8627. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
  8628. CMPQ BX, $0x10
  8629. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
  8630. CMPQ BX, $0x20
  8631. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
  8632. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
  8633. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
  8634. MOVB (CX), SI
  8635. MOVB -1(CX)(BX*1), CL
  8636. MOVB SI, (AX)
  8637. MOVB CL, -1(AX)(BX*1)
  8638. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8639. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
  8640. MOVW (CX), SI
  8641. MOVB 2(CX), CL
  8642. MOVW SI, (AX)
  8643. MOVB CL, 2(AX)
  8644. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8645. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
  8646. MOVL (CX), SI
  8647. MOVL -4(CX)(BX*1), CX
  8648. MOVL SI, (AX)
  8649. MOVL CX, -4(AX)(BX*1)
  8650. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8651. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
  8652. MOVQ (CX), SI
  8653. MOVQ -8(CX)(BX*1), CX
  8654. MOVQ SI, (AX)
  8655. MOVQ CX, -8(AX)(BX*1)
  8656. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8657. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
  8658. MOVOU (CX), X0
  8659. MOVOU -16(CX)(BX*1), X1
  8660. MOVOU X0, (AX)
  8661. MOVOU X1, -16(AX)(BX*1)
  8662. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8663. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
  8664. MOVOU (CX), X0
  8665. MOVOU 16(CX), X1
  8666. MOVOU -32(CX)(BX*1), X2
  8667. MOVOU -16(CX)(BX*1), X3
  8668. MOVOU X0, (AX)
  8669. MOVOU X1, 16(AX)
  8670. MOVOU X2, -32(AX)(BX*1)
  8671. MOVOU X3, -16(AX)(BX*1)
  8672. memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
  8673. MOVQ DX, AX
  8674. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
  8675. memmove_long_emit_remainder_encodeBetterBlockAsm10B:
  8676. LEAQ (AX)(SI*1), DX
  8677. MOVL SI, BX
  8678. // genMemMoveLong
  8679. MOVOU (CX), X0
  8680. MOVOU 16(CX), X1
  8681. MOVOU -32(CX)(BX*1), X2
  8682. MOVOU -16(CX)(BX*1), X3
  8683. MOVQ BX, DI
  8684. SHRQ $0x05, DI
  8685. MOVQ AX, SI
  8686. ANDL $0x0000001f, SI
  8687. MOVQ $0x00000040, R8
  8688. SUBQ SI, R8
  8689. DECQ DI
  8690. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8691. LEAQ -32(CX)(R8*1), SI
  8692. LEAQ -32(AX)(R8*1), R9
  8693. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
  8694. MOVOU (SI), X4
  8695. MOVOU 16(SI), X5
  8696. MOVOA X4, (R9)
  8697. MOVOA X5, 16(R9)
  8698. ADDQ $0x20, R9
  8699. ADDQ $0x20, SI
  8700. ADDQ $0x20, R8
  8701. DECQ DI
  8702. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
  8703. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  8704. MOVOU -32(CX)(R8*1), X4
  8705. MOVOU -16(CX)(R8*1), X5
  8706. MOVOA X4, -32(AX)(R8*1)
  8707. MOVOA X5, -16(AX)(R8*1)
  8708. ADDQ $0x20, R8
  8709. CMPQ BX, R8
  8710. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8711. MOVOU X0, (AX)
  8712. MOVOU X1, 16(AX)
  8713. MOVOU X2, -32(AX)(BX*1)
  8714. MOVOU X3, -16(AX)(BX*1)
  8715. MOVQ DX, AX
  8716. emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
  8717. MOVQ dst_base+0(FP), CX
  8718. SUBQ CX, AX
  8719. MOVQ AX, ret+48(FP)
  8720. RET
  8721. // func encodeBetterBlockAsm8B(dst []byte, src []byte) int
  8722. // Requires: BMI, SSE2
  8723. TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
  8724. MOVQ dst_base+0(FP), AX
  8725. MOVQ $0x00000028, CX
  8726. LEAQ 24(SP), DX
  8727. PXOR X0, X0
  8728. zero_loop_encodeBetterBlockAsm8B:
  8729. MOVOU X0, (DX)
  8730. MOVOU X0, 16(DX)
  8731. MOVOU X0, 32(DX)
  8732. MOVOU X0, 48(DX)
  8733. MOVOU X0, 64(DX)
  8734. MOVOU X0, 80(DX)
  8735. MOVOU X0, 96(DX)
  8736. MOVOU X0, 112(DX)
  8737. ADDQ $0x80, DX
  8738. DECQ CX
  8739. JNZ zero_loop_encodeBetterBlockAsm8B
  8740. MOVL $0x00000000, 12(SP)
  8741. MOVQ src_len+32(FP), CX
  8742. LEAQ -6(CX), DX
  8743. LEAQ -8(CX), BX
  8744. MOVL BX, 8(SP)
  8745. SHRQ $0x05, CX
  8746. SUBL CX, DX
  8747. LEAQ (AX)(DX*1), DX
  8748. MOVQ DX, (SP)
  8749. MOVL $0x00000001, CX
  8750. MOVL $0x00000000, 16(SP)
  8751. MOVQ src_base+24(FP), DX
  8752. search_loop_encodeBetterBlockAsm8B:
  8753. MOVL CX, BX
  8754. SUBL 12(SP), BX
  8755. SHRL $0x04, BX
  8756. LEAL 1(CX)(BX*1), BX
  8757. CMPL BX, 8(SP)
  8758. JAE emit_remainder_encodeBetterBlockAsm8B
  8759. MOVQ (DX)(CX*1), SI
  8760. MOVL BX, 20(SP)
  8761. MOVQ $0x0000cf1bbcdcbf9b, R8
  8762. MOVQ $0x9e3779b1, BX
  8763. MOVQ SI, R9
  8764. MOVQ SI, R10
  8765. SHLQ $0x10, R9
  8766. IMULQ R8, R9
  8767. SHRQ $0x36, R9
  8768. SHLQ $0x20, R10
  8769. IMULQ BX, R10
  8770. SHRQ $0x38, R10
  8771. MOVL 24(SP)(R9*4), BX
  8772. MOVL 4120(SP)(R10*4), DI
  8773. MOVL CX, 24(SP)(R9*4)
  8774. MOVL CX, 4120(SP)(R10*4)
  8775. MOVQ (DX)(BX*1), R9
  8776. MOVQ (DX)(DI*1), R10
  8777. CMPQ R9, SI
  8778. JEQ candidate_match_encodeBetterBlockAsm8B
  8779. CMPQ R10, SI
  8780. JNE no_short_found_encodeBetterBlockAsm8B
  8781. MOVL DI, BX
  8782. JMP candidate_match_encodeBetterBlockAsm8B
  8783. no_short_found_encodeBetterBlockAsm8B:
  8784. CMPL R9, SI
  8785. JEQ candidate_match_encodeBetterBlockAsm8B
  8786. CMPL R10, SI
  8787. JEQ candidateS_match_encodeBetterBlockAsm8B
  8788. MOVL 20(SP), CX
  8789. JMP search_loop_encodeBetterBlockAsm8B
  8790. candidateS_match_encodeBetterBlockAsm8B:
  8791. SHRQ $0x08, SI
  8792. MOVQ SI, R9
  8793. SHLQ $0x10, R9
  8794. IMULQ R8, R9
  8795. SHRQ $0x36, R9
  8796. MOVL 24(SP)(R9*4), BX
  8797. INCL CX
  8798. MOVL CX, 24(SP)(R9*4)
  8799. CMPL (DX)(BX*1), SI
  8800. JEQ candidate_match_encodeBetterBlockAsm8B
  8801. DECL CX
  8802. MOVL DI, BX
  8803. candidate_match_encodeBetterBlockAsm8B:
  8804. MOVL 12(SP), SI
  8805. TESTL BX, BX
  8806. JZ match_extend_back_end_encodeBetterBlockAsm8B
  8807. match_extend_back_loop_encodeBetterBlockAsm8B:
  8808. CMPL CX, SI
  8809. JBE match_extend_back_end_encodeBetterBlockAsm8B
  8810. MOVB -1(DX)(BX*1), DI
  8811. MOVB -1(DX)(CX*1), R8
  8812. CMPB DI, R8
  8813. JNE match_extend_back_end_encodeBetterBlockAsm8B
  8814. LEAL -1(CX), CX
  8815. DECL BX
  8816. JZ match_extend_back_end_encodeBetterBlockAsm8B
  8817. JMP match_extend_back_loop_encodeBetterBlockAsm8B
  8818. match_extend_back_end_encodeBetterBlockAsm8B:
  8819. MOVL CX, SI
  8820. SUBL 12(SP), SI
  8821. LEAQ 3(AX)(SI*1), SI
  8822. CMPQ SI, (SP)
  8823. JB match_dst_size_check_encodeBetterBlockAsm8B
  8824. MOVQ $0x00000000, ret+48(FP)
  8825. RET
  8826. match_dst_size_check_encodeBetterBlockAsm8B:
  8827. MOVL CX, SI
  8828. ADDL $0x04, CX
  8829. ADDL $0x04, BX
  8830. MOVQ src_len+32(FP), DI
  8831. SUBL CX, DI
  8832. LEAQ (DX)(CX*1), R8
  8833. LEAQ (DX)(BX*1), R9
  8834. // matchLen
  8835. XORL R11, R11
  8836. matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
  8837. CMPL DI, $0x10
  8838. JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B
  8839. MOVQ (R8)(R11*1), R10
  8840. MOVQ 8(R8)(R11*1), R12
  8841. XORQ (R9)(R11*1), R10
  8842. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
  8843. XORQ 8(R9)(R11*1), R12
  8844. JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
  8845. LEAL -16(DI), DI
  8846. LEAL 16(R11), R11
  8847. JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B
  8848. matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
  8849. #ifdef GOAMD64_v3
  8850. TZCNTQ R12, R12
  8851. #else
  8852. BSFQ R12, R12
  8853. #endif
  8854. SARQ $0x03, R12
  8855. LEAL 8(R11)(R12*1), R11
  8856. JMP match_nolit_end_encodeBetterBlockAsm8B
  8857. matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
  8858. CMPL DI, $0x08
  8859. JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B
  8860. MOVQ (R8)(R11*1), R10
  8861. XORQ (R9)(R11*1), R10
  8862. JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
  8863. LEAL -8(DI), DI
  8864. LEAL 8(R11), R11
  8865. JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B
  8866. matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
  8867. #ifdef GOAMD64_v3
  8868. TZCNTQ R10, R10
  8869. #else
  8870. BSFQ R10, R10
  8871. #endif
  8872. SARQ $0x03, R10
  8873. LEAL (R11)(R10*1), R11
  8874. JMP match_nolit_end_encodeBetterBlockAsm8B
  8875. matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
  8876. CMPL DI, $0x04
  8877. JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B
  8878. MOVL (R8)(R11*1), R10
  8879. CMPL (R9)(R11*1), R10
  8880. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
  8881. LEAL -4(DI), DI
  8882. LEAL 4(R11), R11
  8883. matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
  8884. CMPL DI, $0x01
  8885. JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
  8886. JB match_nolit_end_encodeBetterBlockAsm8B
  8887. MOVW (R8)(R11*1), R10
  8888. CMPW (R9)(R11*1), R10
  8889. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
  8890. LEAL 2(R11), R11
  8891. SUBL $0x02, DI
  8892. JZ match_nolit_end_encodeBetterBlockAsm8B
  8893. matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
  8894. MOVB (R8)(R11*1), R10
  8895. CMPB (R9)(R11*1), R10
  8896. JNE match_nolit_end_encodeBetterBlockAsm8B
  8897. LEAL 1(R11), R11
  8898. match_nolit_end_encodeBetterBlockAsm8B:
  8899. MOVL CX, DI
  8900. SUBL BX, DI
  8901. // Check if repeat
  8902. CMPL 16(SP), DI
  8903. JEQ match_is_repeat_encodeBetterBlockAsm8B
  8904. MOVL DI, 16(SP)
  8905. MOVL 12(SP), BX
  8906. CMPL BX, SI
  8907. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
  8908. MOVL SI, R8
  8909. MOVL SI, 12(SP)
  8910. LEAQ (DX)(BX*1), R9
  8911. SUBL BX, R8
  8912. LEAL -1(R8), BX
  8913. CMPL BX, $0x3c
  8914. JB one_byte_match_emit_encodeBetterBlockAsm8B
  8915. CMPL BX, $0x00000100
  8916. JB two_bytes_match_emit_encodeBetterBlockAsm8B
  8917. JB three_bytes_match_emit_encodeBetterBlockAsm8B
  8918. three_bytes_match_emit_encodeBetterBlockAsm8B:
  8919. MOVB $0xf4, (AX)
  8920. MOVW BX, 1(AX)
  8921. ADDQ $0x03, AX
  8922. JMP memmove_long_match_emit_encodeBetterBlockAsm8B
  8923. two_bytes_match_emit_encodeBetterBlockAsm8B:
  8924. MOVB $0xf0, (AX)
  8925. MOVB BL, 1(AX)
  8926. ADDQ $0x02, AX
  8927. CMPL BX, $0x40
  8928. JB memmove_match_emit_encodeBetterBlockAsm8B
  8929. JMP memmove_long_match_emit_encodeBetterBlockAsm8B
  8930. one_byte_match_emit_encodeBetterBlockAsm8B:
  8931. SHLB $0x02, BL
  8932. MOVB BL, (AX)
  8933. ADDQ $0x01, AX
  8934. memmove_match_emit_encodeBetterBlockAsm8B:
  8935. LEAQ (AX)(R8*1), BX
  8936. // genMemMoveShort
  8937. CMPQ R8, $0x04
  8938. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
  8939. CMPQ R8, $0x08
  8940. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
  8941. CMPQ R8, $0x10
  8942. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
  8943. CMPQ R8, $0x20
  8944. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
  8945. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
  8946. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
  8947. MOVL (R9), R10
  8948. MOVL R10, (AX)
  8949. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8950. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
  8951. MOVL (R9), R10
  8952. MOVL -4(R9)(R8*1), R9
  8953. MOVL R10, (AX)
  8954. MOVL R9, -4(AX)(R8*1)
  8955. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8956. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
  8957. MOVQ (R9), R10
  8958. MOVQ -8(R9)(R8*1), R9
  8959. MOVQ R10, (AX)
  8960. MOVQ R9, -8(AX)(R8*1)
  8961. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8962. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
  8963. MOVOU (R9), X0
  8964. MOVOU -16(R9)(R8*1), X1
  8965. MOVOU X0, (AX)
  8966. MOVOU X1, -16(AX)(R8*1)
  8967. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8968. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
  8969. MOVOU (R9), X0
  8970. MOVOU 16(R9), X1
  8971. MOVOU -32(R9)(R8*1), X2
  8972. MOVOU -16(R9)(R8*1), X3
  8973. MOVOU X0, (AX)
  8974. MOVOU X1, 16(AX)
  8975. MOVOU X2, -32(AX)(R8*1)
  8976. MOVOU X3, -16(AX)(R8*1)
  8977. memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
  8978. MOVQ BX, AX
  8979. JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
  8980. memmove_long_match_emit_encodeBetterBlockAsm8B:
  8981. LEAQ (AX)(R8*1), BX
  8982. // genMemMoveLong
  8983. MOVOU (R9), X0
  8984. MOVOU 16(R9), X1
  8985. MOVOU -32(R9)(R8*1), X2
  8986. MOVOU -16(R9)(R8*1), X3
  8987. MOVQ R8, R12
  8988. SHRQ $0x05, R12
  8989. MOVQ AX, R10
  8990. ANDL $0x0000001f, R10
  8991. MOVQ $0x00000040, R13
  8992. SUBQ R10, R13
  8993. DECQ R12
  8994. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  8995. LEAQ -32(R9)(R13*1), R10
  8996. LEAQ -32(AX)(R13*1), R14
  8997. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
  8998. MOVOU (R10), X4
  8999. MOVOU 16(R10), X5
  9000. MOVOA X4, (R14)
  9001. MOVOA X5, 16(R14)
  9002. ADDQ $0x20, R14
  9003. ADDQ $0x20, R10
  9004. ADDQ $0x20, R13
  9005. DECQ R12
  9006. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
  9007. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  9008. MOVOU -32(R9)(R13*1), X4
  9009. MOVOU -16(R9)(R13*1), X5
  9010. MOVOA X4, -32(AX)(R13*1)
  9011. MOVOA X5, -16(AX)(R13*1)
  9012. ADDQ $0x20, R13
  9013. CMPQ R8, R13
  9014. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9015. MOVOU X0, (AX)
  9016. MOVOU X1, 16(AX)
  9017. MOVOU X2, -32(AX)(R8*1)
  9018. MOVOU X3, -16(AX)(R8*1)
  9019. MOVQ BX, AX
  9020. emit_literal_done_match_emit_encodeBetterBlockAsm8B:
  9021. ADDL R11, CX
  9022. ADDL $0x04, R11
  9023. MOVL CX, 12(SP)
  9024. // emitCopy
  9025. CMPL R11, $0x40
  9026. JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
  9027. CMPL DI, $0x00000800
  9028. JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
  9029. MOVL $0x00000001, BX
  9030. LEAL 16(BX), BX
  9031. MOVB DI, 1(AX)
  9032. SHRL $0x08, DI
  9033. SHLL $0x05, DI
  9034. ORL DI, BX
  9035. MOVB BL, (AX)
  9036. ADDQ $0x02, AX
  9037. SUBL $0x08, R11
  9038. // emitRepeat
  9039. LEAL -4(R11), R11
  9040. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  9041. MOVL R11, BX
  9042. LEAL -4(R11), R11
  9043. CMPL BX, $0x08
  9044. JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  9045. CMPL BX, $0x0c
  9046. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  9047. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  9048. CMPL R11, $0x00000104
  9049. JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  9050. LEAL -256(R11), R11
  9051. MOVW $0x0019, (AX)
  9052. MOVW R11, 2(AX)
  9053. ADDQ $0x04, AX
  9054. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9055. repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  9056. LEAL -4(R11), R11
  9057. MOVW $0x0015, (AX)
  9058. MOVB R11, 2(AX)
  9059. ADDQ $0x03, AX
  9060. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9061. repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  9062. SHLL $0x02, R11
  9063. ORL $0x01, R11
  9064. MOVW R11, (AX)
  9065. ADDQ $0x02, AX
  9066. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9067. XORQ BX, BX
  9068. LEAL 1(BX)(R11*4), R11
  9069. MOVB DI, 1(AX)
  9070. SARL $0x08, DI
  9071. SHLL $0x05, DI
  9072. ORL DI, R11
  9073. MOVB R11, (AX)
  9074. ADDQ $0x02, AX
  9075. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9076. long_offset_short_match_nolit_encodeBetterBlockAsm8B:
  9077. MOVB $0xee, (AX)
  9078. MOVW DI, 1(AX)
  9079. LEAL -60(R11), R11
  9080. ADDQ $0x03, AX
  9081. // emitRepeat
  9082. MOVL R11, BX
  9083. LEAL -4(R11), R11
  9084. CMPL BX, $0x08
  9085. JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  9086. CMPL BX, $0x0c
  9087. JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  9088. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  9089. CMPL R11, $0x00000104
  9090. JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  9091. LEAL -256(R11), R11
  9092. MOVW $0x0019, (AX)
  9093. MOVW R11, 2(AX)
  9094. ADDQ $0x04, AX
  9095. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9096. repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  9097. LEAL -4(R11), R11
  9098. MOVW $0x0015, (AX)
  9099. MOVB R11, 2(AX)
  9100. ADDQ $0x03, AX
  9101. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9102. repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  9103. SHLL $0x02, R11
  9104. ORL $0x01, R11
  9105. MOVW R11, (AX)
  9106. ADDQ $0x02, AX
  9107. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9108. XORQ BX, BX
  9109. LEAL 1(BX)(R11*4), R11
  9110. MOVB DI, 1(AX)
  9111. SARL $0x08, DI
  9112. SHLL $0x05, DI
  9113. ORL DI, R11
  9114. MOVB R11, (AX)
  9115. ADDQ $0x02, AX
  9116. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9117. two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
  9118. MOVL R11, BX
  9119. SHLL $0x02, BX
  9120. CMPL R11, $0x0c
  9121. JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
  9122. LEAL -15(BX), BX
  9123. MOVB DI, 1(AX)
  9124. SHRL $0x08, DI
  9125. SHLL $0x05, DI
  9126. ORL DI, BX
  9127. MOVB BL, (AX)
  9128. ADDQ $0x02, AX
  9129. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9130. emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
  9131. LEAL -2(BX), BX
  9132. MOVB BL, (AX)
  9133. MOVW DI, 1(AX)
  9134. ADDQ $0x03, AX
  9135. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9136. match_is_repeat_encodeBetterBlockAsm8B:
  9137. MOVL 12(SP), BX
  9138. CMPL BX, SI
  9139. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
  9140. MOVL SI, DI
  9141. MOVL SI, 12(SP)
  9142. LEAQ (DX)(BX*1), R8
  9143. SUBL BX, DI
  9144. LEAL -1(DI), BX
  9145. CMPL BX, $0x3c
  9146. JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B
  9147. CMPL BX, $0x00000100
  9148. JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
  9149. JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B
  9150. three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
  9151. MOVB $0xf4, (AX)
  9152. MOVW BX, 1(AX)
  9153. ADDQ $0x03, AX
  9154. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
  9155. two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
  9156. MOVB $0xf0, (AX)
  9157. MOVB BL, 1(AX)
  9158. ADDQ $0x02, AX
  9159. CMPL BX, $0x40
  9160. JB memmove_match_emit_repeat_encodeBetterBlockAsm8B
  9161. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
  9162. one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
  9163. SHLB $0x02, BL
  9164. MOVB BL, (AX)
  9165. ADDQ $0x01, AX
  9166. memmove_match_emit_repeat_encodeBetterBlockAsm8B:
  9167. LEAQ (AX)(DI*1), BX
  9168. // genMemMoveShort
  9169. CMPQ DI, $0x04
  9170. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
  9171. CMPQ DI, $0x08
  9172. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
  9173. CMPQ DI, $0x10
  9174. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
  9175. CMPQ DI, $0x20
  9176. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
  9177. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
  9178. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
  9179. MOVL (R8), R9
  9180. MOVL R9, (AX)
  9181. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  9182. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
  9183. MOVL (R8), R9
  9184. MOVL -4(R8)(DI*1), R8
  9185. MOVL R9, (AX)
  9186. MOVL R8, -4(AX)(DI*1)
  9187. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  9188. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
  9189. MOVQ (R8), R9
  9190. MOVQ -8(R8)(DI*1), R8
  9191. MOVQ R9, (AX)
  9192. MOVQ R8, -8(AX)(DI*1)
  9193. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  9194. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
  9195. MOVOU (R8), X0
  9196. MOVOU -16(R8)(DI*1), X1
  9197. MOVOU X0, (AX)
  9198. MOVOU X1, -16(AX)(DI*1)
  9199. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  9200. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
  9201. MOVOU (R8), X0
  9202. MOVOU 16(R8), X1
  9203. MOVOU -32(R8)(DI*1), X2
  9204. MOVOU -16(R8)(DI*1), X3
  9205. MOVOU X0, (AX)
  9206. MOVOU X1, 16(AX)
  9207. MOVOU X2, -32(AX)(DI*1)
  9208. MOVOU X3, -16(AX)(DI*1)
  9209. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
  9210. MOVQ BX, AX
  9211. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
  9212. memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
  9213. LEAQ (AX)(DI*1), BX
  9214. // genMemMoveLong
  9215. MOVOU (R8), X0
  9216. MOVOU 16(R8), X1
  9217. MOVOU -32(R8)(DI*1), X2
  9218. MOVOU -16(R8)(DI*1), X3
  9219. MOVQ DI, R10
  9220. SHRQ $0x05, R10
  9221. MOVQ AX, R9
  9222. ANDL $0x0000001f, R9
  9223. MOVQ $0x00000040, R12
  9224. SUBQ R9, R12
  9225. DECQ R10
  9226. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9227. LEAQ -32(R8)(R12*1), R9
  9228. LEAQ -32(AX)(R12*1), R13
  9229. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
  9230. MOVOU (R9), X4
  9231. MOVOU 16(R9), X5
  9232. MOVOA X4, (R13)
  9233. MOVOA X5, 16(R13)
  9234. ADDQ $0x20, R13
  9235. ADDQ $0x20, R9
  9236. ADDQ $0x20, R12
  9237. DECQ R10
  9238. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
  9239. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  9240. MOVOU -32(R8)(R12*1), X4
  9241. MOVOU -16(R8)(R12*1), X5
  9242. MOVOA X4, -32(AX)(R12*1)
  9243. MOVOA X5, -16(AX)(R12*1)
  9244. ADDQ $0x20, R12
  9245. CMPQ DI, R12
  9246. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9247. MOVOU X0, (AX)
  9248. MOVOU X1, 16(AX)
  9249. MOVOU X2, -32(AX)(DI*1)
  9250. MOVOU X3, -16(AX)(DI*1)
  9251. MOVQ BX, AX
  9252. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
  9253. ADDL R11, CX
  9254. ADDL $0x04, R11
  9255. MOVL CX, 12(SP)
  9256. // emitRepeat
  9257. MOVL R11, BX
  9258. LEAL -4(R11), R11
  9259. CMPL BX, $0x08
  9260. JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
  9261. CMPL BX, $0x0c
  9262. JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
  9263. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
  9264. CMPL R11, $0x00000104
  9265. JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
  9266. LEAL -256(R11), R11
  9267. MOVW $0x0019, (AX)
  9268. MOVW R11, 2(AX)
  9269. ADDQ $0x04, AX
  9270. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9271. repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
  9272. LEAL -4(R11), R11
  9273. MOVW $0x0015, (AX)
  9274. MOVB R11, 2(AX)
  9275. ADDQ $0x03, AX
  9276. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9277. repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
  9278. SHLL $0x02, R11
  9279. ORL $0x01, R11
  9280. MOVW R11, (AX)
  9281. ADDQ $0x02, AX
  9282. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  9283. XORQ BX, BX
  9284. LEAL 1(BX)(R11*4), R11
  9285. MOVB DI, 1(AX)
  9286. SARL $0x08, DI
  9287. SHLL $0x05, DI
  9288. ORL DI, R11
  9289. MOVB R11, (AX)
  9290. ADDQ $0x02, AX
  9291. match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
  9292. CMPL CX, 8(SP)
  9293. JAE emit_remainder_encodeBetterBlockAsm8B
  9294. CMPQ AX, (SP)
  9295. JB match_nolit_dst_ok_encodeBetterBlockAsm8B
  9296. MOVQ $0x00000000, ret+48(FP)
  9297. RET
  9298. match_nolit_dst_ok_encodeBetterBlockAsm8B:
  9299. MOVQ $0x0000cf1bbcdcbf9b, BX
  9300. MOVQ $0x9e3779b1, DI
  9301. LEAQ 1(SI), SI
  9302. LEAQ -2(CX), R8
  9303. MOVQ (DX)(SI*1), R9
  9304. MOVQ 1(DX)(SI*1), R10
  9305. MOVQ (DX)(R8*1), R11
  9306. MOVQ 1(DX)(R8*1), R12
  9307. SHLQ $0x10, R9
  9308. IMULQ BX, R9
  9309. SHRQ $0x36, R9
  9310. SHLQ $0x20, R10
  9311. IMULQ DI, R10
  9312. SHRQ $0x38, R10
  9313. SHLQ $0x10, R11
  9314. IMULQ BX, R11
  9315. SHRQ $0x36, R11
  9316. SHLQ $0x20, R12
  9317. IMULQ DI, R12
  9318. SHRQ $0x38, R12
  9319. LEAQ 1(SI), DI
  9320. LEAQ 1(R8), R13
  9321. MOVL SI, 24(SP)(R9*4)
  9322. MOVL R8, 24(SP)(R11*4)
  9323. MOVL DI, 4120(SP)(R10*4)
  9324. MOVL R13, 4120(SP)(R12*4)
  9325. LEAQ 1(R8)(SI*1), DI
  9326. SHRQ $0x01, DI
  9327. ADDQ $0x01, SI
  9328. SUBQ $0x01, R8
  9329. index_loop_encodeBetterBlockAsm8B:
  9330. CMPQ DI, R8
  9331. JAE search_loop_encodeBetterBlockAsm8B
  9332. MOVQ (DX)(SI*1), R9
  9333. MOVQ (DX)(DI*1), R10
  9334. SHLQ $0x10, R9
  9335. IMULQ BX, R9
  9336. SHRQ $0x36, R9
  9337. SHLQ $0x10, R10
  9338. IMULQ BX, R10
  9339. SHRQ $0x36, R10
  9340. MOVL SI, 24(SP)(R9*4)
  9341. MOVL DI, 24(SP)(R10*4)
  9342. ADDQ $0x02, SI
  9343. ADDQ $0x02, DI
  9344. JMP index_loop_encodeBetterBlockAsm8B
  9345. emit_remainder_encodeBetterBlockAsm8B:
  9346. MOVQ src_len+32(FP), CX
  9347. SUBL 12(SP), CX
  9348. LEAQ 3(AX)(CX*1), CX
  9349. CMPQ CX, (SP)
  9350. JB emit_remainder_ok_encodeBetterBlockAsm8B
  9351. MOVQ $0x00000000, ret+48(FP)
  9352. RET
  9353. emit_remainder_ok_encodeBetterBlockAsm8B:
  9354. MOVQ src_len+32(FP), CX
  9355. MOVL 12(SP), BX
  9356. CMPL BX, CX
  9357. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
  9358. MOVL CX, SI
  9359. MOVL CX, 12(SP)
  9360. LEAQ (DX)(BX*1), CX
  9361. SUBL BX, SI
  9362. LEAL -1(SI), DX
  9363. CMPL DX, $0x3c
  9364. JB one_byte_emit_remainder_encodeBetterBlockAsm8B
  9365. CMPL DX, $0x00000100
  9366. JB two_bytes_emit_remainder_encodeBetterBlockAsm8B
  9367. JB three_bytes_emit_remainder_encodeBetterBlockAsm8B
  9368. three_bytes_emit_remainder_encodeBetterBlockAsm8B:
  9369. MOVB $0xf4, (AX)
  9370. MOVW DX, 1(AX)
  9371. ADDQ $0x03, AX
  9372. JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
  9373. two_bytes_emit_remainder_encodeBetterBlockAsm8B:
  9374. MOVB $0xf0, (AX)
  9375. MOVB DL, 1(AX)
  9376. ADDQ $0x02, AX
  9377. CMPL DX, $0x40
  9378. JB memmove_emit_remainder_encodeBetterBlockAsm8B
  9379. JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
  9380. one_byte_emit_remainder_encodeBetterBlockAsm8B:
  9381. SHLB $0x02, DL
  9382. MOVB DL, (AX)
  9383. ADDQ $0x01, AX
  9384. memmove_emit_remainder_encodeBetterBlockAsm8B:
  9385. LEAQ (AX)(SI*1), DX
  9386. MOVL SI, BX
  9387. // genMemMoveShort
  9388. CMPQ BX, $0x03
  9389. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
  9390. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
  9391. CMPQ BX, $0x08
  9392. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
  9393. CMPQ BX, $0x10
  9394. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
  9395. CMPQ BX, $0x20
  9396. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
  9397. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
  9398. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
  9399. MOVB (CX), SI
  9400. MOVB -1(CX)(BX*1), CL
  9401. MOVB SI, (AX)
  9402. MOVB CL, -1(AX)(BX*1)
  9403. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9404. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
  9405. MOVW (CX), SI
  9406. MOVB 2(CX), CL
  9407. MOVW SI, (AX)
  9408. MOVB CL, 2(AX)
  9409. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9410. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
  9411. MOVL (CX), SI
  9412. MOVL -4(CX)(BX*1), CX
  9413. MOVL SI, (AX)
  9414. MOVL CX, -4(AX)(BX*1)
  9415. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9416. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
  9417. MOVQ (CX), SI
  9418. MOVQ -8(CX)(BX*1), CX
  9419. MOVQ SI, (AX)
  9420. MOVQ CX, -8(AX)(BX*1)
  9421. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9422. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
  9423. MOVOU (CX), X0
  9424. MOVOU -16(CX)(BX*1), X1
  9425. MOVOU X0, (AX)
  9426. MOVOU X1, -16(AX)(BX*1)
  9427. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9428. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
  9429. MOVOU (CX), X0
  9430. MOVOU 16(CX), X1
  9431. MOVOU -32(CX)(BX*1), X2
  9432. MOVOU -16(CX)(BX*1), X3
  9433. MOVOU X0, (AX)
  9434. MOVOU X1, 16(AX)
  9435. MOVOU X2, -32(AX)(BX*1)
  9436. MOVOU X3, -16(AX)(BX*1)
  9437. memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
  9438. MOVQ DX, AX
  9439. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
  9440. memmove_long_emit_remainder_encodeBetterBlockAsm8B:
  9441. LEAQ (AX)(SI*1), DX
  9442. MOVL SI, BX
  9443. // genMemMoveLong
  9444. MOVOU (CX), X0
  9445. MOVOU 16(CX), X1
  9446. MOVOU -32(CX)(BX*1), X2
  9447. MOVOU -16(CX)(BX*1), X3
  9448. MOVQ BX, DI
  9449. SHRQ $0x05, DI
  9450. MOVQ AX, SI
  9451. ANDL $0x0000001f, SI
  9452. MOVQ $0x00000040, R8
  9453. SUBQ SI, R8
  9454. DECQ DI
  9455. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9456. LEAQ -32(CX)(R8*1), SI
  9457. LEAQ -32(AX)(R8*1), R9
  9458. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
  9459. MOVOU (SI), X4
  9460. MOVOU 16(SI), X5
  9461. MOVOA X4, (R9)
  9462. MOVOA X5, 16(R9)
  9463. ADDQ $0x20, R9
  9464. ADDQ $0x20, SI
  9465. ADDQ $0x20, R8
  9466. DECQ DI
  9467. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
  9468. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  9469. MOVOU -32(CX)(R8*1), X4
  9470. MOVOU -16(CX)(R8*1), X5
  9471. MOVOA X4, -32(AX)(R8*1)
  9472. MOVOA X5, -16(AX)(R8*1)
  9473. ADDQ $0x20, R8
  9474. CMPQ BX, R8
  9475. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9476. MOVOU X0, (AX)
  9477. MOVOU X1, 16(AX)
  9478. MOVOU X2, -32(AX)(BX*1)
  9479. MOVOU X3, -16(AX)(BX*1)
  9480. MOVQ DX, AX
  9481. emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
  9482. MOVQ dst_base+0(FP), CX
  9483. SUBQ CX, AX
  9484. MOVQ AX, ret+48(FP)
  9485. RET
  9486. // func encodeSnappyBlockAsm(dst []byte, src []byte) int
  9487. // Requires: BMI, SSE2
  9488. TEXT ·encodeSnappyBlockAsm(SB), $65560-56
  9489. MOVQ dst_base+0(FP), AX
  9490. MOVQ $0x00000200, CX
  9491. LEAQ 24(SP), DX
  9492. PXOR X0, X0
  9493. zero_loop_encodeSnappyBlockAsm:
  9494. MOVOU X0, (DX)
  9495. MOVOU X0, 16(DX)
  9496. MOVOU X0, 32(DX)
  9497. MOVOU X0, 48(DX)
  9498. MOVOU X0, 64(DX)
  9499. MOVOU X0, 80(DX)
  9500. MOVOU X0, 96(DX)
  9501. MOVOU X0, 112(DX)
  9502. ADDQ $0x80, DX
  9503. DECQ CX
  9504. JNZ zero_loop_encodeSnappyBlockAsm
  9505. MOVL $0x00000000, 12(SP)
  9506. MOVQ src_len+32(FP), CX
  9507. LEAQ -9(CX), DX
  9508. LEAQ -8(CX), BX
  9509. MOVL BX, 8(SP)
  9510. SHRQ $0x05, CX
  9511. SUBL CX, DX
  9512. LEAQ (AX)(DX*1), DX
  9513. MOVQ DX, (SP)
  9514. MOVL $0x00000001, CX
  9515. MOVL CX, 16(SP)
  9516. MOVQ src_base+24(FP), DX
  9517. search_loop_encodeSnappyBlockAsm:
  9518. MOVL CX, BX
  9519. SUBL 12(SP), BX
  9520. SHRL $0x06, BX
  9521. LEAL 4(CX)(BX*1), BX
  9522. CMPL BX, 8(SP)
  9523. JAE emit_remainder_encodeSnappyBlockAsm
  9524. MOVQ (DX)(CX*1), SI
  9525. MOVL BX, 20(SP)
  9526. MOVQ $0x0000cf1bbcdcbf9b, R8
  9527. MOVQ SI, R9
  9528. MOVQ SI, R10
  9529. SHRQ $0x08, R10
  9530. SHLQ $0x10, R9
  9531. IMULQ R8, R9
  9532. SHRQ $0x32, R9
  9533. SHLQ $0x10, R10
  9534. IMULQ R8, R10
  9535. SHRQ $0x32, R10
  9536. MOVL 24(SP)(R9*4), BX
  9537. MOVL 24(SP)(R10*4), DI
  9538. MOVL CX, 24(SP)(R9*4)
  9539. LEAL 1(CX), R9
  9540. MOVL R9, 24(SP)(R10*4)
  9541. MOVQ SI, R9
  9542. SHRQ $0x10, R9
  9543. SHLQ $0x10, R9
  9544. IMULQ R8, R9
  9545. SHRQ $0x32, R9
  9546. MOVL CX, R8
  9547. SUBL 16(SP), R8
  9548. MOVL 1(DX)(R8*1), R10
  9549. MOVQ SI, R8
  9550. SHRQ $0x08, R8
  9551. CMPL R8, R10
  9552. JNE no_repeat_found_encodeSnappyBlockAsm
  9553. LEAL 1(CX), SI
  9554. MOVL 12(SP), BX
  9555. MOVL SI, DI
  9556. SUBL 16(SP), DI
  9557. JZ repeat_extend_back_end_encodeSnappyBlockAsm
  9558. repeat_extend_back_loop_encodeSnappyBlockAsm:
  9559. CMPL SI, BX
  9560. JBE repeat_extend_back_end_encodeSnappyBlockAsm
  9561. MOVB -1(DX)(DI*1), R8
  9562. MOVB -1(DX)(SI*1), R9
  9563. CMPB R8, R9
  9564. JNE repeat_extend_back_end_encodeSnappyBlockAsm
  9565. LEAL -1(SI), SI
  9566. DECL DI
  9567. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
  9568. repeat_extend_back_end_encodeSnappyBlockAsm:
  9569. MOVL 12(SP), BX
  9570. CMPL BX, SI
  9571. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
  9572. MOVL SI, DI
  9573. MOVL SI, 12(SP)
  9574. LEAQ (DX)(BX*1), R8
  9575. SUBL BX, DI
  9576. LEAL -1(DI), BX
  9577. CMPL BX, $0x3c
  9578. JB one_byte_repeat_emit_encodeSnappyBlockAsm
  9579. CMPL BX, $0x00000100
  9580. JB two_bytes_repeat_emit_encodeSnappyBlockAsm
  9581. CMPL BX, $0x00010000
  9582. JB three_bytes_repeat_emit_encodeSnappyBlockAsm
  9583. CMPL BX, $0x01000000
  9584. JB four_bytes_repeat_emit_encodeSnappyBlockAsm
  9585. MOVB $0xfc, (AX)
  9586. MOVL BX, 1(AX)
  9587. ADDQ $0x05, AX
  9588. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9589. four_bytes_repeat_emit_encodeSnappyBlockAsm:
  9590. MOVL BX, R9
  9591. SHRL $0x10, R9
  9592. MOVB $0xf8, (AX)
  9593. MOVW BX, 1(AX)
  9594. MOVB R9, 3(AX)
  9595. ADDQ $0x04, AX
  9596. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9597. three_bytes_repeat_emit_encodeSnappyBlockAsm:
  9598. MOVB $0xf4, (AX)
  9599. MOVW BX, 1(AX)
  9600. ADDQ $0x03, AX
  9601. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9602. two_bytes_repeat_emit_encodeSnappyBlockAsm:
  9603. MOVB $0xf0, (AX)
  9604. MOVB BL, 1(AX)
  9605. ADDQ $0x02, AX
  9606. CMPL BX, $0x40
  9607. JB memmove_repeat_emit_encodeSnappyBlockAsm
  9608. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9609. one_byte_repeat_emit_encodeSnappyBlockAsm:
  9610. SHLB $0x02, BL
  9611. MOVB BL, (AX)
  9612. ADDQ $0x01, AX
  9613. memmove_repeat_emit_encodeSnappyBlockAsm:
  9614. LEAQ (AX)(DI*1), BX
  9615. // genMemMoveShort
  9616. CMPQ DI, $0x08
  9617. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
  9618. CMPQ DI, $0x10
  9619. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
  9620. CMPQ DI, $0x20
  9621. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
  9622. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
  9623. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
  9624. MOVQ (R8), R9
  9625. MOVQ R9, (AX)
  9626. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9627. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
  9628. MOVQ (R8), R9
  9629. MOVQ -8(R8)(DI*1), R8
  9630. MOVQ R9, (AX)
  9631. MOVQ R8, -8(AX)(DI*1)
  9632. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9633. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
  9634. MOVOU (R8), X0
  9635. MOVOU -16(R8)(DI*1), X1
  9636. MOVOU X0, (AX)
  9637. MOVOU X1, -16(AX)(DI*1)
  9638. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9639. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
  9640. MOVOU (R8), X0
  9641. MOVOU 16(R8), X1
  9642. MOVOU -32(R8)(DI*1), X2
  9643. MOVOU -16(R8)(DI*1), X3
  9644. MOVOU X0, (AX)
  9645. MOVOU X1, 16(AX)
  9646. MOVOU X2, -32(AX)(DI*1)
  9647. MOVOU X3, -16(AX)(DI*1)
  9648. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
  9649. MOVQ BX, AX
  9650. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
  9651. memmove_long_repeat_emit_encodeSnappyBlockAsm:
  9652. LEAQ (AX)(DI*1), BX
  9653. // genMemMoveLong
  9654. MOVOU (R8), X0
  9655. MOVOU 16(R8), X1
  9656. MOVOU -32(R8)(DI*1), X2
  9657. MOVOU -16(R8)(DI*1), X3
  9658. MOVQ DI, R10
  9659. SHRQ $0x05, R10
  9660. MOVQ AX, R9
  9661. ANDL $0x0000001f, R9
  9662. MOVQ $0x00000040, R11
  9663. SUBQ R9, R11
  9664. DECQ R10
  9665. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  9666. LEAQ -32(R8)(R11*1), R9
  9667. LEAQ -32(AX)(R11*1), R12
  9668. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
  9669. MOVOU (R9), X4
  9670. MOVOU 16(R9), X5
  9671. MOVOA X4, (R12)
  9672. MOVOA X5, 16(R12)
  9673. ADDQ $0x20, R12
  9674. ADDQ $0x20, R9
  9675. ADDQ $0x20, R11
  9676. DECQ R10
  9677. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
  9678. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  9679. MOVOU -32(R8)(R11*1), X4
  9680. MOVOU -16(R8)(R11*1), X5
  9681. MOVOA X4, -32(AX)(R11*1)
  9682. MOVOA X5, -16(AX)(R11*1)
  9683. ADDQ $0x20, R11
  9684. CMPQ DI, R11
  9685. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  9686. MOVOU X0, (AX)
  9687. MOVOU X1, 16(AX)
  9688. MOVOU X2, -32(AX)(DI*1)
  9689. MOVOU X3, -16(AX)(DI*1)
  9690. MOVQ BX, AX
  9691. emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
  9692. ADDL $0x05, CX
  9693. MOVL CX, BX
  9694. SUBL 16(SP), BX
  9695. MOVQ src_len+32(FP), DI
  9696. SUBL CX, DI
  9697. LEAQ (DX)(CX*1), R8
  9698. LEAQ (DX)(BX*1), BX
  9699. // matchLen
  9700. XORL R10, R10
  9701. matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
  9702. CMPL DI, $0x10
  9703. JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm
  9704. MOVQ (R8)(R10*1), R9
  9705. MOVQ 8(R8)(R10*1), R11
  9706. XORQ (BX)(R10*1), R9
  9707. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
  9708. XORQ 8(BX)(R10*1), R11
  9709. JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
  9710. LEAL -16(DI), DI
  9711. LEAL 16(R10), R10
  9712. JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm
  9713. matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
  9714. #ifdef GOAMD64_v3
  9715. TZCNTQ R11, R11
  9716. #else
  9717. BSFQ R11, R11
  9718. #endif
  9719. SARQ $0x03, R11
  9720. LEAL 8(R10)(R11*1), R10
  9721. JMP repeat_extend_forward_end_encodeSnappyBlockAsm
  9722. matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
  9723. CMPL DI, $0x08
  9724. JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm
  9725. MOVQ (R8)(R10*1), R9
  9726. XORQ (BX)(R10*1), R9
  9727. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
  9728. LEAL -8(DI), DI
  9729. LEAL 8(R10), R10
  9730. JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm
  9731. matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
  9732. #ifdef GOAMD64_v3
  9733. TZCNTQ R9, R9
  9734. #else
  9735. BSFQ R9, R9
  9736. #endif
  9737. SARQ $0x03, R9
  9738. LEAL (R10)(R9*1), R10
  9739. JMP repeat_extend_forward_end_encodeSnappyBlockAsm
  9740. matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
  9741. CMPL DI, $0x04
  9742. JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm
  9743. MOVL (R8)(R10*1), R9
  9744. CMPL (BX)(R10*1), R9
  9745. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
  9746. LEAL -4(DI), DI
  9747. LEAL 4(R10), R10
  9748. matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
  9749. CMPL DI, $0x01
  9750. JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
  9751. JB repeat_extend_forward_end_encodeSnappyBlockAsm
  9752. MOVW (R8)(R10*1), R9
  9753. CMPW (BX)(R10*1), R9
  9754. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
  9755. LEAL 2(R10), R10
  9756. SUBL $0x02, DI
  9757. JZ repeat_extend_forward_end_encodeSnappyBlockAsm
  9758. matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
  9759. MOVB (R8)(R10*1), R9
  9760. CMPB (BX)(R10*1), R9
  9761. JNE repeat_extend_forward_end_encodeSnappyBlockAsm
  9762. LEAL 1(R10), R10
  9763. repeat_extend_forward_end_encodeSnappyBlockAsm:
  9764. ADDL R10, CX
  9765. MOVL CX, BX
  9766. SUBL SI, BX
  9767. MOVL 16(SP), SI
  9768. // emitCopy
  9769. CMPL SI, $0x00010000
  9770. JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
  9771. four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
  9772. CMPL BX, $0x40
  9773. JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
  9774. MOVB $0xff, (AX)
  9775. MOVL SI, 1(AX)
  9776. LEAL -64(BX), BX
  9777. ADDQ $0x05, AX
  9778. CMPL BX, $0x04
  9779. JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
  9780. JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
  9781. four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
  9782. TESTL BX, BX
  9783. JZ repeat_end_emit_encodeSnappyBlockAsm
  9784. XORL DI, DI
  9785. LEAL -1(DI)(BX*4), BX
  9786. MOVB BL, (AX)
  9787. MOVL SI, 1(AX)
  9788. ADDQ $0x05, AX
  9789. JMP repeat_end_emit_encodeSnappyBlockAsm
  9790. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
  9791. CMPL BX, $0x40
  9792. JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
  9793. MOVB $0xee, (AX)
  9794. MOVW SI, 1(AX)
  9795. LEAL -60(BX), BX
  9796. ADDQ $0x03, AX
  9797. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
  9798. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
  9799. MOVL BX, DI
  9800. SHLL $0x02, DI
  9801. CMPL BX, $0x0c
  9802. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
  9803. CMPL SI, $0x00000800
  9804. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
  9805. LEAL -15(DI), DI
  9806. MOVB SI, 1(AX)
  9807. SHRL $0x08, SI
  9808. SHLL $0x05, SI
  9809. ORL SI, DI
  9810. MOVB DI, (AX)
  9811. ADDQ $0x02, AX
  9812. JMP repeat_end_emit_encodeSnappyBlockAsm
  9813. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
  9814. LEAL -2(DI), DI
  9815. MOVB DI, (AX)
  9816. MOVW SI, 1(AX)
  9817. ADDQ $0x03, AX
  9818. repeat_end_emit_encodeSnappyBlockAsm:
  9819. MOVL CX, 12(SP)
  9820. JMP search_loop_encodeSnappyBlockAsm
  9821. no_repeat_found_encodeSnappyBlockAsm:
  9822. CMPL (DX)(BX*1), SI
  9823. JEQ candidate_match_encodeSnappyBlockAsm
  9824. SHRQ $0x08, SI
  9825. MOVL 24(SP)(R9*4), BX
  9826. LEAL 2(CX), R8
  9827. CMPL (DX)(DI*1), SI
  9828. JEQ candidate2_match_encodeSnappyBlockAsm
  9829. MOVL R8, 24(SP)(R9*4)
  9830. SHRQ $0x08, SI
  9831. CMPL (DX)(BX*1), SI
  9832. JEQ candidate3_match_encodeSnappyBlockAsm
  9833. MOVL 20(SP), CX
  9834. JMP search_loop_encodeSnappyBlockAsm
  9835. candidate3_match_encodeSnappyBlockAsm:
  9836. ADDL $0x02, CX
  9837. JMP candidate_match_encodeSnappyBlockAsm
  9838. candidate2_match_encodeSnappyBlockAsm:
  9839. MOVL R8, 24(SP)(R9*4)
  9840. INCL CX
  9841. MOVL DI, BX
  9842. candidate_match_encodeSnappyBlockAsm:
  9843. MOVL 12(SP), SI
  9844. TESTL BX, BX
  9845. JZ match_extend_back_end_encodeSnappyBlockAsm
  9846. match_extend_back_loop_encodeSnappyBlockAsm:
  9847. CMPL CX, SI
  9848. JBE match_extend_back_end_encodeSnappyBlockAsm
  9849. MOVB -1(DX)(BX*1), DI
  9850. MOVB -1(DX)(CX*1), R8
  9851. CMPB DI, R8
  9852. JNE match_extend_back_end_encodeSnappyBlockAsm
  9853. LEAL -1(CX), CX
  9854. DECL BX
  9855. JZ match_extend_back_end_encodeSnappyBlockAsm
  9856. JMP match_extend_back_loop_encodeSnappyBlockAsm
  9857. match_extend_back_end_encodeSnappyBlockAsm:
  9858. MOVL CX, SI
  9859. SUBL 12(SP), SI
  9860. LEAQ 5(AX)(SI*1), SI
  9861. CMPQ SI, (SP)
  9862. JB match_dst_size_check_encodeSnappyBlockAsm
  9863. MOVQ $0x00000000, ret+48(FP)
  9864. RET
  9865. match_dst_size_check_encodeSnappyBlockAsm:
  9866. MOVL CX, SI
  9867. MOVL 12(SP), DI
  9868. CMPL DI, SI
  9869. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
  9870. MOVL SI, R8
  9871. MOVL SI, 12(SP)
  9872. LEAQ (DX)(DI*1), SI
  9873. SUBL DI, R8
  9874. LEAL -1(R8), DI
  9875. CMPL DI, $0x3c
  9876. JB one_byte_match_emit_encodeSnappyBlockAsm
  9877. CMPL DI, $0x00000100
  9878. JB two_bytes_match_emit_encodeSnappyBlockAsm
  9879. CMPL DI, $0x00010000
  9880. JB three_bytes_match_emit_encodeSnappyBlockAsm
  9881. CMPL DI, $0x01000000
  9882. JB four_bytes_match_emit_encodeSnappyBlockAsm
  9883. MOVB $0xfc, (AX)
  9884. MOVL DI, 1(AX)
  9885. ADDQ $0x05, AX
  9886. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9887. four_bytes_match_emit_encodeSnappyBlockAsm:
  9888. MOVL DI, R9
  9889. SHRL $0x10, R9
  9890. MOVB $0xf8, (AX)
  9891. MOVW DI, 1(AX)
  9892. MOVB R9, 3(AX)
  9893. ADDQ $0x04, AX
  9894. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9895. three_bytes_match_emit_encodeSnappyBlockAsm:
  9896. MOVB $0xf4, (AX)
  9897. MOVW DI, 1(AX)
  9898. ADDQ $0x03, AX
  9899. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9900. two_bytes_match_emit_encodeSnappyBlockAsm:
  9901. MOVB $0xf0, (AX)
  9902. MOVB DI, 1(AX)
  9903. ADDQ $0x02, AX
  9904. CMPL DI, $0x40
  9905. JB memmove_match_emit_encodeSnappyBlockAsm
  9906. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9907. one_byte_match_emit_encodeSnappyBlockAsm:
  9908. SHLB $0x02, DI
  9909. MOVB DI, (AX)
  9910. ADDQ $0x01, AX
  9911. memmove_match_emit_encodeSnappyBlockAsm:
  9912. LEAQ (AX)(R8*1), DI
  9913. // genMemMoveShort
  9914. CMPQ R8, $0x08
  9915. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
  9916. CMPQ R8, $0x10
  9917. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
  9918. CMPQ R8, $0x20
  9919. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
  9920. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
  9921. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
  9922. MOVQ (SI), R9
  9923. MOVQ R9, (AX)
  9924. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9925. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
  9926. MOVQ (SI), R9
  9927. MOVQ -8(SI)(R8*1), SI
  9928. MOVQ R9, (AX)
  9929. MOVQ SI, -8(AX)(R8*1)
  9930. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9931. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
  9932. MOVOU (SI), X0
  9933. MOVOU -16(SI)(R8*1), X1
  9934. MOVOU X0, (AX)
  9935. MOVOU X1, -16(AX)(R8*1)
  9936. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9937. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
  9938. MOVOU (SI), X0
  9939. MOVOU 16(SI), X1
  9940. MOVOU -32(SI)(R8*1), X2
  9941. MOVOU -16(SI)(R8*1), X3
  9942. MOVOU X0, (AX)
  9943. MOVOU X1, 16(AX)
  9944. MOVOU X2, -32(AX)(R8*1)
  9945. MOVOU X3, -16(AX)(R8*1)
  9946. memmove_end_copy_match_emit_encodeSnappyBlockAsm:
  9947. MOVQ DI, AX
  9948. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
  9949. memmove_long_match_emit_encodeSnappyBlockAsm:
  9950. LEAQ (AX)(R8*1), DI
  9951. // genMemMoveLong
  9952. MOVOU (SI), X0
  9953. MOVOU 16(SI), X1
  9954. MOVOU -32(SI)(R8*1), X2
  9955. MOVOU -16(SI)(R8*1), X3
  9956. MOVQ R8, R10
  9957. SHRQ $0x05, R10
  9958. MOVQ AX, R9
  9959. ANDL $0x0000001f, R9
  9960. MOVQ $0x00000040, R11
  9961. SUBQ R9, R11
  9962. DECQ R10
  9963. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  9964. LEAQ -32(SI)(R11*1), R9
  9965. LEAQ -32(AX)(R11*1), R12
  9966. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
  9967. MOVOU (R9), X4
  9968. MOVOU 16(R9), X5
  9969. MOVOA X4, (R12)
  9970. MOVOA X5, 16(R12)
  9971. ADDQ $0x20, R12
  9972. ADDQ $0x20, R9
  9973. ADDQ $0x20, R11
  9974. DECQ R10
  9975. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
  9976. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  9977. MOVOU -32(SI)(R11*1), X4
  9978. MOVOU -16(SI)(R11*1), X5
  9979. MOVOA X4, -32(AX)(R11*1)
  9980. MOVOA X5, -16(AX)(R11*1)
  9981. ADDQ $0x20, R11
  9982. CMPQ R8, R11
  9983. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  9984. MOVOU X0, (AX)
  9985. MOVOU X1, 16(AX)
  9986. MOVOU X2, -32(AX)(R8*1)
  9987. MOVOU X3, -16(AX)(R8*1)
  9988. MOVQ DI, AX
  9989. emit_literal_done_match_emit_encodeSnappyBlockAsm:
  9990. match_nolit_loop_encodeSnappyBlockAsm:
  9991. MOVL CX, SI
  9992. SUBL BX, SI
  9993. MOVL SI, 16(SP)
  9994. ADDL $0x04, CX
  9995. ADDL $0x04, BX
  9996. MOVQ src_len+32(FP), SI
  9997. SUBL CX, SI
  9998. LEAQ (DX)(CX*1), DI
  9999. LEAQ (DX)(BX*1), BX
  10000. // matchLen
  10001. XORL R9, R9
  10002. matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
  10003. CMPL SI, $0x10
  10004. JB matchlen_match8_match_nolit_encodeSnappyBlockAsm
  10005. MOVQ (DI)(R9*1), R8
  10006. MOVQ 8(DI)(R9*1), R10
  10007. XORQ (BX)(R9*1), R8
  10008. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
  10009. XORQ 8(BX)(R9*1), R10
  10010. JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
  10011. LEAL -16(SI), SI
  10012. LEAL 16(R9), R9
  10013. JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm
  10014. matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
  10015. #ifdef GOAMD64_v3
  10016. TZCNTQ R10, R10
  10017. #else
  10018. BSFQ R10, R10
  10019. #endif
  10020. SARQ $0x03, R10
  10021. LEAL 8(R9)(R10*1), R9
  10022. JMP match_nolit_end_encodeSnappyBlockAsm
  10023. matchlen_match8_match_nolit_encodeSnappyBlockAsm:
  10024. CMPL SI, $0x08
  10025. JB matchlen_match4_match_nolit_encodeSnappyBlockAsm
  10026. MOVQ (DI)(R9*1), R8
  10027. XORQ (BX)(R9*1), R8
  10028. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
  10029. LEAL -8(SI), SI
  10030. LEAL 8(R9), R9
  10031. JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm
  10032. matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
  10033. #ifdef GOAMD64_v3
  10034. TZCNTQ R8, R8
  10035. #else
  10036. BSFQ R8, R8
  10037. #endif
  10038. SARQ $0x03, R8
  10039. LEAL (R9)(R8*1), R9
  10040. JMP match_nolit_end_encodeSnappyBlockAsm
  10041. matchlen_match4_match_nolit_encodeSnappyBlockAsm:
  10042. CMPL SI, $0x04
  10043. JB matchlen_match2_match_nolit_encodeSnappyBlockAsm
  10044. MOVL (DI)(R9*1), R8
  10045. CMPL (BX)(R9*1), R8
  10046. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
  10047. LEAL -4(SI), SI
  10048. LEAL 4(R9), R9
  10049. matchlen_match2_match_nolit_encodeSnappyBlockAsm:
  10050. CMPL SI, $0x01
  10051. JE matchlen_match1_match_nolit_encodeSnappyBlockAsm
  10052. JB match_nolit_end_encodeSnappyBlockAsm
  10053. MOVW (DI)(R9*1), R8
  10054. CMPW (BX)(R9*1), R8
  10055. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
  10056. LEAL 2(R9), R9
  10057. SUBL $0x02, SI
  10058. JZ match_nolit_end_encodeSnappyBlockAsm
  10059. matchlen_match1_match_nolit_encodeSnappyBlockAsm:
  10060. MOVB (DI)(R9*1), R8
  10061. CMPB (BX)(R9*1), R8
  10062. JNE match_nolit_end_encodeSnappyBlockAsm
  10063. LEAL 1(R9), R9
  10064. match_nolit_end_encodeSnappyBlockAsm:
  10065. ADDL R9, CX
  10066. MOVL 16(SP), BX
  10067. ADDL $0x04, R9
  10068. MOVL CX, 12(SP)
  10069. // emitCopy
  10070. CMPL BX, $0x00010000
  10071. JB two_byte_offset_match_nolit_encodeSnappyBlockAsm
  10072. four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
  10073. CMPL R9, $0x40
  10074. JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
  10075. MOVB $0xff, (AX)
  10076. MOVL BX, 1(AX)
  10077. LEAL -64(R9), R9
  10078. ADDQ $0x05, AX
  10079. CMPL R9, $0x04
  10080. JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm
  10081. JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
  10082. four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
  10083. TESTL R9, R9
  10084. JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
  10085. XORL SI, SI
  10086. LEAL -1(SI)(R9*4), R9
  10087. MOVB R9, (AX)
  10088. MOVL BX, 1(AX)
  10089. ADDQ $0x05, AX
  10090. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
  10091. two_byte_offset_match_nolit_encodeSnappyBlockAsm:
  10092. CMPL R9, $0x40
  10093. JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
  10094. MOVB $0xee, (AX)
  10095. MOVW BX, 1(AX)
  10096. LEAL -60(R9), R9
  10097. ADDQ $0x03, AX
  10098. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
  10099. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
  10100. MOVL R9, SI
  10101. SHLL $0x02, SI
  10102. CMPL R9, $0x0c
  10103. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
  10104. CMPL BX, $0x00000800
  10105. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
  10106. LEAL -15(SI), SI
  10107. MOVB BL, 1(AX)
  10108. SHRL $0x08, BX
  10109. SHLL $0x05, BX
  10110. ORL BX, SI
  10111. MOVB SI, (AX)
  10112. ADDQ $0x02, AX
  10113. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
  10114. emit_copy_three_match_nolit_encodeSnappyBlockAsm:
  10115. LEAL -2(SI), SI
  10116. MOVB SI, (AX)
  10117. MOVW BX, 1(AX)
  10118. ADDQ $0x03, AX
  10119. match_nolit_emitcopy_end_encodeSnappyBlockAsm:
  10120. CMPL CX, 8(SP)
  10121. JAE emit_remainder_encodeSnappyBlockAsm
  10122. MOVQ -2(DX)(CX*1), SI
  10123. CMPQ AX, (SP)
  10124. JB match_nolit_dst_ok_encodeSnappyBlockAsm
  10125. MOVQ $0x00000000, ret+48(FP)
  10126. RET
  10127. match_nolit_dst_ok_encodeSnappyBlockAsm:
  10128. MOVQ $0x0000cf1bbcdcbf9b, R8
  10129. MOVQ SI, DI
  10130. SHRQ $0x10, SI
  10131. MOVQ SI, BX
  10132. SHLQ $0x10, DI
  10133. IMULQ R8, DI
  10134. SHRQ $0x32, DI
  10135. SHLQ $0x10, BX
  10136. IMULQ R8, BX
  10137. SHRQ $0x32, BX
  10138. LEAL -2(CX), R8
  10139. LEAQ 24(SP)(BX*4), R9
  10140. MOVL (R9), BX
  10141. MOVL R8, 24(SP)(DI*4)
  10142. MOVL CX, (R9)
  10143. CMPL (DX)(BX*1), SI
  10144. JEQ match_nolit_loop_encodeSnappyBlockAsm
  10145. INCL CX
  10146. JMP search_loop_encodeSnappyBlockAsm
  10147. emit_remainder_encodeSnappyBlockAsm:
  10148. MOVQ src_len+32(FP), CX
  10149. SUBL 12(SP), CX
  10150. LEAQ 5(AX)(CX*1), CX
  10151. CMPQ CX, (SP)
  10152. JB emit_remainder_ok_encodeSnappyBlockAsm
  10153. MOVQ $0x00000000, ret+48(FP)
  10154. RET
  10155. emit_remainder_ok_encodeSnappyBlockAsm:
  10156. MOVQ src_len+32(FP), CX
  10157. MOVL 12(SP), BX
  10158. CMPL BX, CX
  10159. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm
  10160. MOVL CX, SI
  10161. MOVL CX, 12(SP)
  10162. LEAQ (DX)(BX*1), CX
  10163. SUBL BX, SI
  10164. LEAL -1(SI), DX
  10165. CMPL DX, $0x3c
  10166. JB one_byte_emit_remainder_encodeSnappyBlockAsm
  10167. CMPL DX, $0x00000100
  10168. JB two_bytes_emit_remainder_encodeSnappyBlockAsm
  10169. CMPL DX, $0x00010000
  10170. JB three_bytes_emit_remainder_encodeSnappyBlockAsm
  10171. CMPL DX, $0x01000000
  10172. JB four_bytes_emit_remainder_encodeSnappyBlockAsm
  10173. MOVB $0xfc, (AX)
  10174. MOVL DX, 1(AX)
  10175. ADDQ $0x05, AX
  10176. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  10177. four_bytes_emit_remainder_encodeSnappyBlockAsm:
  10178. MOVL DX, BX
  10179. SHRL $0x10, BX
  10180. MOVB $0xf8, (AX)
  10181. MOVW DX, 1(AX)
  10182. MOVB BL, 3(AX)
  10183. ADDQ $0x04, AX
  10184. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  10185. three_bytes_emit_remainder_encodeSnappyBlockAsm:
  10186. MOVB $0xf4, (AX)
  10187. MOVW DX, 1(AX)
  10188. ADDQ $0x03, AX
  10189. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  10190. two_bytes_emit_remainder_encodeSnappyBlockAsm:
  10191. MOVB $0xf0, (AX)
  10192. MOVB DL, 1(AX)
  10193. ADDQ $0x02, AX
  10194. CMPL DX, $0x40
  10195. JB memmove_emit_remainder_encodeSnappyBlockAsm
  10196. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  10197. one_byte_emit_remainder_encodeSnappyBlockAsm:
  10198. SHLB $0x02, DL
  10199. MOVB DL, (AX)
  10200. ADDQ $0x01, AX
  10201. memmove_emit_remainder_encodeSnappyBlockAsm:
  10202. LEAQ (AX)(SI*1), DX
  10203. MOVL SI, BX
  10204. // genMemMoveShort
  10205. CMPQ BX, $0x03
  10206. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
  10207. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
  10208. CMPQ BX, $0x08
  10209. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
  10210. CMPQ BX, $0x10
  10211. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
  10212. CMPQ BX, $0x20
  10213. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
  10214. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
  10215. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
  10216. MOVB (CX), SI
  10217. MOVB -1(CX)(BX*1), CL
  10218. MOVB SI, (AX)
  10219. MOVB CL, -1(AX)(BX*1)
  10220. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  10221. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
  10222. MOVW (CX), SI
  10223. MOVB 2(CX), CL
  10224. MOVW SI, (AX)
  10225. MOVB CL, 2(AX)
  10226. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  10227. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
  10228. MOVL (CX), SI
  10229. MOVL -4(CX)(BX*1), CX
  10230. MOVL SI, (AX)
  10231. MOVL CX, -4(AX)(BX*1)
  10232. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  10233. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
  10234. MOVQ (CX), SI
  10235. MOVQ -8(CX)(BX*1), CX
  10236. MOVQ SI, (AX)
  10237. MOVQ CX, -8(AX)(BX*1)
  10238. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  10239. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
  10240. MOVOU (CX), X0
  10241. MOVOU -16(CX)(BX*1), X1
  10242. MOVOU X0, (AX)
  10243. MOVOU X1, -16(AX)(BX*1)
  10244. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  10245. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
  10246. MOVOU (CX), X0
  10247. MOVOU 16(CX), X1
  10248. MOVOU -32(CX)(BX*1), X2
  10249. MOVOU -16(CX)(BX*1), X3
  10250. MOVOU X0, (AX)
  10251. MOVOU X1, 16(AX)
  10252. MOVOU X2, -32(AX)(BX*1)
  10253. MOVOU X3, -16(AX)(BX*1)
  10254. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
  10255. MOVQ DX, AX
  10256. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
  10257. memmove_long_emit_remainder_encodeSnappyBlockAsm:
  10258. LEAQ (AX)(SI*1), DX
  10259. MOVL SI, BX
  10260. // genMemMoveLong
  10261. MOVOU (CX), X0
  10262. MOVOU 16(CX), X1
  10263. MOVOU -32(CX)(BX*1), X2
  10264. MOVOU -16(CX)(BX*1), X3
  10265. MOVQ BX, DI
  10266. SHRQ $0x05, DI
  10267. MOVQ AX, SI
  10268. ANDL $0x0000001f, SI
  10269. MOVQ $0x00000040, R8
  10270. SUBQ SI, R8
  10271. DECQ DI
  10272. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  10273. LEAQ -32(CX)(R8*1), SI
  10274. LEAQ -32(AX)(R8*1), R9
  10275. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
  10276. MOVOU (SI), X4
  10277. MOVOU 16(SI), X5
  10278. MOVOA X4, (R9)
  10279. MOVOA X5, 16(R9)
  10280. ADDQ $0x20, R9
  10281. ADDQ $0x20, SI
  10282. ADDQ $0x20, R8
  10283. DECQ DI
  10284. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
  10285. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  10286. MOVOU -32(CX)(R8*1), X4
  10287. MOVOU -16(CX)(R8*1), X5
  10288. MOVOA X4, -32(AX)(R8*1)
  10289. MOVOA X5, -16(AX)(R8*1)
  10290. ADDQ $0x20, R8
  10291. CMPQ BX, R8
  10292. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
  10293. MOVOU X0, (AX)
  10294. MOVOU X1, 16(AX)
  10295. MOVOU X2, -32(AX)(BX*1)
  10296. MOVOU X3, -16(AX)(BX*1)
  10297. MOVQ DX, AX
  10298. emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
  10299. MOVQ dst_base+0(FP), CX
  10300. SUBQ CX, AX
  10301. MOVQ AX, ret+48(FP)
  10302. RET
  10303. // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
  10304. // Requires: BMI, SSE2
  10305. TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
  10306. MOVQ dst_base+0(FP), AX
  10307. MOVQ $0x00000200, CX
  10308. LEAQ 24(SP), DX
  10309. PXOR X0, X0
  10310. zero_loop_encodeSnappyBlockAsm64K:
  10311. MOVOU X0, (DX)
  10312. MOVOU X0, 16(DX)
  10313. MOVOU X0, 32(DX)
  10314. MOVOU X0, 48(DX)
  10315. MOVOU X0, 64(DX)
  10316. MOVOU X0, 80(DX)
  10317. MOVOU X0, 96(DX)
  10318. MOVOU X0, 112(DX)
  10319. ADDQ $0x80, DX
  10320. DECQ CX
  10321. JNZ zero_loop_encodeSnappyBlockAsm64K
  10322. MOVL $0x00000000, 12(SP)
  10323. MOVQ src_len+32(FP), CX
  10324. LEAQ -9(CX), DX
  10325. LEAQ -8(CX), BX
  10326. MOVL BX, 8(SP)
  10327. SHRQ $0x05, CX
  10328. SUBL CX, DX
  10329. LEAQ (AX)(DX*1), DX
  10330. MOVQ DX, (SP)
  10331. MOVL $0x00000001, CX
  10332. MOVL CX, 16(SP)
  10333. MOVQ src_base+24(FP), DX
  10334. search_loop_encodeSnappyBlockAsm64K:
  10335. MOVL CX, BX
  10336. SUBL 12(SP), BX
  10337. SHRL $0x06, BX
  10338. LEAL 4(CX)(BX*1), BX
  10339. CMPL BX, 8(SP)
  10340. JAE emit_remainder_encodeSnappyBlockAsm64K
  10341. MOVQ (DX)(CX*1), SI
  10342. MOVL BX, 20(SP)
  10343. MOVQ $0x0000cf1bbcdcbf9b, R8
  10344. MOVQ SI, R9
  10345. MOVQ SI, R10
  10346. SHRQ $0x08, R10
  10347. SHLQ $0x10, R9
  10348. IMULQ R8, R9
  10349. SHRQ $0x32, R9
  10350. SHLQ $0x10, R10
  10351. IMULQ R8, R10
  10352. SHRQ $0x32, R10
  10353. MOVL 24(SP)(R9*4), BX
  10354. MOVL 24(SP)(R10*4), DI
  10355. MOVL CX, 24(SP)(R9*4)
  10356. LEAL 1(CX), R9
  10357. MOVL R9, 24(SP)(R10*4)
  10358. MOVQ SI, R9
  10359. SHRQ $0x10, R9
  10360. SHLQ $0x10, R9
  10361. IMULQ R8, R9
  10362. SHRQ $0x32, R9
  10363. MOVL CX, R8
  10364. SUBL 16(SP), R8
  10365. MOVL 1(DX)(R8*1), R10
  10366. MOVQ SI, R8
  10367. SHRQ $0x08, R8
  10368. CMPL R8, R10
  10369. JNE no_repeat_found_encodeSnappyBlockAsm64K
  10370. LEAL 1(CX), SI
  10371. MOVL 12(SP), BX
  10372. MOVL SI, DI
  10373. SUBL 16(SP), DI
  10374. JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
  10375. repeat_extend_back_loop_encodeSnappyBlockAsm64K:
  10376. CMPL SI, BX
  10377. JBE repeat_extend_back_end_encodeSnappyBlockAsm64K
  10378. MOVB -1(DX)(DI*1), R8
  10379. MOVB -1(DX)(SI*1), R9
  10380. CMPB R8, R9
  10381. JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
  10382. LEAL -1(SI), SI
  10383. DECL DI
  10384. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
  10385. repeat_extend_back_end_encodeSnappyBlockAsm64K:
  10386. MOVL 12(SP), BX
  10387. CMPL BX, SI
  10388. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
  10389. MOVL SI, DI
  10390. MOVL SI, 12(SP)
  10391. LEAQ (DX)(BX*1), R8
  10392. SUBL BX, DI
  10393. LEAL -1(DI), BX
  10394. CMPL BX, $0x3c
  10395. JB one_byte_repeat_emit_encodeSnappyBlockAsm64K
  10396. CMPL BX, $0x00000100
  10397. JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K
  10398. JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K
  10399. three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
  10400. MOVB $0xf4, (AX)
  10401. MOVW BX, 1(AX)
  10402. ADDQ $0x03, AX
  10403. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
  10404. two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
  10405. MOVB $0xf0, (AX)
  10406. MOVB BL, 1(AX)
  10407. ADDQ $0x02, AX
  10408. CMPL BX, $0x40
  10409. JB memmove_repeat_emit_encodeSnappyBlockAsm64K
  10410. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
  10411. one_byte_repeat_emit_encodeSnappyBlockAsm64K:
  10412. SHLB $0x02, BL
  10413. MOVB BL, (AX)
  10414. ADDQ $0x01, AX
  10415. memmove_repeat_emit_encodeSnappyBlockAsm64K:
  10416. LEAQ (AX)(DI*1), BX
  10417. // genMemMoveShort
  10418. CMPQ DI, $0x08
  10419. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
  10420. CMPQ DI, $0x10
  10421. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
  10422. CMPQ DI, $0x20
  10423. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
  10424. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
  10425. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
  10426. MOVQ (R8), R9
  10427. MOVQ R9, (AX)
  10428. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10429. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10430. MOVQ (R8), R9
  10431. MOVQ -8(R8)(DI*1), R8
  10432. MOVQ R9, (AX)
  10433. MOVQ R8, -8(AX)(DI*1)
  10434. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10435. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10436. MOVOU (R8), X0
  10437. MOVOU -16(R8)(DI*1), X1
  10438. MOVOU X0, (AX)
  10439. MOVOU X1, -16(AX)(DI*1)
  10440. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10441. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10442. MOVOU (R8), X0
  10443. MOVOU 16(R8), X1
  10444. MOVOU -32(R8)(DI*1), X2
  10445. MOVOU -16(R8)(DI*1), X3
  10446. MOVOU X0, (AX)
  10447. MOVOU X1, 16(AX)
  10448. MOVOU X2, -32(AX)(DI*1)
  10449. MOVOU X3, -16(AX)(DI*1)
  10450. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
  10451. MOVQ BX, AX
  10452. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
  10453. memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
  10454. LEAQ (AX)(DI*1), BX
  10455. // genMemMoveLong
  10456. MOVOU (R8), X0
  10457. MOVOU 16(R8), X1
  10458. MOVOU -32(R8)(DI*1), X2
  10459. MOVOU -16(R8)(DI*1), X3
  10460. MOVQ DI, R10
  10461. SHRQ $0x05, R10
  10462. MOVQ AX, R9
  10463. ANDL $0x0000001f, R9
  10464. MOVQ $0x00000040, R11
  10465. SUBQ R9, R11
  10466. DECQ R10
  10467. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10468. LEAQ -32(R8)(R11*1), R9
  10469. LEAQ -32(AX)(R11*1), R12
  10470. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
  10471. MOVOU (R9), X4
  10472. MOVOU 16(R9), X5
  10473. MOVOA X4, (R12)
  10474. MOVOA X5, 16(R12)
  10475. ADDQ $0x20, R12
  10476. ADDQ $0x20, R9
  10477. ADDQ $0x20, R11
  10478. DECQ R10
  10479. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
  10480. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  10481. MOVOU -32(R8)(R11*1), X4
  10482. MOVOU -16(R8)(R11*1), X5
  10483. MOVOA X4, -32(AX)(R11*1)
  10484. MOVOA X5, -16(AX)(R11*1)
  10485. ADDQ $0x20, R11
  10486. CMPQ DI, R11
  10487. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10488. MOVOU X0, (AX)
  10489. MOVOU X1, 16(AX)
  10490. MOVOU X2, -32(AX)(DI*1)
  10491. MOVOU X3, -16(AX)(DI*1)
  10492. MOVQ BX, AX
  10493. emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
  10494. ADDL $0x05, CX
  10495. MOVL CX, BX
  10496. SUBL 16(SP), BX
  10497. MOVQ src_len+32(FP), DI
  10498. SUBL CX, DI
  10499. LEAQ (DX)(CX*1), R8
  10500. LEAQ (DX)(BX*1), BX
  10501. // matchLen
  10502. XORL R10, R10
  10503. matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
  10504. CMPL DI, $0x10
  10505. JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
  10506. MOVQ (R8)(R10*1), R9
  10507. MOVQ 8(R8)(R10*1), R11
  10508. XORQ (BX)(R10*1), R9
  10509. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
  10510. XORQ 8(BX)(R10*1), R11
  10511. JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
  10512. LEAL -16(DI), DI
  10513. LEAL 16(R10), R10
  10514. JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K
  10515. matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
  10516. #ifdef GOAMD64_v3
  10517. TZCNTQ R11, R11
  10518. #else
  10519. BSFQ R11, R11
  10520. #endif
  10521. SARQ $0x03, R11
  10522. LEAL 8(R10)(R11*1), R10
  10523. JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10524. matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
  10525. CMPL DI, $0x08
  10526. JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
  10527. MOVQ (R8)(R10*1), R9
  10528. XORQ (BX)(R10*1), R9
  10529. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
  10530. LEAL -8(DI), DI
  10531. LEAL 8(R10), R10
  10532. JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
  10533. matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
  10534. #ifdef GOAMD64_v3
  10535. TZCNTQ R9, R9
  10536. #else
  10537. BSFQ R9, R9
  10538. #endif
  10539. SARQ $0x03, R9
  10540. LEAL (R10)(R9*1), R10
  10541. JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10542. matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
  10543. CMPL DI, $0x04
  10544. JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
  10545. MOVL (R8)(R10*1), R9
  10546. CMPL (BX)(R10*1), R9
  10547. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
  10548. LEAL -4(DI), DI
  10549. LEAL 4(R10), R10
  10550. matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
  10551. CMPL DI, $0x01
  10552. JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
  10553. JB repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10554. MOVW (R8)(R10*1), R9
  10555. CMPW (BX)(R10*1), R9
  10556. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
  10557. LEAL 2(R10), R10
  10558. SUBL $0x02, DI
  10559. JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10560. matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
  10561. MOVB (R8)(R10*1), R9
  10562. CMPB (BX)(R10*1), R9
  10563. JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10564. LEAL 1(R10), R10
  10565. repeat_extend_forward_end_encodeSnappyBlockAsm64K:
  10566. ADDL R10, CX
  10567. MOVL CX, BX
  10568. SUBL SI, BX
  10569. MOVL 16(SP), SI
  10570. // emitCopy
  10571. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
  10572. CMPL BX, $0x40
  10573. JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
  10574. MOVB $0xee, (AX)
  10575. MOVW SI, 1(AX)
  10576. LEAL -60(BX), BX
  10577. ADDQ $0x03, AX
  10578. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
  10579. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
  10580. MOVL BX, DI
  10581. SHLL $0x02, DI
  10582. CMPL BX, $0x0c
  10583. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
  10584. CMPL SI, $0x00000800
  10585. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
  10586. LEAL -15(DI), DI
  10587. MOVB SI, 1(AX)
  10588. SHRL $0x08, SI
  10589. SHLL $0x05, SI
  10590. ORL SI, DI
  10591. MOVB DI, (AX)
  10592. ADDQ $0x02, AX
  10593. JMP repeat_end_emit_encodeSnappyBlockAsm64K
  10594. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
  10595. LEAL -2(DI), DI
  10596. MOVB DI, (AX)
  10597. MOVW SI, 1(AX)
  10598. ADDQ $0x03, AX
  10599. repeat_end_emit_encodeSnappyBlockAsm64K:
  10600. MOVL CX, 12(SP)
  10601. JMP search_loop_encodeSnappyBlockAsm64K
  10602. no_repeat_found_encodeSnappyBlockAsm64K:
  10603. CMPL (DX)(BX*1), SI
  10604. JEQ candidate_match_encodeSnappyBlockAsm64K
  10605. SHRQ $0x08, SI
  10606. MOVL 24(SP)(R9*4), BX
  10607. LEAL 2(CX), R8
  10608. CMPL (DX)(DI*1), SI
  10609. JEQ candidate2_match_encodeSnappyBlockAsm64K
  10610. MOVL R8, 24(SP)(R9*4)
  10611. SHRQ $0x08, SI
  10612. CMPL (DX)(BX*1), SI
  10613. JEQ candidate3_match_encodeSnappyBlockAsm64K
  10614. MOVL 20(SP), CX
  10615. JMP search_loop_encodeSnappyBlockAsm64K
  10616. candidate3_match_encodeSnappyBlockAsm64K:
  10617. ADDL $0x02, CX
  10618. JMP candidate_match_encodeSnappyBlockAsm64K
  10619. candidate2_match_encodeSnappyBlockAsm64K:
  10620. MOVL R8, 24(SP)(R9*4)
  10621. INCL CX
  10622. MOVL DI, BX
  10623. candidate_match_encodeSnappyBlockAsm64K:
  10624. MOVL 12(SP), SI
  10625. TESTL BX, BX
  10626. JZ match_extend_back_end_encodeSnappyBlockAsm64K
  10627. match_extend_back_loop_encodeSnappyBlockAsm64K:
  10628. CMPL CX, SI
  10629. JBE match_extend_back_end_encodeSnappyBlockAsm64K
  10630. MOVB -1(DX)(BX*1), DI
  10631. MOVB -1(DX)(CX*1), R8
  10632. CMPB DI, R8
  10633. JNE match_extend_back_end_encodeSnappyBlockAsm64K
  10634. LEAL -1(CX), CX
  10635. DECL BX
  10636. JZ match_extend_back_end_encodeSnappyBlockAsm64K
  10637. JMP match_extend_back_loop_encodeSnappyBlockAsm64K
  10638. match_extend_back_end_encodeSnappyBlockAsm64K:
  10639. MOVL CX, SI
  10640. SUBL 12(SP), SI
  10641. LEAQ 3(AX)(SI*1), SI
  10642. CMPQ SI, (SP)
  10643. JB match_dst_size_check_encodeSnappyBlockAsm64K
  10644. MOVQ $0x00000000, ret+48(FP)
  10645. RET
  10646. match_dst_size_check_encodeSnappyBlockAsm64K:
  10647. MOVL CX, SI
  10648. MOVL 12(SP), DI
  10649. CMPL DI, SI
  10650. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
  10651. MOVL SI, R8
  10652. MOVL SI, 12(SP)
  10653. LEAQ (DX)(DI*1), SI
  10654. SUBL DI, R8
  10655. LEAL -1(R8), DI
  10656. CMPL DI, $0x3c
  10657. JB one_byte_match_emit_encodeSnappyBlockAsm64K
  10658. CMPL DI, $0x00000100
  10659. JB two_bytes_match_emit_encodeSnappyBlockAsm64K
  10660. JB three_bytes_match_emit_encodeSnappyBlockAsm64K
  10661. three_bytes_match_emit_encodeSnappyBlockAsm64K:
  10662. MOVB $0xf4, (AX)
  10663. MOVW DI, 1(AX)
  10664. ADDQ $0x03, AX
  10665. JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
  10666. two_bytes_match_emit_encodeSnappyBlockAsm64K:
  10667. MOVB $0xf0, (AX)
  10668. MOVB DI, 1(AX)
  10669. ADDQ $0x02, AX
  10670. CMPL DI, $0x40
  10671. JB memmove_match_emit_encodeSnappyBlockAsm64K
  10672. JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
  10673. one_byte_match_emit_encodeSnappyBlockAsm64K:
  10674. SHLB $0x02, DI
  10675. MOVB DI, (AX)
  10676. ADDQ $0x01, AX
  10677. memmove_match_emit_encodeSnappyBlockAsm64K:
  10678. LEAQ (AX)(R8*1), DI
  10679. // genMemMoveShort
  10680. CMPQ R8, $0x08
  10681. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
  10682. CMPQ R8, $0x10
  10683. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
  10684. CMPQ R8, $0x20
  10685. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
  10686. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
  10687. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
  10688. MOVQ (SI), R9
  10689. MOVQ R9, (AX)
  10690. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10691. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10692. MOVQ (SI), R9
  10693. MOVQ -8(SI)(R8*1), SI
  10694. MOVQ R9, (AX)
  10695. MOVQ SI, -8(AX)(R8*1)
  10696. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10697. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10698. MOVOU (SI), X0
  10699. MOVOU -16(SI)(R8*1), X1
  10700. MOVOU X0, (AX)
  10701. MOVOU X1, -16(AX)(R8*1)
  10702. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10703. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10704. MOVOU (SI), X0
  10705. MOVOU 16(SI), X1
  10706. MOVOU -32(SI)(R8*1), X2
  10707. MOVOU -16(SI)(R8*1), X3
  10708. MOVOU X0, (AX)
  10709. MOVOU X1, 16(AX)
  10710. MOVOU X2, -32(AX)(R8*1)
  10711. MOVOU X3, -16(AX)(R8*1)
  10712. memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
  10713. MOVQ DI, AX
  10714. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
  10715. memmove_long_match_emit_encodeSnappyBlockAsm64K:
  10716. LEAQ (AX)(R8*1), DI
  10717. // genMemMoveLong
  10718. MOVOU (SI), X0
  10719. MOVOU 16(SI), X1
  10720. MOVOU -32(SI)(R8*1), X2
  10721. MOVOU -16(SI)(R8*1), X3
  10722. MOVQ R8, R10
  10723. SHRQ $0x05, R10
  10724. MOVQ AX, R9
  10725. ANDL $0x0000001f, R9
  10726. MOVQ $0x00000040, R11
  10727. SUBQ R9, R11
  10728. DECQ R10
  10729. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10730. LEAQ -32(SI)(R11*1), R9
  10731. LEAQ -32(AX)(R11*1), R12
  10732. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
  10733. MOVOU (R9), X4
  10734. MOVOU 16(R9), X5
  10735. MOVOA X4, (R12)
  10736. MOVOA X5, 16(R12)
  10737. ADDQ $0x20, R12
  10738. ADDQ $0x20, R9
  10739. ADDQ $0x20, R11
  10740. DECQ R10
  10741. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
  10742. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  10743. MOVOU -32(SI)(R11*1), X4
  10744. MOVOU -16(SI)(R11*1), X5
  10745. MOVOA X4, -32(AX)(R11*1)
  10746. MOVOA X5, -16(AX)(R11*1)
  10747. ADDQ $0x20, R11
  10748. CMPQ R8, R11
  10749. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10750. MOVOU X0, (AX)
  10751. MOVOU X1, 16(AX)
  10752. MOVOU X2, -32(AX)(R8*1)
  10753. MOVOU X3, -16(AX)(R8*1)
  10754. MOVQ DI, AX
  10755. emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
  10756. match_nolit_loop_encodeSnappyBlockAsm64K:
  10757. MOVL CX, SI
  10758. SUBL BX, SI
  10759. MOVL SI, 16(SP)
  10760. ADDL $0x04, CX
  10761. ADDL $0x04, BX
  10762. MOVQ src_len+32(FP), SI
  10763. SUBL CX, SI
  10764. LEAQ (DX)(CX*1), DI
  10765. LEAQ (DX)(BX*1), BX
  10766. // matchLen
  10767. XORL R9, R9
  10768. matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
  10769. CMPL SI, $0x10
  10770. JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
  10771. MOVQ (DI)(R9*1), R8
  10772. MOVQ 8(DI)(R9*1), R10
  10773. XORQ (BX)(R9*1), R8
  10774. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
  10775. XORQ 8(BX)(R9*1), R10
  10776. JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
  10777. LEAL -16(SI), SI
  10778. LEAL 16(R9), R9
  10779. JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K
  10780. matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
  10781. #ifdef GOAMD64_v3
  10782. TZCNTQ R10, R10
  10783. #else
  10784. BSFQ R10, R10
  10785. #endif
  10786. SARQ $0x03, R10
  10787. LEAL 8(R9)(R10*1), R9
  10788. JMP match_nolit_end_encodeSnappyBlockAsm64K
  10789. matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
  10790. CMPL SI, $0x08
  10791. JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
  10792. MOVQ (DI)(R9*1), R8
  10793. XORQ (BX)(R9*1), R8
  10794. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
  10795. LEAL -8(SI), SI
  10796. LEAL 8(R9), R9
  10797. JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
  10798. matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
  10799. #ifdef GOAMD64_v3
  10800. TZCNTQ R8, R8
  10801. #else
  10802. BSFQ R8, R8
  10803. #endif
  10804. SARQ $0x03, R8
  10805. LEAL (R9)(R8*1), R9
  10806. JMP match_nolit_end_encodeSnappyBlockAsm64K
  10807. matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
  10808. CMPL SI, $0x04
  10809. JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
  10810. MOVL (DI)(R9*1), R8
  10811. CMPL (BX)(R9*1), R8
  10812. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
  10813. LEAL -4(SI), SI
  10814. LEAL 4(R9), R9
  10815. matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
  10816. CMPL SI, $0x01
  10817. JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
  10818. JB match_nolit_end_encodeSnappyBlockAsm64K
  10819. MOVW (DI)(R9*1), R8
  10820. CMPW (BX)(R9*1), R8
  10821. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
  10822. LEAL 2(R9), R9
  10823. SUBL $0x02, SI
  10824. JZ match_nolit_end_encodeSnappyBlockAsm64K
  10825. matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
  10826. MOVB (DI)(R9*1), R8
  10827. CMPB (BX)(R9*1), R8
  10828. JNE match_nolit_end_encodeSnappyBlockAsm64K
  10829. LEAL 1(R9), R9
  10830. match_nolit_end_encodeSnappyBlockAsm64K:
  10831. ADDL R9, CX
  10832. MOVL 16(SP), BX
  10833. ADDL $0x04, R9
  10834. MOVL CX, 12(SP)
  10835. // emitCopy
  10836. two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
  10837. CMPL R9, $0x40
  10838. JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
  10839. MOVB $0xee, (AX)
  10840. MOVW BX, 1(AX)
  10841. LEAL -60(R9), R9
  10842. ADDQ $0x03, AX
  10843. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
  10844. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
  10845. MOVL R9, SI
  10846. SHLL $0x02, SI
  10847. CMPL R9, $0x0c
  10848. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
  10849. CMPL BX, $0x00000800
  10850. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
  10851. LEAL -15(SI), SI
  10852. MOVB BL, 1(AX)
  10853. SHRL $0x08, BX
  10854. SHLL $0x05, BX
  10855. ORL BX, SI
  10856. MOVB SI, (AX)
  10857. ADDQ $0x02, AX
  10858. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
  10859. emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
  10860. LEAL -2(SI), SI
  10861. MOVB SI, (AX)
  10862. MOVW BX, 1(AX)
  10863. ADDQ $0x03, AX
  10864. match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
  10865. CMPL CX, 8(SP)
  10866. JAE emit_remainder_encodeSnappyBlockAsm64K
  10867. MOVQ -2(DX)(CX*1), SI
  10868. CMPQ AX, (SP)
  10869. JB match_nolit_dst_ok_encodeSnappyBlockAsm64K
  10870. MOVQ $0x00000000, ret+48(FP)
  10871. RET
  10872. match_nolit_dst_ok_encodeSnappyBlockAsm64K:
  10873. MOVQ $0x0000cf1bbcdcbf9b, R8
  10874. MOVQ SI, DI
  10875. SHRQ $0x10, SI
  10876. MOVQ SI, BX
  10877. SHLQ $0x10, DI
  10878. IMULQ R8, DI
  10879. SHRQ $0x32, DI
  10880. SHLQ $0x10, BX
  10881. IMULQ R8, BX
  10882. SHRQ $0x32, BX
  10883. LEAL -2(CX), R8
  10884. LEAQ 24(SP)(BX*4), R9
  10885. MOVL (R9), BX
  10886. MOVL R8, 24(SP)(DI*4)
  10887. MOVL CX, (R9)
  10888. CMPL (DX)(BX*1), SI
  10889. JEQ match_nolit_loop_encodeSnappyBlockAsm64K
  10890. INCL CX
  10891. JMP search_loop_encodeSnappyBlockAsm64K
  10892. emit_remainder_encodeSnappyBlockAsm64K:
  10893. MOVQ src_len+32(FP), CX
  10894. SUBL 12(SP), CX
  10895. LEAQ 3(AX)(CX*1), CX
  10896. CMPQ CX, (SP)
  10897. JB emit_remainder_ok_encodeSnappyBlockAsm64K
  10898. MOVQ $0x00000000, ret+48(FP)
  10899. RET
  10900. emit_remainder_ok_encodeSnappyBlockAsm64K:
  10901. MOVQ src_len+32(FP), CX
  10902. MOVL 12(SP), BX
  10903. CMPL BX, CX
  10904. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
  10905. MOVL CX, SI
  10906. MOVL CX, 12(SP)
  10907. LEAQ (DX)(BX*1), CX
  10908. SUBL BX, SI
  10909. LEAL -1(SI), DX
  10910. CMPL DX, $0x3c
  10911. JB one_byte_emit_remainder_encodeSnappyBlockAsm64K
  10912. CMPL DX, $0x00000100
  10913. JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K
  10914. JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K
  10915. three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
  10916. MOVB $0xf4, (AX)
  10917. MOVW DX, 1(AX)
  10918. ADDQ $0x03, AX
  10919. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
  10920. two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
  10921. MOVB $0xf0, (AX)
  10922. MOVB DL, 1(AX)
  10923. ADDQ $0x02, AX
  10924. CMPL DX, $0x40
  10925. JB memmove_emit_remainder_encodeSnappyBlockAsm64K
  10926. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
  10927. one_byte_emit_remainder_encodeSnappyBlockAsm64K:
  10928. SHLB $0x02, DL
  10929. MOVB DL, (AX)
  10930. ADDQ $0x01, AX
  10931. memmove_emit_remainder_encodeSnappyBlockAsm64K:
  10932. LEAQ (AX)(SI*1), DX
  10933. MOVL SI, BX
  10934. // genMemMoveShort
  10935. CMPQ BX, $0x03
  10936. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
  10937. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
  10938. CMPQ BX, $0x08
  10939. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
  10940. CMPQ BX, $0x10
  10941. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
  10942. CMPQ BX, $0x20
  10943. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
  10944. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
  10945. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
  10946. MOVB (CX), SI
  10947. MOVB -1(CX)(BX*1), CL
  10948. MOVB SI, (AX)
  10949. MOVB CL, -1(AX)(BX*1)
  10950. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10951. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
  10952. MOVW (CX), SI
  10953. MOVB 2(CX), CL
  10954. MOVW SI, (AX)
  10955. MOVB CL, 2(AX)
  10956. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10957. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
  10958. MOVL (CX), SI
  10959. MOVL -4(CX)(BX*1), CX
  10960. MOVL SI, (AX)
  10961. MOVL CX, -4(AX)(BX*1)
  10962. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10963. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10964. MOVQ (CX), SI
  10965. MOVQ -8(CX)(BX*1), CX
  10966. MOVQ SI, (AX)
  10967. MOVQ CX, -8(AX)(BX*1)
  10968. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10969. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10970. MOVOU (CX), X0
  10971. MOVOU -16(CX)(BX*1), X1
  10972. MOVOU X0, (AX)
  10973. MOVOU X1, -16(AX)(BX*1)
  10974. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10975. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10976. MOVOU (CX), X0
  10977. MOVOU 16(CX), X1
  10978. MOVOU -32(CX)(BX*1), X2
  10979. MOVOU -16(CX)(BX*1), X3
  10980. MOVOU X0, (AX)
  10981. MOVOU X1, 16(AX)
  10982. MOVOU X2, -32(AX)(BX*1)
  10983. MOVOU X3, -16(AX)(BX*1)
  10984. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
  10985. MOVQ DX, AX
  10986. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
  10987. memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
  10988. LEAQ (AX)(SI*1), DX
  10989. MOVL SI, BX
  10990. // genMemMoveLong
  10991. MOVOU (CX), X0
  10992. MOVOU 16(CX), X1
  10993. MOVOU -32(CX)(BX*1), X2
  10994. MOVOU -16(CX)(BX*1), X3
  10995. MOVQ BX, DI
  10996. SHRQ $0x05, DI
  10997. MOVQ AX, SI
  10998. ANDL $0x0000001f, SI
  10999. MOVQ $0x00000040, R8
  11000. SUBQ SI, R8
  11001. DECQ DI
  11002. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  11003. LEAQ -32(CX)(R8*1), SI
  11004. LEAQ -32(AX)(R8*1), R9
  11005. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
  11006. MOVOU (SI), X4
  11007. MOVOU 16(SI), X5
  11008. MOVOA X4, (R9)
  11009. MOVOA X5, 16(R9)
  11010. ADDQ $0x20, R9
  11011. ADDQ $0x20, SI
  11012. ADDQ $0x20, R8
  11013. DECQ DI
  11014. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
  11015. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  11016. MOVOU -32(CX)(R8*1), X4
  11017. MOVOU -16(CX)(R8*1), X5
  11018. MOVOA X4, -32(AX)(R8*1)
  11019. MOVOA X5, -16(AX)(R8*1)
  11020. ADDQ $0x20, R8
  11021. CMPQ BX, R8
  11022. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  11023. MOVOU X0, (AX)
  11024. MOVOU X1, 16(AX)
  11025. MOVOU X2, -32(AX)(BX*1)
  11026. MOVOU X3, -16(AX)(BX*1)
  11027. MOVQ DX, AX
  11028. emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
  11029. MOVQ dst_base+0(FP), CX
  11030. SUBQ CX, AX
  11031. MOVQ AX, ret+48(FP)
  11032. RET
  11033. // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
  11034. // Requires: BMI, SSE2
  11035. TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
  11036. MOVQ dst_base+0(FP), AX
  11037. MOVQ $0x00000080, CX
  11038. LEAQ 24(SP), DX
  11039. PXOR X0, X0
  11040. zero_loop_encodeSnappyBlockAsm12B:
  11041. MOVOU X0, (DX)
  11042. MOVOU X0, 16(DX)
  11043. MOVOU X0, 32(DX)
  11044. MOVOU X0, 48(DX)
  11045. MOVOU X0, 64(DX)
  11046. MOVOU X0, 80(DX)
  11047. MOVOU X0, 96(DX)
  11048. MOVOU X0, 112(DX)
  11049. ADDQ $0x80, DX
  11050. DECQ CX
  11051. JNZ zero_loop_encodeSnappyBlockAsm12B
  11052. MOVL $0x00000000, 12(SP)
  11053. MOVQ src_len+32(FP), CX
  11054. LEAQ -9(CX), DX
  11055. LEAQ -8(CX), BX
  11056. MOVL BX, 8(SP)
  11057. SHRQ $0x05, CX
  11058. SUBL CX, DX
  11059. LEAQ (AX)(DX*1), DX
  11060. MOVQ DX, (SP)
  11061. MOVL $0x00000001, CX
  11062. MOVL CX, 16(SP)
  11063. MOVQ src_base+24(FP), DX
  11064. search_loop_encodeSnappyBlockAsm12B:
  11065. MOVL CX, BX
  11066. SUBL 12(SP), BX
  11067. SHRL $0x05, BX
  11068. LEAL 4(CX)(BX*1), BX
  11069. CMPL BX, 8(SP)
  11070. JAE emit_remainder_encodeSnappyBlockAsm12B
  11071. MOVQ (DX)(CX*1), SI
  11072. MOVL BX, 20(SP)
  11073. MOVQ $0x000000cf1bbcdcbb, R8
  11074. MOVQ SI, R9
  11075. MOVQ SI, R10
  11076. SHRQ $0x08, R10
  11077. SHLQ $0x18, R9
  11078. IMULQ R8, R9
  11079. SHRQ $0x34, R9
  11080. SHLQ $0x18, R10
  11081. IMULQ R8, R10
  11082. SHRQ $0x34, R10
  11083. MOVL 24(SP)(R9*4), BX
  11084. MOVL 24(SP)(R10*4), DI
  11085. MOVL CX, 24(SP)(R9*4)
  11086. LEAL 1(CX), R9
  11087. MOVL R9, 24(SP)(R10*4)
  11088. MOVQ SI, R9
  11089. SHRQ $0x10, R9
  11090. SHLQ $0x18, R9
  11091. IMULQ R8, R9
  11092. SHRQ $0x34, R9
  11093. MOVL CX, R8
  11094. SUBL 16(SP), R8
  11095. MOVL 1(DX)(R8*1), R10
  11096. MOVQ SI, R8
  11097. SHRQ $0x08, R8
  11098. CMPL R8, R10
  11099. JNE no_repeat_found_encodeSnappyBlockAsm12B
  11100. LEAL 1(CX), SI
  11101. MOVL 12(SP), BX
  11102. MOVL SI, DI
  11103. SUBL 16(SP), DI
  11104. JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
  11105. repeat_extend_back_loop_encodeSnappyBlockAsm12B:
  11106. CMPL SI, BX
  11107. JBE repeat_extend_back_end_encodeSnappyBlockAsm12B
  11108. MOVB -1(DX)(DI*1), R8
  11109. MOVB -1(DX)(SI*1), R9
  11110. CMPB R8, R9
  11111. JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
  11112. LEAL -1(SI), SI
  11113. DECL DI
  11114. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
  11115. repeat_extend_back_end_encodeSnappyBlockAsm12B:
  11116. MOVL 12(SP), BX
  11117. CMPL BX, SI
  11118. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
  11119. MOVL SI, DI
  11120. MOVL SI, 12(SP)
  11121. LEAQ (DX)(BX*1), R8
  11122. SUBL BX, DI
  11123. LEAL -1(DI), BX
  11124. CMPL BX, $0x3c
  11125. JB one_byte_repeat_emit_encodeSnappyBlockAsm12B
  11126. CMPL BX, $0x00000100
  11127. JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B
  11128. JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B
  11129. three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
  11130. MOVB $0xf4, (AX)
  11131. MOVW BX, 1(AX)
  11132. ADDQ $0x03, AX
  11133. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
  11134. two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
  11135. MOVB $0xf0, (AX)
  11136. MOVB BL, 1(AX)
  11137. ADDQ $0x02, AX
  11138. CMPL BX, $0x40
  11139. JB memmove_repeat_emit_encodeSnappyBlockAsm12B
  11140. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
  11141. one_byte_repeat_emit_encodeSnappyBlockAsm12B:
  11142. SHLB $0x02, BL
  11143. MOVB BL, (AX)
  11144. ADDQ $0x01, AX
  11145. memmove_repeat_emit_encodeSnappyBlockAsm12B:
  11146. LEAQ (AX)(DI*1), BX
  11147. // genMemMoveShort
  11148. CMPQ DI, $0x08
  11149. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
  11150. CMPQ DI, $0x10
  11151. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
  11152. CMPQ DI, $0x20
  11153. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
  11154. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
  11155. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
  11156. MOVQ (R8), R9
  11157. MOVQ R9, (AX)
  11158. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  11159. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
  11160. MOVQ (R8), R9
  11161. MOVQ -8(R8)(DI*1), R8
  11162. MOVQ R9, (AX)
  11163. MOVQ R8, -8(AX)(DI*1)
  11164. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  11165. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
  11166. MOVOU (R8), X0
  11167. MOVOU -16(R8)(DI*1), X1
  11168. MOVOU X0, (AX)
  11169. MOVOU X1, -16(AX)(DI*1)
  11170. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  11171. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
  11172. MOVOU (R8), X0
  11173. MOVOU 16(R8), X1
  11174. MOVOU -32(R8)(DI*1), X2
  11175. MOVOU -16(R8)(DI*1), X3
  11176. MOVOU X0, (AX)
  11177. MOVOU X1, 16(AX)
  11178. MOVOU X2, -32(AX)(DI*1)
  11179. MOVOU X3, -16(AX)(DI*1)
  11180. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
  11181. MOVQ BX, AX
  11182. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
  11183. memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
  11184. LEAQ (AX)(DI*1), BX
  11185. // genMemMoveLong
  11186. MOVOU (R8), X0
  11187. MOVOU 16(R8), X1
  11188. MOVOU -32(R8)(DI*1), X2
  11189. MOVOU -16(R8)(DI*1), X3
  11190. MOVQ DI, R10
  11191. SHRQ $0x05, R10
  11192. MOVQ AX, R9
  11193. ANDL $0x0000001f, R9
  11194. MOVQ $0x00000040, R11
  11195. SUBQ R9, R11
  11196. DECQ R10
  11197. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11198. LEAQ -32(R8)(R11*1), R9
  11199. LEAQ -32(AX)(R11*1), R12
  11200. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
  11201. MOVOU (R9), X4
  11202. MOVOU 16(R9), X5
  11203. MOVOA X4, (R12)
  11204. MOVOA X5, 16(R12)
  11205. ADDQ $0x20, R12
  11206. ADDQ $0x20, R9
  11207. ADDQ $0x20, R11
  11208. DECQ R10
  11209. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
  11210. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  11211. MOVOU -32(R8)(R11*1), X4
  11212. MOVOU -16(R8)(R11*1), X5
  11213. MOVOA X4, -32(AX)(R11*1)
  11214. MOVOA X5, -16(AX)(R11*1)
  11215. ADDQ $0x20, R11
  11216. CMPQ DI, R11
  11217. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11218. MOVOU X0, (AX)
  11219. MOVOU X1, 16(AX)
  11220. MOVOU X2, -32(AX)(DI*1)
  11221. MOVOU X3, -16(AX)(DI*1)
  11222. MOVQ BX, AX
  11223. emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
  11224. ADDL $0x05, CX
  11225. MOVL CX, BX
  11226. SUBL 16(SP), BX
  11227. MOVQ src_len+32(FP), DI
  11228. SUBL CX, DI
  11229. LEAQ (DX)(CX*1), R8
  11230. LEAQ (DX)(BX*1), BX
  11231. // matchLen
  11232. XORL R10, R10
  11233. matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
  11234. CMPL DI, $0x10
  11235. JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
  11236. MOVQ (R8)(R10*1), R9
  11237. MOVQ 8(R8)(R10*1), R11
  11238. XORQ (BX)(R10*1), R9
  11239. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
  11240. XORQ 8(BX)(R10*1), R11
  11241. JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
  11242. LEAL -16(DI), DI
  11243. LEAL 16(R10), R10
  11244. JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B
  11245. matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
  11246. #ifdef GOAMD64_v3
  11247. TZCNTQ R11, R11
  11248. #else
  11249. BSFQ R11, R11
  11250. #endif
  11251. SARQ $0x03, R11
  11252. LEAL 8(R10)(R11*1), R10
  11253. JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
  11254. matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
  11255. CMPL DI, $0x08
  11256. JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
  11257. MOVQ (R8)(R10*1), R9
  11258. XORQ (BX)(R10*1), R9
  11259. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
  11260. LEAL -8(DI), DI
  11261. LEAL 8(R10), R10
  11262. JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
  11263. matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
  11264. #ifdef GOAMD64_v3
  11265. TZCNTQ R9, R9
  11266. #else
  11267. BSFQ R9, R9
  11268. #endif
  11269. SARQ $0x03, R9
  11270. LEAL (R10)(R9*1), R10
  11271. JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
  11272. matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
  11273. CMPL DI, $0x04
  11274. JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
  11275. MOVL (R8)(R10*1), R9
  11276. CMPL (BX)(R10*1), R9
  11277. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
  11278. LEAL -4(DI), DI
  11279. LEAL 4(R10), R10
  11280. matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
  11281. CMPL DI, $0x01
  11282. JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
  11283. JB repeat_extend_forward_end_encodeSnappyBlockAsm12B
  11284. MOVW (R8)(R10*1), R9
  11285. CMPW (BX)(R10*1), R9
  11286. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
  11287. LEAL 2(R10), R10
  11288. SUBL $0x02, DI
  11289. JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
  11290. matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
  11291. MOVB (R8)(R10*1), R9
  11292. CMPB (BX)(R10*1), R9
  11293. JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
  11294. LEAL 1(R10), R10
  11295. repeat_extend_forward_end_encodeSnappyBlockAsm12B:
  11296. ADDL R10, CX
  11297. MOVL CX, BX
  11298. SUBL SI, BX
  11299. MOVL 16(SP), SI
  11300. // emitCopy
  11301. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
  11302. CMPL BX, $0x40
  11303. JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
  11304. MOVB $0xee, (AX)
  11305. MOVW SI, 1(AX)
  11306. LEAL -60(BX), BX
  11307. ADDQ $0x03, AX
  11308. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
  11309. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
  11310. MOVL BX, DI
  11311. SHLL $0x02, DI
  11312. CMPL BX, $0x0c
  11313. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
  11314. CMPL SI, $0x00000800
  11315. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
  11316. LEAL -15(DI), DI
  11317. MOVB SI, 1(AX)
  11318. SHRL $0x08, SI
  11319. SHLL $0x05, SI
  11320. ORL SI, DI
  11321. MOVB DI, (AX)
  11322. ADDQ $0x02, AX
  11323. JMP repeat_end_emit_encodeSnappyBlockAsm12B
  11324. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
  11325. LEAL -2(DI), DI
  11326. MOVB DI, (AX)
  11327. MOVW SI, 1(AX)
  11328. ADDQ $0x03, AX
  11329. repeat_end_emit_encodeSnappyBlockAsm12B:
  11330. MOVL CX, 12(SP)
  11331. JMP search_loop_encodeSnappyBlockAsm12B
  11332. no_repeat_found_encodeSnappyBlockAsm12B:
  11333. CMPL (DX)(BX*1), SI
  11334. JEQ candidate_match_encodeSnappyBlockAsm12B
  11335. SHRQ $0x08, SI
  11336. MOVL 24(SP)(R9*4), BX
  11337. LEAL 2(CX), R8
  11338. CMPL (DX)(DI*1), SI
  11339. JEQ candidate2_match_encodeSnappyBlockAsm12B
  11340. MOVL R8, 24(SP)(R9*4)
  11341. SHRQ $0x08, SI
  11342. CMPL (DX)(BX*1), SI
  11343. JEQ candidate3_match_encodeSnappyBlockAsm12B
  11344. MOVL 20(SP), CX
  11345. JMP search_loop_encodeSnappyBlockAsm12B
  11346. candidate3_match_encodeSnappyBlockAsm12B:
  11347. ADDL $0x02, CX
  11348. JMP candidate_match_encodeSnappyBlockAsm12B
  11349. candidate2_match_encodeSnappyBlockAsm12B:
  11350. MOVL R8, 24(SP)(R9*4)
  11351. INCL CX
  11352. MOVL DI, BX
  11353. candidate_match_encodeSnappyBlockAsm12B:
  11354. MOVL 12(SP), SI
  11355. TESTL BX, BX
  11356. JZ match_extend_back_end_encodeSnappyBlockAsm12B
  11357. match_extend_back_loop_encodeSnappyBlockAsm12B:
  11358. CMPL CX, SI
  11359. JBE match_extend_back_end_encodeSnappyBlockAsm12B
  11360. MOVB -1(DX)(BX*1), DI
  11361. MOVB -1(DX)(CX*1), R8
  11362. CMPB DI, R8
  11363. JNE match_extend_back_end_encodeSnappyBlockAsm12B
  11364. LEAL -1(CX), CX
  11365. DECL BX
  11366. JZ match_extend_back_end_encodeSnappyBlockAsm12B
  11367. JMP match_extend_back_loop_encodeSnappyBlockAsm12B
  11368. match_extend_back_end_encodeSnappyBlockAsm12B:
  11369. MOVL CX, SI
  11370. SUBL 12(SP), SI
  11371. LEAQ 3(AX)(SI*1), SI
  11372. CMPQ SI, (SP)
  11373. JB match_dst_size_check_encodeSnappyBlockAsm12B
  11374. MOVQ $0x00000000, ret+48(FP)
  11375. RET
  11376. match_dst_size_check_encodeSnappyBlockAsm12B:
  11377. MOVL CX, SI
  11378. MOVL 12(SP), DI
  11379. CMPL DI, SI
  11380. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
  11381. MOVL SI, R8
  11382. MOVL SI, 12(SP)
  11383. LEAQ (DX)(DI*1), SI
  11384. SUBL DI, R8
  11385. LEAL -1(R8), DI
  11386. CMPL DI, $0x3c
  11387. JB one_byte_match_emit_encodeSnappyBlockAsm12B
  11388. CMPL DI, $0x00000100
  11389. JB two_bytes_match_emit_encodeSnappyBlockAsm12B
  11390. JB three_bytes_match_emit_encodeSnappyBlockAsm12B
  11391. three_bytes_match_emit_encodeSnappyBlockAsm12B:
  11392. MOVB $0xf4, (AX)
  11393. MOVW DI, 1(AX)
  11394. ADDQ $0x03, AX
  11395. JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
  11396. two_bytes_match_emit_encodeSnappyBlockAsm12B:
  11397. MOVB $0xf0, (AX)
  11398. MOVB DI, 1(AX)
  11399. ADDQ $0x02, AX
  11400. CMPL DI, $0x40
  11401. JB memmove_match_emit_encodeSnappyBlockAsm12B
  11402. JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
  11403. one_byte_match_emit_encodeSnappyBlockAsm12B:
  11404. SHLB $0x02, DI
  11405. MOVB DI, (AX)
  11406. ADDQ $0x01, AX
  11407. memmove_match_emit_encodeSnappyBlockAsm12B:
  11408. LEAQ (AX)(R8*1), DI
  11409. // genMemMoveShort
  11410. CMPQ R8, $0x08
  11411. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
  11412. CMPQ R8, $0x10
  11413. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
  11414. CMPQ R8, $0x20
  11415. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
  11416. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
  11417. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
  11418. MOVQ (SI), R9
  11419. MOVQ R9, (AX)
  11420. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11421. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
  11422. MOVQ (SI), R9
  11423. MOVQ -8(SI)(R8*1), SI
  11424. MOVQ R9, (AX)
  11425. MOVQ SI, -8(AX)(R8*1)
  11426. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11427. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
  11428. MOVOU (SI), X0
  11429. MOVOU -16(SI)(R8*1), X1
  11430. MOVOU X0, (AX)
  11431. MOVOU X1, -16(AX)(R8*1)
  11432. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11433. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
  11434. MOVOU (SI), X0
  11435. MOVOU 16(SI), X1
  11436. MOVOU -32(SI)(R8*1), X2
  11437. MOVOU -16(SI)(R8*1), X3
  11438. MOVOU X0, (AX)
  11439. MOVOU X1, 16(AX)
  11440. MOVOU X2, -32(AX)(R8*1)
  11441. MOVOU X3, -16(AX)(R8*1)
  11442. memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
  11443. MOVQ DI, AX
  11444. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
  11445. memmove_long_match_emit_encodeSnappyBlockAsm12B:
  11446. LEAQ (AX)(R8*1), DI
  11447. // genMemMoveLong
  11448. MOVOU (SI), X0
  11449. MOVOU 16(SI), X1
  11450. MOVOU -32(SI)(R8*1), X2
  11451. MOVOU -16(SI)(R8*1), X3
  11452. MOVQ R8, R10
  11453. SHRQ $0x05, R10
  11454. MOVQ AX, R9
  11455. ANDL $0x0000001f, R9
  11456. MOVQ $0x00000040, R11
  11457. SUBQ R9, R11
  11458. DECQ R10
  11459. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11460. LEAQ -32(SI)(R11*1), R9
  11461. LEAQ -32(AX)(R11*1), R12
  11462. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
  11463. MOVOU (R9), X4
  11464. MOVOU 16(R9), X5
  11465. MOVOA X4, (R12)
  11466. MOVOA X5, 16(R12)
  11467. ADDQ $0x20, R12
  11468. ADDQ $0x20, R9
  11469. ADDQ $0x20, R11
  11470. DECQ R10
  11471. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
  11472. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  11473. MOVOU -32(SI)(R11*1), X4
  11474. MOVOU -16(SI)(R11*1), X5
  11475. MOVOA X4, -32(AX)(R11*1)
  11476. MOVOA X5, -16(AX)(R11*1)
  11477. ADDQ $0x20, R11
  11478. CMPQ R8, R11
  11479. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11480. MOVOU X0, (AX)
  11481. MOVOU X1, 16(AX)
  11482. MOVOU X2, -32(AX)(R8*1)
  11483. MOVOU X3, -16(AX)(R8*1)
  11484. MOVQ DI, AX
  11485. emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
  11486. match_nolit_loop_encodeSnappyBlockAsm12B:
  11487. MOVL CX, SI
  11488. SUBL BX, SI
  11489. MOVL SI, 16(SP)
  11490. ADDL $0x04, CX
  11491. ADDL $0x04, BX
  11492. MOVQ src_len+32(FP), SI
  11493. SUBL CX, SI
  11494. LEAQ (DX)(CX*1), DI
  11495. LEAQ (DX)(BX*1), BX
  11496. // matchLen
  11497. XORL R9, R9
  11498. matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
  11499. CMPL SI, $0x10
  11500. JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
  11501. MOVQ (DI)(R9*1), R8
  11502. MOVQ 8(DI)(R9*1), R10
  11503. XORQ (BX)(R9*1), R8
  11504. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
  11505. XORQ 8(BX)(R9*1), R10
  11506. JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
  11507. LEAL -16(SI), SI
  11508. LEAL 16(R9), R9
  11509. JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B
  11510. matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
  11511. #ifdef GOAMD64_v3
  11512. TZCNTQ R10, R10
  11513. #else
  11514. BSFQ R10, R10
  11515. #endif
  11516. SARQ $0x03, R10
  11517. LEAL 8(R9)(R10*1), R9
  11518. JMP match_nolit_end_encodeSnappyBlockAsm12B
  11519. matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
  11520. CMPL SI, $0x08
  11521. JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
  11522. MOVQ (DI)(R9*1), R8
  11523. XORQ (BX)(R9*1), R8
  11524. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
  11525. LEAL -8(SI), SI
  11526. LEAL 8(R9), R9
  11527. JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
  11528. matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
  11529. #ifdef GOAMD64_v3
  11530. TZCNTQ R8, R8
  11531. #else
  11532. BSFQ R8, R8
  11533. #endif
  11534. SARQ $0x03, R8
  11535. LEAL (R9)(R8*1), R9
  11536. JMP match_nolit_end_encodeSnappyBlockAsm12B
  11537. matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
  11538. CMPL SI, $0x04
  11539. JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
  11540. MOVL (DI)(R9*1), R8
  11541. CMPL (BX)(R9*1), R8
  11542. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
  11543. LEAL -4(SI), SI
  11544. LEAL 4(R9), R9
  11545. matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
  11546. CMPL SI, $0x01
  11547. JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
  11548. JB match_nolit_end_encodeSnappyBlockAsm12B
  11549. MOVW (DI)(R9*1), R8
  11550. CMPW (BX)(R9*1), R8
  11551. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
  11552. LEAL 2(R9), R9
  11553. SUBL $0x02, SI
  11554. JZ match_nolit_end_encodeSnappyBlockAsm12B
  11555. matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
  11556. MOVB (DI)(R9*1), R8
  11557. CMPB (BX)(R9*1), R8
  11558. JNE match_nolit_end_encodeSnappyBlockAsm12B
  11559. LEAL 1(R9), R9
  11560. match_nolit_end_encodeSnappyBlockAsm12B:
  11561. ADDL R9, CX
  11562. MOVL 16(SP), BX
  11563. ADDL $0x04, R9
  11564. MOVL CX, 12(SP)
  11565. // emitCopy
  11566. two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
  11567. CMPL R9, $0x40
  11568. JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
  11569. MOVB $0xee, (AX)
  11570. MOVW BX, 1(AX)
  11571. LEAL -60(R9), R9
  11572. ADDQ $0x03, AX
  11573. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
  11574. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
  11575. MOVL R9, SI
  11576. SHLL $0x02, SI
  11577. CMPL R9, $0x0c
  11578. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
  11579. CMPL BX, $0x00000800
  11580. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
  11581. LEAL -15(SI), SI
  11582. MOVB BL, 1(AX)
  11583. SHRL $0x08, BX
  11584. SHLL $0x05, BX
  11585. ORL BX, SI
  11586. MOVB SI, (AX)
  11587. ADDQ $0x02, AX
  11588. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
  11589. emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
  11590. LEAL -2(SI), SI
  11591. MOVB SI, (AX)
  11592. MOVW BX, 1(AX)
  11593. ADDQ $0x03, AX
  11594. match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
  11595. CMPL CX, 8(SP)
  11596. JAE emit_remainder_encodeSnappyBlockAsm12B
  11597. MOVQ -2(DX)(CX*1), SI
  11598. CMPQ AX, (SP)
  11599. JB match_nolit_dst_ok_encodeSnappyBlockAsm12B
  11600. MOVQ $0x00000000, ret+48(FP)
  11601. RET
  11602. match_nolit_dst_ok_encodeSnappyBlockAsm12B:
  11603. MOVQ $0x000000cf1bbcdcbb, R8
  11604. MOVQ SI, DI
  11605. SHRQ $0x10, SI
  11606. MOVQ SI, BX
  11607. SHLQ $0x18, DI
  11608. IMULQ R8, DI
  11609. SHRQ $0x34, DI
  11610. SHLQ $0x18, BX
  11611. IMULQ R8, BX
  11612. SHRQ $0x34, BX
  11613. LEAL -2(CX), R8
  11614. LEAQ 24(SP)(BX*4), R9
  11615. MOVL (R9), BX
  11616. MOVL R8, 24(SP)(DI*4)
  11617. MOVL CX, (R9)
  11618. CMPL (DX)(BX*1), SI
  11619. JEQ match_nolit_loop_encodeSnappyBlockAsm12B
  11620. INCL CX
  11621. JMP search_loop_encodeSnappyBlockAsm12B
  11622. emit_remainder_encodeSnappyBlockAsm12B:
  11623. MOVQ src_len+32(FP), CX
  11624. SUBL 12(SP), CX
  11625. LEAQ 3(AX)(CX*1), CX
  11626. CMPQ CX, (SP)
  11627. JB emit_remainder_ok_encodeSnappyBlockAsm12B
  11628. MOVQ $0x00000000, ret+48(FP)
  11629. RET
  11630. emit_remainder_ok_encodeSnappyBlockAsm12B:
  11631. MOVQ src_len+32(FP), CX
  11632. MOVL 12(SP), BX
  11633. CMPL BX, CX
  11634. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
  11635. MOVL CX, SI
  11636. MOVL CX, 12(SP)
  11637. LEAQ (DX)(BX*1), CX
  11638. SUBL BX, SI
  11639. LEAL -1(SI), DX
  11640. CMPL DX, $0x3c
  11641. JB one_byte_emit_remainder_encodeSnappyBlockAsm12B
  11642. CMPL DX, $0x00000100
  11643. JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B
  11644. JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B
  11645. three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
  11646. MOVB $0xf4, (AX)
  11647. MOVW DX, 1(AX)
  11648. ADDQ $0x03, AX
  11649. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
  11650. two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
  11651. MOVB $0xf0, (AX)
  11652. MOVB DL, 1(AX)
  11653. ADDQ $0x02, AX
  11654. CMPL DX, $0x40
  11655. JB memmove_emit_remainder_encodeSnappyBlockAsm12B
  11656. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
  11657. one_byte_emit_remainder_encodeSnappyBlockAsm12B:
  11658. SHLB $0x02, DL
  11659. MOVB DL, (AX)
  11660. ADDQ $0x01, AX
  11661. memmove_emit_remainder_encodeSnappyBlockAsm12B:
  11662. LEAQ (AX)(SI*1), DX
  11663. MOVL SI, BX
  11664. // genMemMoveShort
  11665. CMPQ BX, $0x03
  11666. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
  11667. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
  11668. CMPQ BX, $0x08
  11669. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
  11670. CMPQ BX, $0x10
  11671. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
  11672. CMPQ BX, $0x20
  11673. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
  11674. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
  11675. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
  11676. MOVB (CX), SI
  11677. MOVB -1(CX)(BX*1), CL
  11678. MOVB SI, (AX)
  11679. MOVB CL, -1(AX)(BX*1)
  11680. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11681. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
  11682. MOVW (CX), SI
  11683. MOVB 2(CX), CL
  11684. MOVW SI, (AX)
  11685. MOVB CL, 2(AX)
  11686. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11687. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
  11688. MOVL (CX), SI
  11689. MOVL -4(CX)(BX*1), CX
  11690. MOVL SI, (AX)
  11691. MOVL CX, -4(AX)(BX*1)
  11692. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11693. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
  11694. MOVQ (CX), SI
  11695. MOVQ -8(CX)(BX*1), CX
  11696. MOVQ SI, (AX)
  11697. MOVQ CX, -8(AX)(BX*1)
  11698. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11699. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
  11700. MOVOU (CX), X0
  11701. MOVOU -16(CX)(BX*1), X1
  11702. MOVOU X0, (AX)
  11703. MOVOU X1, -16(AX)(BX*1)
  11704. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11705. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
  11706. MOVOU (CX), X0
  11707. MOVOU 16(CX), X1
  11708. MOVOU -32(CX)(BX*1), X2
  11709. MOVOU -16(CX)(BX*1), X3
  11710. MOVOU X0, (AX)
  11711. MOVOU X1, 16(AX)
  11712. MOVOU X2, -32(AX)(BX*1)
  11713. MOVOU X3, -16(AX)(BX*1)
  11714. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
  11715. MOVQ DX, AX
  11716. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
  11717. memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
  11718. LEAQ (AX)(SI*1), DX
  11719. MOVL SI, BX
  11720. // genMemMoveLong
  11721. MOVOU (CX), X0
  11722. MOVOU 16(CX), X1
  11723. MOVOU -32(CX)(BX*1), X2
  11724. MOVOU -16(CX)(BX*1), X3
  11725. MOVQ BX, DI
  11726. SHRQ $0x05, DI
  11727. MOVQ AX, SI
  11728. ANDL $0x0000001f, SI
  11729. MOVQ $0x00000040, R8
  11730. SUBQ SI, R8
  11731. DECQ DI
  11732. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11733. LEAQ -32(CX)(R8*1), SI
  11734. LEAQ -32(AX)(R8*1), R9
  11735. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
  11736. MOVOU (SI), X4
  11737. MOVOU 16(SI), X5
  11738. MOVOA X4, (R9)
  11739. MOVOA X5, 16(R9)
  11740. ADDQ $0x20, R9
  11741. ADDQ $0x20, SI
  11742. ADDQ $0x20, R8
  11743. DECQ DI
  11744. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
  11745. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  11746. MOVOU -32(CX)(R8*1), X4
  11747. MOVOU -16(CX)(R8*1), X5
  11748. MOVOA X4, -32(AX)(R8*1)
  11749. MOVOA X5, -16(AX)(R8*1)
  11750. ADDQ $0x20, R8
  11751. CMPQ BX, R8
  11752. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11753. MOVOU X0, (AX)
  11754. MOVOU X1, 16(AX)
  11755. MOVOU X2, -32(AX)(BX*1)
  11756. MOVOU X3, -16(AX)(BX*1)
  11757. MOVQ DX, AX
  11758. emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
  11759. MOVQ dst_base+0(FP), CX
  11760. SUBQ CX, AX
  11761. MOVQ AX, ret+48(FP)
  11762. RET
  11763. // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
  11764. // Requires: BMI, SSE2
  //
  // Encodes src into dst as a sequence of literal and copy elements and stores
  // the number of bytes written in ret+48(FP). 0 is stored instead whenever one
  // of the dst-limit checks below fails. (Presumably the Snappy block format,
  // judging by the function name and the tag bytes - confirm against the spec.)
  //
  // Frame layout (4120 bytes of locals):
  //   0(SP)   qword   dst limit pointer = dst_base + src_len - 9 - (src_len >> 5)
  //   8(SP)   dword   search limit = src_len - 8
  //   12(SP)  dword   src index of the first byte not yet emitted
  //   16(SP)  dword   current repeat offset (starts at 1)
  //   20(SP)  dword   src index at which the search resumes after a miss
  //   24(SP)  4096 B  hash table of 1024 dword src indexes, zeroed on entry
  //
  // Long-lived registers until the remainder is emitted:
  //   AX = dst write pointer, DX = src base pointer, CX = current src index.
  11765. TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
  11766. MOVQ dst_base+0(FP), AX
  11767. MOVQ $0x00000020, CX // 0x20 iterations * 0x80 bytes = 4096 bytes to clear
  11768. LEAQ 24(SP), DX // DX = start of the hash table
  11769. PXOR X0, X0
  // Zero the hash table, 128 bytes per iteration.
  11770. zero_loop_encodeSnappyBlockAsm10B:
  11771. MOVOU X0, (DX)
  11772. MOVOU X0, 16(DX)
  11773. MOVOU X0, 32(DX)
  11774. MOVOU X0, 48(DX)
  11775. MOVOU X0, 64(DX)
  11776. MOVOU X0, 80(DX)
  11777. MOVOU X0, 96(DX)
  11778. MOVOU X0, 112(DX)
  11779. ADDQ $0x80, DX
  11780. DECQ CX
  11781. JNZ zero_loop_encodeSnappyBlockAsm10B
  11782. MOVL $0x00000000, 12(SP) // first unemitted src index = 0
  11783. MOVQ src_len+32(FP), CX
  11784. LEAQ -9(CX), DX
  11785. LEAQ -8(CX), BX
  11786. MOVL BX, 8(SP) // search limit = src_len - 8
  11787. SHRQ $0x05, CX
  11788. SUBL CX, DX // DX = src_len - 9 - (src_len >> 5)
  11789. LEAQ (AX)(DX*1), DX
  11790. MOVQ DX, (SP) // dst limit pointer
  11791. MOVL $0x00000001, CX // searching starts at src index 1
  11792. MOVL CX, 16(SP) // initial repeat offset = 1
  11793. MOVQ src_base+24(FP), DX // DX = src base pointer from here on
  // Main search loop. CX is the src index being examined.
  11794. search_loop_encodeSnappyBlockAsm10B:
  11795. MOVL CX, BX
  11796. SUBL 12(SP), BX
  11797. SHRL $0x05, BX
  11798. LEAL 4(CX)(BX*1), BX // BX = CX + 4 + ((CX - first unemitted index) >> 5)
  11799. CMPL BX, 8(SP)
  11800. JAE emit_remainder_encodeSnappyBlockAsm10B // at or past the search limit: flush what is left
  11801. MOVQ (DX)(CX*1), SI // SI = 8 src bytes starting at CX
  11802. MOVL BX, 20(SP) // resume point used when no candidate matches
  11803. MOVQ $0x9e3779b1, R8 // hash multiplier
  11804. MOVQ SI, R9
  11805. MOVQ SI, R10
  11806. SHRQ $0x08, R10
  11807. SHLQ $0x20, R9 // keep only the low 4 bytes
  11808. IMULQ R8, R9
  11809. SHRQ $0x36, R9 // R9 = 10-bit hash of the 4 bytes at CX
  11810. SHLQ $0x20, R10
  11811. IMULQ R8, R10
  11812. SHRQ $0x36, R10 // R10 = 10-bit hash of the 4 bytes at CX+1
  11813. MOVL 24(SP)(R9*4), BX // BX = table candidate for CX
  11814. MOVL 24(SP)(R10*4), DI // DI = table candidate for CX+1
  11815. MOVL CX, 24(SP)(R9*4)
  11816. LEAL 1(CX), R9
  11817. MOVL R9, 24(SP)(R10*4)
  11818. MOVQ SI, R9
  11819. SHRQ $0x10, R9
  11820. SHLQ $0x20, R9
  11821. IMULQ R8, R9
  11822. SHRQ $0x36, R9 // R9 = 10-bit hash of the 4 bytes at CX+2
  11823. MOVL CX, R8
  11824. SUBL 16(SP), R8
  11825. MOVL 1(DX)(R8*1), R10 // 4 bytes at CX + 1 - repeat offset
  11826. MOVQ SI, R8
  11827. SHRQ $0x08, R8
  11828. CMPL R8, R10 // do the 4 bytes at CX+1 repeat at the current offset?
  11829. JNE no_repeat_found_encodeSnappyBlockAsm10B
  11830. LEAL 1(CX), SI // SI = start of the repeat match
  11831. MOVL 12(SP), BX
  11832. MOVL SI, DI
  11833. SUBL 16(SP), DI // DI = index of the earlier occurrence
  11834. JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
  // Step SI and DI backwards while the preceding bytes are equal, stopping at
  // the first unemitted index (BX) or when DI reaches 0.
  11835. repeat_extend_back_loop_encodeSnappyBlockAsm10B:
  11836. CMPL SI, BX
  11837. JBE repeat_extend_back_end_encodeSnappyBlockAsm10B
  11838. MOVB -1(DX)(DI*1), R8
  11839. MOVB -1(DX)(SI*1), R9
  11840. CMPB R8, R9
  11841. JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
  11842. LEAL -1(SI), SI
  11843. DECL DI
  11844. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
  // Emit src[12(SP):SI] as a literal; skipped when that range is empty.
  11845. repeat_extend_back_end_encodeSnappyBlockAsm10B:
  11846. MOVL 12(SP), BX
  11847. CMPL BX, SI
  11848. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
  11849. MOVL SI, DI
  11850. MOVL SI, 12(SP)
  11851. LEAQ (DX)(BX*1), R8 // R8 = literal source pointer
  11852. SUBL BX, DI // DI = literal length
  11853. LEAL -1(DI), BX // BX = length - 1, the value written into the header
  11854. CMPL BX, $0x3c
  11855. JB one_byte_repeat_emit_encodeSnappyBlockAsm10B
  11856. CMPL BX, $0x00000100
  11857. JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B
  11858. JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B // NOTE(review): never taken (same flags as the JB above); its target is the next line anyway
  11859. three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
  11860. MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length - 1)
  11861. MOVW BX, 1(AX)
  11862. ADDQ $0x03, AX
  11863. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
  11864. two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
  11865. MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length - 1)
  11866. MOVB BL, 1(AX)
  11867. ADDQ $0x02, AX
  11868. CMPL BX, $0x40
  11869. JB memmove_repeat_emit_encodeSnappyBlockAsm10B
  11870. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
  11871. one_byte_repeat_emit_encodeSnappyBlockAsm10B:
  11872. SHLB $0x02, BL // 1-byte header: (length - 1) << 2
  11873. MOVB BL, (AX)
  11874. ADDQ $0x01, AX
  // Short literal copy, reached with DI <= 64. Each size class uses loads and
  // stores anchored at both ends of the range, which may overlap in the middle.
  11875. memmove_repeat_emit_encodeSnappyBlockAsm10B:
  11876. LEAQ (AX)(DI*1), BX // BX = dst pointer just past the literal
  11877. // genMemMoveShort
  11878. CMPQ DI, $0x08
  11879. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
  11880. CMPQ DI, $0x10
  11881. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
  11882. CMPQ DI, $0x20
  11883. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
  11884. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
  11885. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
  11886. MOVQ (R8), R9 // moves a full 8 bytes even when DI < 8; AX is still set to AX + DI afterwards
  11887. MOVQ R9, (AX)
  11888. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11889. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
  11890. MOVQ (R8), R9
  11891. MOVQ -8(R8)(DI*1), R8
  11892. MOVQ R9, (AX)
  11893. MOVQ R8, -8(AX)(DI*1)
  11894. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11895. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
  11896. MOVOU (R8), X0
  11897. MOVOU -16(R8)(DI*1), X1
  11898. MOVOU X0, (AX)
  11899. MOVOU X1, -16(AX)(DI*1)
  11900. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11901. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
  11902. MOVOU (R8), X0
  11903. MOVOU 16(R8), X1
  11904. MOVOU -32(R8)(DI*1), X2
  11905. MOVOU -16(R8)(DI*1), X3
  11906. MOVOU X0, (AX)
  11907. MOVOU X1, 16(AX)
  11908. MOVOU X2, -32(AX)(DI*1)
  11909. MOVOU X3, -16(AX)(DI*1)
  11910. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
  11911. MOVQ BX, AX // advance the dst pointer past the literal
  11912. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
  // Long literal copy (reached with DI >= 65). The first and last 32 bytes are
  // loaded up front and stored last; the middle is copied in 32-byte steps.
  11913. memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
  11914. LEAQ (AX)(DI*1), BX // BX = dst pointer just past the literal
  11915. // genMemMoveLong
  11916. MOVOU (R8), X0
  11917. MOVOU 16(R8), X1
  11918. MOVOU -32(R8)(DI*1), X2
  11919. MOVOU -16(R8)(DI*1), X3
  11920. MOVQ DI, R10
  11921. SHRQ $0x05, R10 // R10 = number of whole 32-byte chunks
  11922. MOVQ AX, R9
  11923. ANDL $0x0000001f, R9 // R9 = dst misalignment within 32 bytes
  11924. MOVQ $0x00000040, R11
  11925. SUBQ R9, R11 // R11 = 64 - misalignment, so AX + R11 - 32 is 32-byte aligned
  11926. DECQ R10
  11927. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): with DI >= 65 the chunk count is >= 2, so this looks always taken - confirm
  11928. LEAQ -32(R8)(R11*1), R9
  11929. LEAQ -32(AX)(R11*1), R12
  11930. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
  11931. MOVOU (R9), X4
  11932. MOVOU 16(R9), X5
  11933. MOVOA X4, (R12)
  11934. MOVOA X5, 16(R12)
  11935. ADDQ $0x20, R12
  11936. ADDQ $0x20, R9
  11937. ADDQ $0x20, R11
  11938. DECQ R10
  11939. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
  // Copy 32 bytes per iteration with aligned stores until R11 exceeds the length.
  11940. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  11941. MOVOU -32(R8)(R11*1), X4
  11942. MOVOU -16(R8)(R11*1), X5
  11943. MOVOA X4, -32(AX)(R11*1)
  11944. MOVOA X5, -16(AX)(R11*1)
  11945. ADDQ $0x20, R11
  11946. CMPQ DI, R11
  11947. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  11948. MOVOU X0, (AX) // finally store the head and tail saved at the top
  11949. MOVOU X1, 16(AX)
  11950. MOVOU X2, -32(AX)(DI*1)
  11951. MOVOU X3, -16(AX)(DI*1)
  11952. MOVQ BX, AX
  // Extend the repeat forwards from CX+5 and count the matching bytes in R10.
  11953. emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
  11954. ADDL $0x05, CX // skip past the 4 bytes at CX+1 that were compared above
  11955. MOVL CX, BX
  11956. SUBL 16(SP), BX
  11957. MOVQ src_len+32(FP), DI
  11958. SUBL CX, DI // DI = bytes left in src from CX
  11959. LEAQ (DX)(CX*1), R8 // R8 = pointer to the current data
  11960. LEAQ (DX)(BX*1), BX // BX = pointer one repeat offset earlier
  11961. // matchLen
  11962. XORL R10, R10
  11963. matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
  11964. CMPL DI, $0x10
  11965. JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
  11966. MOVQ (R8)(R10*1), R9
  11967. MOVQ 8(R8)(R10*1), R11
  11968. XORQ (BX)(R10*1), R9 // non-zero result marks differing bits
  11969. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
  11970. XORQ 8(BX)(R10*1), R11
  11971. JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
  11972. LEAL -16(DI), DI
  11973. LEAL 16(R10), R10
  11974. JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B
  11975. matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
  11976. #ifdef GOAMD64_v3
  11977. TZCNTQ R11, R11
  11978. #else
  11979. BSFQ R11, R11
  11980. #endif
  11981. SARQ $0x03, R11 // lowest differing bit index / 8 = matching bytes in this qword
  11982. LEAL 8(R10)(R11*1), R10
  11983. JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
  11984. matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
  11985. CMPL DI, $0x08
  11986. JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
  11987. MOVQ (R8)(R10*1), R9
  11988. XORQ (BX)(R10*1), R9
  11989. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
  11990. LEAL -8(DI), DI
  11991. LEAL 8(R10), R10
  11992. JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
  11993. matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
  11994. #ifdef GOAMD64_v3
  11995. TZCNTQ R9, R9
  11996. #else
  11997. BSFQ R9, R9
  11998. #endif
  11999. SARQ $0x03, R9
  12000. LEAL (R10)(R9*1), R10
  12001. JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
  12002. matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
  12003. CMPL DI, $0x04
  12004. JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
  12005. MOVL (R8)(R10*1), R9
  12006. CMPL (BX)(R10*1), R9
  12007. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
  12008. LEAL -4(DI), DI
  12009. LEAL 4(R10), R10
  12010. matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
  12011. CMPL DI, $0x01
  12012. JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B // exactly one byte left
  12013. JB repeat_extend_forward_end_encodeSnappyBlockAsm10B // nothing left
  12014. MOVW (R8)(R10*1), R9
  12015. CMPW (BX)(R10*1), R9
  12016. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
  12017. LEAL 2(R10), R10
  12018. SUBL $0x02, DI
  12019. JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
  12020. matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
  12021. MOVB (R8)(R10*1), R9
  12022. CMPB (BX)(R10*1), R9
  12023. JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
  12024. LEAL 1(R10), R10
  12025. repeat_extend_forward_end_encodeSnappyBlockAsm10B:
  12026. ADDL R10, CX // CX = end of the match
  12027. MOVL CX, BX
  12028. SUBL SI, BX // BX = match length (SI = match start after the backward extension)
  12029. MOVL 16(SP), SI // SI = offset to encode
  12030. // emitCopy
  // While more than 64 bytes remain, emit a 3-byte copy (tag 0xee + 16-bit
  // offset) and reduce the remaining length by 60.
  12031. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
  12032. CMPL BX, $0x40
  12033. JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
  12034. MOVB $0xee, (AX)
  12035. MOVW SI, 1(AX)
  12036. LEAL -60(BX), BX
  12037. ADDQ $0x03, AX
  12038. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
  12039. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
  12040. MOVL BX, DI
  12041. SHLL $0x02, DI // DI = length * 4
  12042. CMPL BX, $0x0c
  12043. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // length >= 12 needs the 3-byte form
  12044. CMPL SI, $0x00000800
  12045. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // offset >= 2048 needs the 3-byte form
  12046. LEAL -15(DI), DI // 2-byte form: tag = (length * 4 - 15) | ((offset >> 8) << 5)
  12047. MOVB SI, 1(AX) // second byte = low 8 bits of the offset
  12048. SHRL $0x08, SI
  12049. SHLL $0x05, SI
  12050. ORL SI, DI
  12051. MOVB DI, (AX)
  12052. ADDQ $0x02, AX
  12053. JMP repeat_end_emit_encodeSnappyBlockAsm10B
  12054. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
  12055. LEAL -2(DI), DI // 3-byte form: tag = length * 4 - 2, then the 16-bit offset
  12056. MOVB DI, (AX)
  12057. MOVW SI, 1(AX)
  12058. ADDQ $0x03, AX
  12059. repeat_end_emit_encodeSnappyBlockAsm10B:
  12060. MOVL CX, 12(SP) // everything before CX has now been emitted
  12061. JMP search_loop_encodeSnappyBlockAsm10B
  // No repeat at CX+1: test the table candidates for CX, CX+1 and CX+2 in turn.
  12062. no_repeat_found_encodeSnappyBlockAsm10B:
  12063. CMPL (DX)(BX*1), SI // candidate for CX
  12064. JEQ candidate_match_encodeSnappyBlockAsm10B
  12065. SHRQ $0x08, SI // SI now starts with the byte at CX+1
  12066. MOVL 24(SP)(R9*4), BX // BX = table candidate for CX+2
  12067. LEAL 2(CX), R8
  12068. CMPL (DX)(DI*1), SI // candidate for CX+1
  12069. JEQ candidate2_match_encodeSnappyBlockAsm10B
  12070. MOVL R8, 24(SP)(R9*4) // record CX+2 in the table
  12071. SHRQ $0x08, SI // SI now starts with the byte at CX+2
  12072. CMPL (DX)(BX*1), SI // candidate for CX+2
  12073. JEQ candidate3_match_encodeSnappyBlockAsm10B
  12074. MOVL 20(SP), CX // no match: continue at the resume index saved above
  12075. JMP search_loop_encodeSnappyBlockAsm10B
  12076. candidate3_match_encodeSnappyBlockAsm10B:
  12077. ADDL $0x02, CX
  12078. JMP candidate_match_encodeSnappyBlockAsm10B
  12079. candidate2_match_encodeSnappyBlockAsm10B:
  12080. MOVL R8, 24(SP)(R9*4)
  12081. INCL CX
  12082. MOVL DI, BX
  // A 4-byte match exists between src[CX] and src[BX]. Extend it backwards
  // while the preceding bytes are equal, CX > first unemitted index and BX > 0.
  12083. candidate_match_encodeSnappyBlockAsm10B:
  12084. MOVL 12(SP), SI
  12085. TESTL BX, BX
  12086. JZ match_extend_back_end_encodeSnappyBlockAsm10B
  12087. match_extend_back_loop_encodeSnappyBlockAsm10B:
  12088. CMPL CX, SI
  12089. JBE match_extend_back_end_encodeSnappyBlockAsm10B
  12090. MOVB -1(DX)(BX*1), DI
  12091. MOVB -1(DX)(CX*1), R8
  12092. CMPB DI, R8
  12093. JNE match_extend_back_end_encodeSnappyBlockAsm10B
  12094. LEAL -1(CX), CX
  12095. DECL BX
  12096. JZ match_extend_back_end_encodeSnappyBlockAsm10B
  12097. JMP match_extend_back_loop_encodeSnappyBlockAsm10B
  // Return 0 unless dst pointer + pending literal length + 3 stays below the limit.
  12098. match_extend_back_end_encodeSnappyBlockAsm10B:
  12099. MOVL CX, SI
  12100. SUBL 12(SP), SI // SI = pending literal length
  12101. LEAQ 3(AX)(SI*1), SI
  12102. CMPQ SI, (SP)
  12103. JB match_dst_size_check_encodeSnappyBlockAsm10B
  12104. MOVQ $0x00000000, ret+48(FP)
  12105. RET
  // Emit src[12(SP):CX] as a literal; skipped when that range is empty.
  12106. match_dst_size_check_encodeSnappyBlockAsm10B:
  12107. MOVL CX, SI
  12108. MOVL 12(SP), DI
  12109. CMPL DI, SI
  12110. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
  12111. MOVL SI, R8
  12112. MOVL SI, 12(SP)
  12113. LEAQ (DX)(DI*1), SI // SI = literal source pointer
  12114. SUBL DI, R8 // R8 = literal length
  12115. LEAL -1(R8), DI // DI = length - 1, the value written into the header
  12116. CMPL DI, $0x3c
  12117. JB one_byte_match_emit_encodeSnappyBlockAsm10B
  12118. CMPL DI, $0x00000100
  12119. JB two_bytes_match_emit_encodeSnappyBlockAsm10B
  12120. JB three_bytes_match_emit_encodeSnappyBlockAsm10B // NOTE(review): never taken (same flags as the JB above); its target is the next line anyway
  12121. three_bytes_match_emit_encodeSnappyBlockAsm10B:
  12122. MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length - 1)
  12123. MOVW DI, 1(AX)
  12124. ADDQ $0x03, AX
  12125. JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
  12126. two_bytes_match_emit_encodeSnappyBlockAsm10B:
  12127. MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length - 1)
  12128. MOVB DI, 1(AX)
  12129. ADDQ $0x02, AX
  12130. CMPL DI, $0x40
  12131. JB memmove_match_emit_encodeSnappyBlockAsm10B
  12132. JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
  12133. one_byte_match_emit_encodeSnappyBlockAsm10B:
  12134. SHLB $0x02, DI // 1-byte header: (length - 1) << 2
  12135. MOVB DI, (AX)
  12136. ADDQ $0x01, AX
  // Short literal copy, reached with R8 <= 64; same size classes as the repeat path.
  12137. memmove_match_emit_encodeSnappyBlockAsm10B:
  12138. LEAQ (AX)(R8*1), DI // DI = dst pointer just past the literal
  12139. // genMemMoveShort
  12140. CMPQ R8, $0x08
  12141. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
  12142. CMPQ R8, $0x10
  12143. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
  12144. CMPQ R8, $0x20
  12145. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
  12146. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
  12147. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
  12148. MOVQ (SI), R9 // moves a full 8 bytes even when R8 < 8; AX is still set to AX + R8 afterwards
  12149. MOVQ R9, (AX)
  12150. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  12151. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
  12152. MOVQ (SI), R9
  12153. MOVQ -8(SI)(R8*1), SI
  12154. MOVQ R9, (AX)
  12155. MOVQ SI, -8(AX)(R8*1)
  12156. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  12157. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
  12158. MOVOU (SI), X0
  12159. MOVOU -16(SI)(R8*1), X1
  12160. MOVOU X0, (AX)
  12161. MOVOU X1, -16(AX)(R8*1)
  12162. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  12163. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
  12164. MOVOU (SI), X0
  12165. MOVOU 16(SI), X1
  12166. MOVOU -32(SI)(R8*1), X2
  12167. MOVOU -16(SI)(R8*1), X3
  12168. MOVOU X0, (AX)
  12169. MOVOU X1, 16(AX)
  12170. MOVOU X2, -32(AX)(R8*1)
  12171. MOVOU X3, -16(AX)(R8*1)
  12172. memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
  12173. MOVQ DI, AX // advance the dst pointer past the literal
  12174. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
  // Long literal copy (reached with R8 >= 65): head and tail are loaded first
  // and stored last; the middle is copied in 32-byte steps with aligned stores.
  12175. memmove_long_match_emit_encodeSnappyBlockAsm10B:
  12176. LEAQ (AX)(R8*1), DI // DI = dst pointer just past the literal
  12177. // genMemMoveLong
  12178. MOVOU (SI), X0
  12179. MOVOU 16(SI), X1
  12180. MOVOU -32(SI)(R8*1), X2
  12181. MOVOU -16(SI)(R8*1), X3
  12182. MOVQ R8, R10
  12183. SHRQ $0x05, R10 // R10 = number of whole 32-byte chunks
  12184. MOVQ AX, R9
  12185. ANDL $0x0000001f, R9 // R9 = dst misalignment within 32 bytes
  12186. MOVQ $0x00000040, R11
  12187. SUBQ R9, R11 // R11 = 64 - misalignment, so AX + R11 - 32 is 32-byte aligned
  12188. DECQ R10
  12189. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): with R8 >= 65 the chunk count is >= 2, so this looks always taken - confirm
  12190. LEAQ -32(SI)(R11*1), R9
  12191. LEAQ -32(AX)(R11*1), R12
  12192. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
  12193. MOVOU (R9), X4
  12194. MOVOU 16(R9), X5
  12195. MOVOA X4, (R12)
  12196. MOVOA X5, 16(R12)
  12197. ADDQ $0x20, R12
  12198. ADDQ $0x20, R9
  12199. ADDQ $0x20, R11
  12200. DECQ R10
  12201. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
  12202. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  12203. MOVOU -32(SI)(R11*1), X4
  12204. MOVOU -16(SI)(R11*1), X5
  12205. MOVOA X4, -32(AX)(R11*1)
  12206. MOVOA X5, -16(AX)(R11*1)
  12207. ADDQ $0x20, R11
  12208. CMPQ R8, R11
  12209. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  12210. MOVOU X0, (AX) // finally store the head and tail saved at the top
  12211. MOVOU X1, 16(AX)
  12212. MOVOU X2, -32(AX)(R8*1)
  12213. MOVOU X3, -16(AX)(R8*1)
  12214. MOVQ DI, AX
  // Encode a match between src[CX] and src[BX] with no pending literal.
  // Also re-entered directly from the rehash step below on an immediate match.
  12215. emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
  12216. match_nolit_loop_encodeSnappyBlockAsm10B:
  12217. MOVL CX, SI
  12218. SUBL BX, SI
  12219. MOVL SI, 16(SP) // new repeat offset = CX - candidate index
  12220. ADDL $0x04, CX // the 4 bytes compared by the caller already match
  12221. ADDL $0x04, BX
  12222. MOVQ src_len+32(FP), SI
  12223. SUBL CX, SI // SI = bytes left in src from CX
  12224. LEAQ (DX)(CX*1), DI // DI = pointer to the current data
  12225. LEAQ (DX)(BX*1), BX // BX = pointer to the candidate data
  12226. // matchLen
  12227. XORL R9, R9 // R9 = number of further matching bytes
  12228. matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
  12229. CMPL SI, $0x10
  12230. JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
  12231. MOVQ (DI)(R9*1), R8
  12232. MOVQ 8(DI)(R9*1), R10
  12233. XORQ (BX)(R9*1), R8
  12234. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
  12235. XORQ 8(BX)(R9*1), R10
  12236. JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
  12237. LEAL -16(SI), SI
  12238. LEAL 16(R9), R9
  12239. JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B
  12240. matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
  12241. #ifdef GOAMD64_v3
  12242. TZCNTQ R10, R10
  12243. #else
  12244. BSFQ R10, R10
  12245. #endif
  12246. SARQ $0x03, R10 // lowest differing bit index / 8 = matching bytes in this qword
  12247. LEAL 8(R9)(R10*1), R9
  12248. JMP match_nolit_end_encodeSnappyBlockAsm10B
  12249. matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
  12250. CMPL SI, $0x08
  12251. JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
  12252. MOVQ (DI)(R9*1), R8
  12253. XORQ (BX)(R9*1), R8
  12254. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
  12255. LEAL -8(SI), SI
  12256. LEAL 8(R9), R9
  12257. JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
  12258. matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
  12259. #ifdef GOAMD64_v3
  12260. TZCNTQ R8, R8
  12261. #else
  12262. BSFQ R8, R8
  12263. #endif
  12264. SARQ $0x03, R8
  12265. LEAL (R9)(R8*1), R9
  12266. JMP match_nolit_end_encodeSnappyBlockAsm10B
  12267. matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
  12268. CMPL SI, $0x04
  12269. JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
  12270. MOVL (DI)(R9*1), R8
  12271. CMPL (BX)(R9*1), R8
  12272. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
  12273. LEAL -4(SI), SI
  12274. LEAL 4(R9), R9
  12275. matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
  12276. CMPL SI, $0x01
  12277. JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B // exactly one byte left
  12278. JB match_nolit_end_encodeSnappyBlockAsm10B // nothing left
  12279. MOVW (DI)(R9*1), R8
  12280. CMPW (BX)(R9*1), R8
  12281. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
  12282. LEAL 2(R9), R9
  12283. SUBL $0x02, SI
  12284. JZ match_nolit_end_encodeSnappyBlockAsm10B
  12285. matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
  12286. MOVB (DI)(R9*1), R8
  12287. CMPB (BX)(R9*1), R8
  12288. JNE match_nolit_end_encodeSnappyBlockAsm10B
  12289. LEAL 1(R9), R9
  12290. match_nolit_end_encodeSnappyBlockAsm10B:
  12291. ADDL R9, CX // CX = end of the match
  12292. MOVL 16(SP), BX // BX = offset to encode
  12293. ADDL $0x04, R9 // R9 = total match length including the first 4 bytes
  12294. MOVL CX, 12(SP) // everything before CX counts as emitted
  12295. // emitCopy
  // While more than 64 bytes remain, emit a 3-byte copy (tag 0xee + 16-bit
  // offset) and reduce the remaining length by 60.
  12296. two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
  12297. CMPL R9, $0x40
  12298. JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
  12299. MOVB $0xee, (AX)
  12300. MOVW BX, 1(AX)
  12301. LEAL -60(R9), R9
  12302. ADDQ $0x03, AX
  12303. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
  12304. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
  12305. MOVL R9, SI
  12306. SHLL $0x02, SI // SI = length * 4
  12307. CMPL R9, $0x0c
  12308. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // length >= 12 needs the 3-byte form
  12309. CMPL BX, $0x00000800
  12310. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // offset >= 2048 needs the 3-byte form
  12311. LEAL -15(SI), SI // 2-byte form: tag = (length * 4 - 15) | ((offset >> 8) << 5)
  12312. MOVB BL, 1(AX) // second byte = low 8 bits of the offset
  12313. SHRL $0x08, BX
  12314. SHLL $0x05, BX
  12315. ORL BX, SI
  12316. MOVB SI, (AX)
  12317. ADDQ $0x02, AX
  12318. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
  12319. emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
  12320. LEAL -2(SI), SI // 3-byte form: tag = length * 4 - 2, then the 16-bit offset
  12321. MOVB SI, (AX)
  12322. MOVW BX, 1(AX)
  12323. ADDQ $0x03, AX
  12324. match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
  12325. CMPL CX, 8(SP)
  12326. JAE emit_remainder_encodeSnappyBlockAsm10B // at or past the search limit
  12327. MOVQ -2(DX)(CX*1), SI // SI = 8 src bytes starting at CX-2
  12328. CMPQ AX, (SP)
  12329. JB match_nolit_dst_ok_encodeSnappyBlockAsm10B
  12330. MOVQ $0x00000000, ret+48(FP) // dst pointer reached the limit: return 0
  12331. RET
  // Record CX-2 and CX in the hash table, then test the previous entry for CX
  // for an immediate follow-on match.
  12332. match_nolit_dst_ok_encodeSnappyBlockAsm10B:
  12333. MOVQ $0x9e3779b1, R8
  12334. MOVQ SI, DI
  12335. SHRQ $0x10, SI // SI now starts with the byte at CX
  12336. MOVQ SI, BX
  12337. SHLQ $0x20, DI
  12338. IMULQ R8, DI
  12339. SHRQ $0x36, DI // DI = 10-bit hash of the 4 bytes at CX-2
  12340. SHLQ $0x20, BX
  12341. IMULQ R8, BX
  12342. SHRQ $0x36, BX // BX = 10-bit hash of the 4 bytes at CX
  12343. LEAL -2(CX), R8
  12344. LEAQ 24(SP)(BX*4), R9
  12345. MOVL (R9), BX // BX = previous table entry for the hash of CX
  12346. MOVL R8, 24(SP)(DI*4)
  12347. MOVL CX, (R9)
  12348. CMPL (DX)(BX*1), SI
  12349. JEQ match_nolit_loop_encodeSnappyBlockAsm10B // another match right away, no literal in between
  12350. INCL CX
  12351. JMP search_loop_encodeSnappyBlockAsm10B
  // Emit src[12(SP):src_len] as a final literal, after checking that
  // dst pointer + remaining length + 3 stays below the dst limit.
  12352. emit_remainder_encodeSnappyBlockAsm10B:
  12353. MOVQ src_len+32(FP), CX
  12354. SUBL 12(SP), CX
  12355. LEAQ 3(AX)(CX*1), CX
  12356. CMPQ CX, (SP)
  12357. JB emit_remainder_ok_encodeSnappyBlockAsm10B
  12358. MOVQ $0x00000000, ret+48(FP)
  12359. RET
  // From here on CX and DX are reused as scratch registers.
  12360. emit_remainder_ok_encodeSnappyBlockAsm10B:
  12361. MOVQ src_len+32(FP), CX
  12362. MOVL 12(SP), BX
  12363. CMPL BX, CX
  12364. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B // nothing left to emit
  12365. MOVL CX, SI
  12366. MOVL CX, 12(SP)
  12367. LEAQ (DX)(BX*1), CX // CX = literal source pointer
  12368. SUBL BX, SI // SI = literal length
  12369. LEAL -1(SI), DX // DX = length - 1, the value written into the header
  12370. CMPL DX, $0x3c
  12371. JB one_byte_emit_remainder_encodeSnappyBlockAsm10B
  12372. CMPL DX, $0x00000100
  12373. JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B
  12374. JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B // NOTE(review): never taken (same flags as the JB above); its target is the next line anyway
  12375. three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
  12376. MOVB $0xf4, (AX) // 3-byte header: 0xf4 followed by 16-bit (length - 1)
  12377. MOVW DX, 1(AX)
  12378. ADDQ $0x03, AX
  12379. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
  12380. two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
  12381. MOVB $0xf0, (AX) // 2-byte header: 0xf0 followed by 8-bit (length - 1)
  12382. MOVB DL, 1(AX)
  12383. ADDQ $0x02, AX
  12384. CMPL DX, $0x40
  12385. JB memmove_emit_remainder_encodeSnappyBlockAsm10B
  12386. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
  12387. one_byte_emit_remainder_encodeSnappyBlockAsm10B:
  12388. SHLB $0x02, DL // 1-byte header: (length - 1) << 2
  12389. MOVB DL, (AX)
  12390. ADDQ $0x01, AX
  // Short copy of the final literal. Unlike the two copies above, lengths
  // below 8 get their own size classes, so no access goes past the range.
  12391. memmove_emit_remainder_encodeSnappyBlockAsm10B:
  12392. LEAQ (AX)(SI*1), DX // DX = dst pointer just past the literal
  12393. MOVL SI, BX // BX = literal length
  12394. // genMemMoveShort
  12395. CMPQ BX, $0x03
  12396. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
  12397. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
  12398. CMPQ BX, $0x08
  12399. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
  12400. CMPQ BX, $0x10
  12401. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
  12402. CMPQ BX, $0x20
  12403. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
  12404. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
  12405. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
  12406. MOVB (CX), SI
  12407. MOVB -1(CX)(BX*1), CL // last byte; the same byte as the first when BX = 1
  12408. MOVB SI, (AX)
  12409. MOVB CL, -1(AX)(BX*1)
  12410. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  12411. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
  12412. MOVW (CX), SI
  12413. MOVB 2(CX), CL
  12414. MOVW SI, (AX)
  12415. MOVB CL, 2(AX)
  12416. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  12417. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
  12418. MOVL (CX), SI
  12419. MOVL -4(CX)(BX*1), CX
  12420. MOVL SI, (AX)
  12421. MOVL CX, -4(AX)(BX*1)
  12422. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  12423. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
  12424. MOVQ (CX), SI
  12425. MOVQ -8(CX)(BX*1), CX
  12426. MOVQ SI, (AX)
  12427. MOVQ CX, -8(AX)(BX*1)
  12428. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  12429. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
  12430. MOVOU (CX), X0
  12431. MOVOU -16(CX)(BX*1), X1
  12432. MOVOU X0, (AX)
  12433. MOVOU X1, -16(AX)(BX*1)
  12434. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  12435. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
  12436. MOVOU (CX), X0
  12437. MOVOU 16(CX), X1
  12438. MOVOU -32(CX)(BX*1), X2
  12439. MOVOU -16(CX)(BX*1), X3
  12440. MOVOU X0, (AX)
  12441. MOVOU X1, 16(AX)
  12442. MOVOU X2, -32(AX)(BX*1)
  12443. MOVOU X3, -16(AX)(BX*1)
  12444. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
  12445. MOVQ DX, AX // advance the dst pointer past the literal
  12446. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
  // Long copy of the final literal (reached with SI >= 65): head and tail are
  // loaded first and stored last; the middle is copied in 32-byte steps.
  12447. memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
  12448. LEAQ (AX)(SI*1), DX // DX = dst pointer just past the literal
  12449. MOVL SI, BX // BX = literal length
  12450. // genMemMoveLong
  12451. MOVOU (CX), X0
  12452. MOVOU 16(CX), X1
  12453. MOVOU -32(CX)(BX*1), X2
  12454. MOVOU -16(CX)(BX*1), X3
  12455. MOVQ BX, DI
  12456. SHRQ $0x05, DI // DI = number of whole 32-byte chunks
  12457. MOVQ AX, SI
  12458. ANDL $0x0000001f, SI // SI = dst misalignment within 32 bytes
  12459. MOVQ $0x00000040, R8
  12460. SUBQ SI, R8 // R8 = 64 - misalignment, so AX + R8 - 32 is 32-byte aligned
  12461. DECQ DI
  12462. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // NOTE(review): with BX >= 65 the chunk count is >= 2, so this looks always taken - confirm
  12463. LEAQ -32(CX)(R8*1), SI
  12464. LEAQ -32(AX)(R8*1), R9
  12465. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
  12466. MOVOU (SI), X4
  12467. MOVOU 16(SI), X5
  12468. MOVOA X4, (R9)
  12469. MOVOA X5, 16(R9)
  12470. ADDQ $0x20, R9
  12471. ADDQ $0x20, SI
  12472. ADDQ $0x20, R8
  12473. DECQ DI
  12474. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
  12475. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  12476. MOVOU -32(CX)(R8*1), X4
  12477. MOVOU -16(CX)(R8*1), X5
  12478. MOVOA X4, -32(AX)(R8*1)
  12479. MOVOA X5, -16(AX)(R8*1)
  12480. ADDQ $0x20, R8
  12481. CMPQ BX, R8
  12482. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  12483. MOVOU X0, (AX) // finally store the head and tail saved at the top
  12484. MOVOU X1, 16(AX)
  12485. MOVOU X2, -32(AX)(BX*1)
  12486. MOVOU X3, -16(AX)(BX*1)
  12487. MOVQ DX, AX
  // Return the number of bytes written: dst write pointer - dst_base.
  12488. emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
  12489. MOVQ dst_base+0(FP), CX
  12490. SUBQ CX, AX
  12491. MOVQ AX, ret+48(FP)
  12492. RET
  12493. // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
  12494. // Requires: BMI, SSE2
  12495. TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
  12496. MOVQ dst_base+0(FP), AX
  12497. MOVQ $0x00000008, CX
  12498. LEAQ 24(SP), DX
  12499. PXOR X0, X0
  12500. zero_loop_encodeSnappyBlockAsm8B:
  12501. MOVOU X0, (DX)
  12502. MOVOU X0, 16(DX)
  12503. MOVOU X0, 32(DX)
  12504. MOVOU X0, 48(DX)
  12505. MOVOU X0, 64(DX)
  12506. MOVOU X0, 80(DX)
  12507. MOVOU X0, 96(DX)
  12508. MOVOU X0, 112(DX)
  12509. ADDQ $0x80, DX
  12510. DECQ CX
  12511. JNZ zero_loop_encodeSnappyBlockAsm8B
  12512. MOVL $0x00000000, 12(SP)
  12513. MOVQ src_len+32(FP), CX
  12514. LEAQ -9(CX), DX
  12515. LEAQ -8(CX), BX
  12516. MOVL BX, 8(SP)
  12517. SHRQ $0x05, CX
  12518. SUBL CX, DX
  12519. LEAQ (AX)(DX*1), DX
  12520. MOVQ DX, (SP)
  12521. MOVL $0x00000001, CX
  12522. MOVL CX, 16(SP)
  12523. MOVQ src_base+24(FP), DX
  12524. search_loop_encodeSnappyBlockAsm8B:
  12525. MOVL CX, BX
  12526. SUBL 12(SP), BX
  12527. SHRL $0x04, BX
  12528. LEAL 4(CX)(BX*1), BX
  12529. CMPL BX, 8(SP)
  12530. JAE emit_remainder_encodeSnappyBlockAsm8B
  12531. MOVQ (DX)(CX*1), SI
  12532. MOVL BX, 20(SP)
  12533. MOVQ $0x9e3779b1, R8
  12534. MOVQ SI, R9
  12535. MOVQ SI, R10
  12536. SHRQ $0x08, R10
  12537. SHLQ $0x20, R9
  12538. IMULQ R8, R9
  12539. SHRQ $0x38, R9
  12540. SHLQ $0x20, R10
  12541. IMULQ R8, R10
  12542. SHRQ $0x38, R10
  12543. MOVL 24(SP)(R9*4), BX
  12544. MOVL 24(SP)(R10*4), DI
  12545. MOVL CX, 24(SP)(R9*4)
  12546. LEAL 1(CX), R9
  12547. MOVL R9, 24(SP)(R10*4)
  12548. MOVQ SI, R9
  12549. SHRQ $0x10, R9
  12550. SHLQ $0x20, R9
  12551. IMULQ R8, R9
  12552. SHRQ $0x38, R9
  12553. MOVL CX, R8
  12554. SUBL 16(SP), R8
  12555. MOVL 1(DX)(R8*1), R10
  12556. MOVQ SI, R8
  12557. SHRQ $0x08, R8
  12558. CMPL R8, R10
  12559. JNE no_repeat_found_encodeSnappyBlockAsm8B
  12560. LEAL 1(CX), SI
  12561. MOVL 12(SP), BX
  12562. MOVL SI, DI
  12563. SUBL 16(SP), DI
  12564. JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
  12565. repeat_extend_back_loop_encodeSnappyBlockAsm8B:
  12566. CMPL SI, BX
  12567. JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
  12568. MOVB -1(DX)(DI*1), R8
  12569. MOVB -1(DX)(SI*1), R9
  12570. CMPB R8, R9
  12571. JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
  12572. LEAL -1(SI), SI
  12573. DECL DI
  12574. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
  12575. repeat_extend_back_end_encodeSnappyBlockAsm8B:
  12576. MOVL 12(SP), BX
  12577. CMPL BX, SI
  12578. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
  12579. MOVL SI, DI
  12580. MOVL SI, 12(SP)
  12581. LEAQ (DX)(BX*1), R8
  12582. SUBL BX, DI
  12583. LEAL -1(DI), BX
  12584. CMPL BX, $0x3c
  12585. JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
  12586. CMPL BX, $0x00000100
  12587. JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
  12588. JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B
  12589. three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
  12590. MOVB $0xf4, (AX)
  12591. MOVW BX, 1(AX)
  12592. ADDQ $0x03, AX
  12593. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
  12594. two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
  12595. MOVB $0xf0, (AX)
  12596. MOVB BL, 1(AX)
  12597. ADDQ $0x02, AX
  12598. CMPL BX, $0x40
  12599. JB memmove_repeat_emit_encodeSnappyBlockAsm8B
  12600. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
  12601. one_byte_repeat_emit_encodeSnappyBlockAsm8B:
  12602. SHLB $0x02, BL
  12603. MOVB BL, (AX)
  12604. ADDQ $0x01, AX
  12605. memmove_repeat_emit_encodeSnappyBlockAsm8B:
  12606. LEAQ (AX)(DI*1), BX
  12607. // genMemMoveShort
  12608. CMPQ DI, $0x08
  12609. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
  12610. CMPQ DI, $0x10
  12611. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
  12612. CMPQ DI, $0x20
  12613. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
  12614. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
  12615. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
  12616. MOVQ (R8), R9
  12617. MOVQ R9, (AX)
  12618. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12619. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
  12620. MOVQ (R8), R9
  12621. MOVQ -8(R8)(DI*1), R8
  12622. MOVQ R9, (AX)
  12623. MOVQ R8, -8(AX)(DI*1)
  12624. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12625. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
  12626. MOVOU (R8), X0
  12627. MOVOU -16(R8)(DI*1), X1
  12628. MOVOU X0, (AX)
  12629. MOVOU X1, -16(AX)(DI*1)
  12630. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12631. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
  12632. MOVOU (R8), X0
  12633. MOVOU 16(R8), X1
  12634. MOVOU -32(R8)(DI*1), X2
  12635. MOVOU -16(R8)(DI*1), X3
  12636. MOVOU X0, (AX)
  12637. MOVOU X1, 16(AX)
  12638. MOVOU X2, -32(AX)(DI*1)
  12639. MOVOU X3, -16(AX)(DI*1)
  12640. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
  12641. MOVQ BX, AX
  12642. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
  12643. memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
  12644. LEAQ (AX)(DI*1), BX
  12645. // genMemMoveLong
  12646. MOVOU (R8), X0
  12647. MOVOU 16(R8), X1
  12648. MOVOU -32(R8)(DI*1), X2
  12649. MOVOU -16(R8)(DI*1), X3
  12650. MOVQ DI, R10
  12651. SHRQ $0x05, R10
  12652. MOVQ AX, R9
  12653. ANDL $0x0000001f, R9
  12654. MOVQ $0x00000040, R11
  12655. SUBQ R9, R11
  12656. DECQ R10
  12657. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12658. LEAQ -32(R8)(R11*1), R9
  12659. LEAQ -32(AX)(R11*1), R12
  12660. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
  12661. MOVOU (R9), X4
  12662. MOVOU 16(R9), X5
  12663. MOVOA X4, (R12)
  12664. MOVOA X5, 16(R12)
  12665. ADDQ $0x20, R12
  12666. ADDQ $0x20, R9
  12667. ADDQ $0x20, R11
  12668. DECQ R10
  12669. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
  12670. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  12671. MOVOU -32(R8)(R11*1), X4
  12672. MOVOU -16(R8)(R11*1), X5
  12673. MOVOA X4, -32(AX)(R11*1)
  12674. MOVOA X5, -16(AX)(R11*1)
  12675. ADDQ $0x20, R11
  12676. CMPQ DI, R11
  12677. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12678. MOVOU X0, (AX)
  12679. MOVOU X1, 16(AX)
  12680. MOVOU X2, -32(AX)(DI*1)
  12681. MOVOU X3, -16(AX)(DI*1)
  12682. MOVQ BX, AX
  12683. emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
  12684. ADDL $0x05, CX
  12685. MOVL CX, BX
  12686. SUBL 16(SP), BX
  12687. MOVQ src_len+32(FP), DI
  12688. SUBL CX, DI
  12689. LEAQ (DX)(CX*1), R8
  12690. LEAQ (DX)(BX*1), BX
  12691. // matchLen
  12692. XORL R10, R10
  12693. matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
  12694. CMPL DI, $0x10
  12695. JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
  12696. MOVQ (R8)(R10*1), R9
  12697. MOVQ 8(R8)(R10*1), R11
  12698. XORQ (BX)(R10*1), R9
  12699. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
  12700. XORQ 8(BX)(R10*1), R11
  12701. JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
  12702. LEAL -16(DI), DI
  12703. LEAL 16(R10), R10
  12704. JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B
  12705. matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
  12706. #ifdef GOAMD64_v3
  12707. TZCNTQ R11, R11
  12708. #else
  12709. BSFQ R11, R11
  12710. #endif
  12711. SARQ $0x03, R11
  12712. LEAL 8(R10)(R11*1), R10
  12713. JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12714. matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
  12715. CMPL DI, $0x08
  12716. JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
  12717. MOVQ (R8)(R10*1), R9
  12718. XORQ (BX)(R10*1), R9
  12719. JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
  12720. LEAL -8(DI), DI
  12721. LEAL 8(R10), R10
  12722. JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
  12723. matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
  12724. #ifdef GOAMD64_v3
  12725. TZCNTQ R9, R9
  12726. #else
  12727. BSFQ R9, R9
  12728. #endif
  12729. SARQ $0x03, R9
  12730. LEAL (R10)(R9*1), R10
  12731. JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12732. matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
  12733. CMPL DI, $0x04
  12734. JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
  12735. MOVL (R8)(R10*1), R9
  12736. CMPL (BX)(R10*1), R9
  12737. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
  12738. LEAL -4(DI), DI
  12739. LEAL 4(R10), R10
  12740. matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
  12741. CMPL DI, $0x01
  12742. JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
  12743. JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12744. MOVW (R8)(R10*1), R9
  12745. CMPW (BX)(R10*1), R9
  12746. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
  12747. LEAL 2(R10), R10
  12748. SUBL $0x02, DI
  12749. JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12750. matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
  12751. MOVB (R8)(R10*1), R9
  12752. CMPB (BX)(R10*1), R9
  12753. JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12754. LEAL 1(R10), R10
  12755. repeat_extend_forward_end_encodeSnappyBlockAsm8B:
  12756. ADDL R10, CX
  12757. MOVL CX, BX
  12758. SUBL SI, BX
  12759. MOVL 16(SP), SI
  12760. // emitCopy
  12761. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
  12762. CMPL BX, $0x40
  12763. JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
  12764. MOVB $0xee, (AX)
  12765. MOVW SI, 1(AX)
  12766. LEAL -60(BX), BX
  12767. ADDQ $0x03, AX
  12768. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
  12769. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
  12770. MOVL BX, DI
  12771. SHLL $0x02, DI
  12772. CMPL BX, $0x0c
  12773. JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
  12774. LEAL -15(DI), DI
  12775. MOVB SI, 1(AX)
  12776. SHRL $0x08, SI
  12777. SHLL $0x05, SI
  12778. ORL SI, DI
  12779. MOVB DI, (AX)
  12780. ADDQ $0x02, AX
  12781. JMP repeat_end_emit_encodeSnappyBlockAsm8B
  12782. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
  12783. LEAL -2(DI), DI
  12784. MOVB DI, (AX)
  12785. MOVW SI, 1(AX)
  12786. ADDQ $0x03, AX
  12787. repeat_end_emit_encodeSnappyBlockAsm8B:
  12788. MOVL CX, 12(SP)
  12789. JMP search_loop_encodeSnappyBlockAsm8B
  12790. no_repeat_found_encodeSnappyBlockAsm8B:
  12791. CMPL (DX)(BX*1), SI
  12792. JEQ candidate_match_encodeSnappyBlockAsm8B
  12793. SHRQ $0x08, SI
  12794. MOVL 24(SP)(R9*4), BX
  12795. LEAL 2(CX), R8
  12796. CMPL (DX)(DI*1), SI
  12797. JEQ candidate2_match_encodeSnappyBlockAsm8B
  12798. MOVL R8, 24(SP)(R9*4)
  12799. SHRQ $0x08, SI
  12800. CMPL (DX)(BX*1), SI
  12801. JEQ candidate3_match_encodeSnappyBlockAsm8B
  12802. MOVL 20(SP), CX
  12803. JMP search_loop_encodeSnappyBlockAsm8B
  12804. candidate3_match_encodeSnappyBlockAsm8B:
  12805. ADDL $0x02, CX
  12806. JMP candidate_match_encodeSnappyBlockAsm8B
  12807. candidate2_match_encodeSnappyBlockAsm8B:
  12808. MOVL R8, 24(SP)(R9*4)
  12809. INCL CX
  12810. MOVL DI, BX
  12811. candidate_match_encodeSnappyBlockAsm8B:
  12812. MOVL 12(SP), SI
  12813. TESTL BX, BX
  12814. JZ match_extend_back_end_encodeSnappyBlockAsm8B
  12815. match_extend_back_loop_encodeSnappyBlockAsm8B:
  12816. CMPL CX, SI
  12817. JBE match_extend_back_end_encodeSnappyBlockAsm8B
  12818. MOVB -1(DX)(BX*1), DI
  12819. MOVB -1(DX)(CX*1), R8
  12820. CMPB DI, R8
  12821. JNE match_extend_back_end_encodeSnappyBlockAsm8B
  12822. LEAL -1(CX), CX
  12823. DECL BX
  12824. JZ match_extend_back_end_encodeSnappyBlockAsm8B
  12825. JMP match_extend_back_loop_encodeSnappyBlockAsm8B
  12826. match_extend_back_end_encodeSnappyBlockAsm8B:
  12827. MOVL CX, SI
  12828. SUBL 12(SP), SI
  12829. LEAQ 3(AX)(SI*1), SI
  12830. CMPQ SI, (SP)
  12831. JB match_dst_size_check_encodeSnappyBlockAsm8B
  12832. MOVQ $0x00000000, ret+48(FP)
  12833. RET
  12834. match_dst_size_check_encodeSnappyBlockAsm8B:
  12835. MOVL CX, SI
  12836. MOVL 12(SP), DI
  12837. CMPL DI, SI
  12838. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
  12839. MOVL SI, R8
  12840. MOVL SI, 12(SP)
  12841. LEAQ (DX)(DI*1), SI
  12842. SUBL DI, R8
  12843. LEAL -1(R8), DI
  12844. CMPL DI, $0x3c
  12845. JB one_byte_match_emit_encodeSnappyBlockAsm8B
  12846. CMPL DI, $0x00000100
  12847. JB two_bytes_match_emit_encodeSnappyBlockAsm8B
  12848. JB three_bytes_match_emit_encodeSnappyBlockAsm8B
  12849. three_bytes_match_emit_encodeSnappyBlockAsm8B:
  12850. MOVB $0xf4, (AX)
  12851. MOVW DI, 1(AX)
  12852. ADDQ $0x03, AX
  12853. JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
  12854. two_bytes_match_emit_encodeSnappyBlockAsm8B:
  12855. MOVB $0xf0, (AX)
  12856. MOVB DI, 1(AX)
  12857. ADDQ $0x02, AX
  12858. CMPL DI, $0x40
  12859. JB memmove_match_emit_encodeSnappyBlockAsm8B
  12860. JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
  12861. one_byte_match_emit_encodeSnappyBlockAsm8B:
  12862. SHLB $0x02, DI
  12863. MOVB DI, (AX)
  12864. ADDQ $0x01, AX
  12865. memmove_match_emit_encodeSnappyBlockAsm8B:
  12866. LEAQ (AX)(R8*1), DI
  12867. // genMemMoveShort
  12868. CMPQ R8, $0x08
  12869. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
  12870. CMPQ R8, $0x10
  12871. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
  12872. CMPQ R8, $0x20
  12873. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
  12874. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
  12875. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
  12876. MOVQ (SI), R9
  12877. MOVQ R9, (AX)
  12878. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12879. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
  12880. MOVQ (SI), R9
  12881. MOVQ -8(SI)(R8*1), SI
  12882. MOVQ R9, (AX)
  12883. MOVQ SI, -8(AX)(R8*1)
  12884. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12885. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
  12886. MOVOU (SI), X0
  12887. MOVOU -16(SI)(R8*1), X1
  12888. MOVOU X0, (AX)
  12889. MOVOU X1, -16(AX)(R8*1)
  12890. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12891. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
  12892. MOVOU (SI), X0
  12893. MOVOU 16(SI), X1
  12894. MOVOU -32(SI)(R8*1), X2
  12895. MOVOU -16(SI)(R8*1), X3
  12896. MOVOU X0, (AX)
  12897. MOVOU X1, 16(AX)
  12898. MOVOU X2, -32(AX)(R8*1)
  12899. MOVOU X3, -16(AX)(R8*1)
  12900. memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
  12901. MOVQ DI, AX
  12902. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
  12903. memmove_long_match_emit_encodeSnappyBlockAsm8B:
  12904. LEAQ (AX)(R8*1), DI
  12905. // genMemMoveLong
  12906. MOVOU (SI), X0
  12907. MOVOU 16(SI), X1
  12908. MOVOU -32(SI)(R8*1), X2
  12909. MOVOU -16(SI)(R8*1), X3
  12910. MOVQ R8, R10
  12911. SHRQ $0x05, R10
  12912. MOVQ AX, R9
  12913. ANDL $0x0000001f, R9
  12914. MOVQ $0x00000040, R11
  12915. SUBQ R9, R11
  12916. DECQ R10
  12917. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12918. LEAQ -32(SI)(R11*1), R9
  12919. LEAQ -32(AX)(R11*1), R12
  12920. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
  12921. MOVOU (R9), X4
  12922. MOVOU 16(R9), X5
  12923. MOVOA X4, (R12)
  12924. MOVOA X5, 16(R12)
  12925. ADDQ $0x20, R12
  12926. ADDQ $0x20, R9
  12927. ADDQ $0x20, R11
  12928. DECQ R10
  12929. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
  12930. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  12931. MOVOU -32(SI)(R11*1), X4
  12932. MOVOU -16(SI)(R11*1), X5
  12933. MOVOA X4, -32(AX)(R11*1)
  12934. MOVOA X5, -16(AX)(R11*1)
  12935. ADDQ $0x20, R11
  12936. CMPQ R8, R11
  12937. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12938. MOVOU X0, (AX)
  12939. MOVOU X1, 16(AX)
  12940. MOVOU X2, -32(AX)(R8*1)
  12941. MOVOU X3, -16(AX)(R8*1)
  12942. MOVQ DI, AX
  12943. emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
  12944. match_nolit_loop_encodeSnappyBlockAsm8B:
  12945. MOVL CX, SI
  12946. SUBL BX, SI
  12947. MOVL SI, 16(SP)
  12948. ADDL $0x04, CX
  12949. ADDL $0x04, BX
  12950. MOVQ src_len+32(FP), SI
  12951. SUBL CX, SI
  12952. LEAQ (DX)(CX*1), DI
  12953. LEAQ (DX)(BX*1), BX
  12954. // matchLen
  12955. XORL R9, R9
  12956. matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
  12957. CMPL SI, $0x10
  12958. JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
  12959. MOVQ (DI)(R9*1), R8
  12960. MOVQ 8(DI)(R9*1), R10
  12961. XORQ (BX)(R9*1), R8
  12962. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
  12963. XORQ 8(BX)(R9*1), R10
  12964. JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
  12965. LEAL -16(SI), SI
  12966. LEAL 16(R9), R9
  12967. JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B
  12968. matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
  12969. #ifdef GOAMD64_v3
  12970. TZCNTQ R10, R10
  12971. #else
  12972. BSFQ R10, R10
  12973. #endif
  12974. SARQ $0x03, R10
  12975. LEAL 8(R9)(R10*1), R9
  12976. JMP match_nolit_end_encodeSnappyBlockAsm8B
  12977. matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
  12978. CMPL SI, $0x08
  12979. JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
  12980. MOVQ (DI)(R9*1), R8
  12981. XORQ (BX)(R9*1), R8
  12982. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
  12983. LEAL -8(SI), SI
  12984. LEAL 8(R9), R9
  12985. JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
  12986. matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
  12987. #ifdef GOAMD64_v3
  12988. TZCNTQ R8, R8
  12989. #else
  12990. BSFQ R8, R8
  12991. #endif
  12992. SARQ $0x03, R8
  12993. LEAL (R9)(R8*1), R9
  12994. JMP match_nolit_end_encodeSnappyBlockAsm8B
  12995. matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
  12996. CMPL SI, $0x04
  12997. JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
  12998. MOVL (DI)(R9*1), R8
  12999. CMPL (BX)(R9*1), R8
  13000. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
  13001. LEAL -4(SI), SI
  13002. LEAL 4(R9), R9
  13003. matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
  13004. CMPL SI, $0x01
  13005. JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
  13006. JB match_nolit_end_encodeSnappyBlockAsm8B
  13007. MOVW (DI)(R9*1), R8
  13008. CMPW (BX)(R9*1), R8
  13009. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
  13010. LEAL 2(R9), R9
  13011. SUBL $0x02, SI
  13012. JZ match_nolit_end_encodeSnappyBlockAsm8B
  13013. matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
  13014. MOVB (DI)(R9*1), R8
  13015. CMPB (BX)(R9*1), R8
  13016. JNE match_nolit_end_encodeSnappyBlockAsm8B
  13017. LEAL 1(R9), R9
  13018. match_nolit_end_encodeSnappyBlockAsm8B:
  13019. ADDL R9, CX
  13020. MOVL 16(SP), BX
  13021. ADDL $0x04, R9
  13022. MOVL CX, 12(SP)
  13023. // emitCopy
  13024. two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
  13025. CMPL R9, $0x40
  13026. JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
  13027. MOVB $0xee, (AX)
  13028. MOVW BX, 1(AX)
  13029. LEAL -60(R9), R9
  13030. ADDQ $0x03, AX
  13031. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
  13032. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
  13033. MOVL R9, SI
  13034. SHLL $0x02, SI
  13035. CMPL R9, $0x0c
  13036. JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
  13037. LEAL -15(SI), SI
  13038. MOVB BL, 1(AX)
  13039. SHRL $0x08, BX
  13040. SHLL $0x05, BX
  13041. ORL BX, SI
  13042. MOVB SI, (AX)
  13043. ADDQ $0x02, AX
  13044. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
  13045. emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
  13046. LEAL -2(SI), SI
  13047. MOVB SI, (AX)
  13048. MOVW BX, 1(AX)
  13049. ADDQ $0x03, AX
  13050. match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
  13051. CMPL CX, 8(SP)
  13052. JAE emit_remainder_encodeSnappyBlockAsm8B
  13053. MOVQ -2(DX)(CX*1), SI
  13054. CMPQ AX, (SP)
  13055. JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
  13056. MOVQ $0x00000000, ret+48(FP)
  13057. RET
  13058. match_nolit_dst_ok_encodeSnappyBlockAsm8B:
  13059. MOVQ $0x9e3779b1, R8
  13060. MOVQ SI, DI
  13061. SHRQ $0x10, SI
  13062. MOVQ SI, BX
  13063. SHLQ $0x20, DI
  13064. IMULQ R8, DI
  13065. SHRQ $0x38, DI
  13066. SHLQ $0x20, BX
  13067. IMULQ R8, BX
  13068. SHRQ $0x38, BX
  13069. LEAL -2(CX), R8
  13070. LEAQ 24(SP)(BX*4), R9
  13071. MOVL (R9), BX
  13072. MOVL R8, 24(SP)(DI*4)
  13073. MOVL CX, (R9)
  13074. CMPL (DX)(BX*1), SI
  13075. JEQ match_nolit_loop_encodeSnappyBlockAsm8B
  13076. INCL CX
  13077. JMP search_loop_encodeSnappyBlockAsm8B
  13078. emit_remainder_encodeSnappyBlockAsm8B:
  13079. MOVQ src_len+32(FP), CX
  13080. SUBL 12(SP), CX
  13081. LEAQ 3(AX)(CX*1), CX
  13082. CMPQ CX, (SP)
  13083. JB emit_remainder_ok_encodeSnappyBlockAsm8B
  13084. MOVQ $0x00000000, ret+48(FP)
  13085. RET
  13086. emit_remainder_ok_encodeSnappyBlockAsm8B:
  13087. MOVQ src_len+32(FP), CX
  13088. MOVL 12(SP), BX
  13089. CMPL BX, CX
  13090. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
  13091. MOVL CX, SI
  13092. MOVL CX, 12(SP)
  13093. LEAQ (DX)(BX*1), CX
  13094. SUBL BX, SI
  13095. LEAL -1(SI), DX
  13096. CMPL DX, $0x3c
  13097. JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
  13098. CMPL DX, $0x00000100
  13099. JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
  13100. JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B
  13101. three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
  13102. MOVB $0xf4, (AX)
  13103. MOVW DX, 1(AX)
  13104. ADDQ $0x03, AX
  13105. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
  13106. two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
  13107. MOVB $0xf0, (AX)
  13108. MOVB DL, 1(AX)
  13109. ADDQ $0x02, AX
  13110. CMPL DX, $0x40
  13111. JB memmove_emit_remainder_encodeSnappyBlockAsm8B
  13112. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
  13113. one_byte_emit_remainder_encodeSnappyBlockAsm8B:
  13114. SHLB $0x02, DL
  13115. MOVB DL, (AX)
  13116. ADDQ $0x01, AX
  13117. memmove_emit_remainder_encodeSnappyBlockAsm8B:
  13118. LEAQ (AX)(SI*1), DX
  13119. MOVL SI, BX
  13120. // genMemMoveShort
  13121. CMPQ BX, $0x03
  13122. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
  13123. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
  13124. CMPQ BX, $0x08
  13125. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
  13126. CMPQ BX, $0x10
  13127. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
  13128. CMPQ BX, $0x20
  13129. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
  13130. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
  13131. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
  13132. MOVB (CX), SI
  13133. MOVB -1(CX)(BX*1), CL
  13134. MOVB SI, (AX)
  13135. MOVB CL, -1(AX)(BX*1)
  13136. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  13137. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
  13138. MOVW (CX), SI
  13139. MOVB 2(CX), CL
  13140. MOVW SI, (AX)
  13141. MOVB CL, 2(AX)
  13142. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  13143. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
  13144. MOVL (CX), SI
  13145. MOVL -4(CX)(BX*1), CX
  13146. MOVL SI, (AX)
  13147. MOVL CX, -4(AX)(BX*1)
  13148. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  13149. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
  13150. MOVQ (CX), SI
  13151. MOVQ -8(CX)(BX*1), CX
  13152. MOVQ SI, (AX)
  13153. MOVQ CX, -8(AX)(BX*1)
  13154. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  13155. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
  13156. MOVOU (CX), X0
  13157. MOVOU -16(CX)(BX*1), X1
  13158. MOVOU X0, (AX)
  13159. MOVOU X1, -16(AX)(BX*1)
  13160. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  13161. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
  13162. MOVOU (CX), X0
  13163. MOVOU 16(CX), X1
  13164. MOVOU -32(CX)(BX*1), X2
  13165. MOVOU -16(CX)(BX*1), X3
  13166. MOVOU X0, (AX)
  13167. MOVOU X1, 16(AX)
  13168. MOVOU X2, -32(AX)(BX*1)
  13169. MOVOU X3, -16(AX)(BX*1)
  13170. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
  13171. MOVQ DX, AX
  13172. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
  13173. memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
  13174. LEAQ (AX)(SI*1), DX
  13175. MOVL SI, BX
  13176. // genMemMoveLong
  13177. MOVOU (CX), X0
  13178. MOVOU 16(CX), X1
  13179. MOVOU -32(CX)(BX*1), X2
  13180. MOVOU -16(CX)(BX*1), X3
  13181. MOVQ BX, DI
  13182. SHRQ $0x05, DI
  13183. MOVQ AX, SI
  13184. ANDL $0x0000001f, SI
  13185. MOVQ $0x00000040, R8
  13186. SUBQ SI, R8
  13187. DECQ DI
  13188. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  13189. LEAQ -32(CX)(R8*1), SI
  13190. LEAQ -32(AX)(R8*1), R9
  13191. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
  13192. MOVOU (SI), X4
  13193. MOVOU 16(SI), X5
  13194. MOVOA X4, (R9)
  13195. MOVOA X5, 16(R9)
  13196. ADDQ $0x20, R9
  13197. ADDQ $0x20, SI
  13198. ADDQ $0x20, R8
  13199. DECQ DI
  13200. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
  13201. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  13202. MOVOU -32(CX)(R8*1), X4
  13203. MOVOU -16(CX)(R8*1), X5
  13204. MOVOA X4, -32(AX)(R8*1)
  13205. MOVOA X5, -16(AX)(R8*1)
  13206. ADDQ $0x20, R8
  13207. CMPQ BX, R8
  13208. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  13209. MOVOU X0, (AX)
  13210. MOVOU X1, 16(AX)
  13211. MOVOU X2, -32(AX)(BX*1)
  13212. MOVOU X3, -16(AX)(BX*1)
  13213. MOVQ DX, AX
  13214. emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
  13215. MOVQ dst_base+0(FP), CX
  13216. SUBQ CX, AX
  13217. MOVQ AX, ret+48(FP)
  13218. RET
  13219. // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
  13220. // Requires: BMI, SSE2
  13221. TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
  13222. MOVQ dst_base+0(FP), AX
  13223. MOVQ $0x00001200, CX
  13224. LEAQ 24(SP), DX
  13225. PXOR X0, X0
  13226. zero_loop_encodeSnappyBetterBlockAsm:
  13227. MOVOU X0, (DX)
  13228. MOVOU X0, 16(DX)
  13229. MOVOU X0, 32(DX)
  13230. MOVOU X0, 48(DX)
  13231. MOVOU X0, 64(DX)
  13232. MOVOU X0, 80(DX)
  13233. MOVOU X0, 96(DX)
  13234. MOVOU X0, 112(DX)
  13235. ADDQ $0x80, DX
  13236. DECQ CX
  13237. JNZ zero_loop_encodeSnappyBetterBlockAsm
  13238. MOVL $0x00000000, 12(SP)
  13239. MOVQ src_len+32(FP), CX
  13240. LEAQ -9(CX), DX
  13241. LEAQ -8(CX), BX
  13242. MOVL BX, 8(SP)
  13243. SHRQ $0x05, CX
  13244. SUBL CX, DX
  13245. LEAQ (AX)(DX*1), DX
  13246. MOVQ DX, (SP)
  13247. MOVL $0x00000001, CX
  13248. MOVL $0x00000000, 16(SP)
  13249. MOVQ src_base+24(FP), DX
  13250. search_loop_encodeSnappyBetterBlockAsm:
  13251. MOVL CX, BX
  13252. SUBL 12(SP), BX
  13253. SHRL $0x07, BX
  13254. CMPL BX, $0x63
  13255. JBE check_maxskip_ok_encodeSnappyBetterBlockAsm
  13256. LEAL 100(CX), BX
  13257. JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
  13258. check_maxskip_ok_encodeSnappyBetterBlockAsm:
  13259. LEAL 1(CX)(BX*1), BX
  13260. check_maxskip_cont_encodeSnappyBetterBlockAsm:
  13261. CMPL BX, 8(SP)
  13262. JAE emit_remainder_encodeSnappyBetterBlockAsm
  13263. MOVQ (DX)(CX*1), SI
  13264. MOVL BX, 20(SP)
  13265. MOVQ $0x00cf1bbcdcbfa563, R8
  13266. MOVQ $0x9e3779b1, BX
  13267. MOVQ SI, R9
  13268. MOVQ SI, R10
  13269. SHLQ $0x08, R9
  13270. IMULQ R8, R9
  13271. SHRQ $0x2f, R9
  13272. SHLQ $0x20, R10
  13273. IMULQ BX, R10
  13274. SHRQ $0x32, R10
  13275. MOVL 24(SP)(R9*4), BX
  13276. MOVL 524312(SP)(R10*4), DI
  13277. MOVL CX, 24(SP)(R9*4)
  13278. MOVL CX, 524312(SP)(R10*4)
  13279. MOVQ (DX)(BX*1), R9
  13280. MOVQ (DX)(DI*1), R10
  13281. CMPQ R9, SI
  13282. JEQ candidate_match_encodeSnappyBetterBlockAsm
  13283. CMPQ R10, SI
  13284. JNE no_short_found_encodeSnappyBetterBlockAsm
  13285. MOVL DI, BX
  13286. JMP candidate_match_encodeSnappyBetterBlockAsm
  13287. no_short_found_encodeSnappyBetterBlockAsm:
  13288. CMPL R9, SI
  13289. JEQ candidate_match_encodeSnappyBetterBlockAsm
  13290. CMPL R10, SI
  13291. JEQ candidateS_match_encodeSnappyBetterBlockAsm
  13292. MOVL 20(SP), CX
  13293. JMP search_loop_encodeSnappyBetterBlockAsm
  13294. candidateS_match_encodeSnappyBetterBlockAsm:
  13295. SHRQ $0x08, SI
  13296. MOVQ SI, R9
  13297. SHLQ $0x08, R9
  13298. IMULQ R8, R9
  13299. SHRQ $0x2f, R9
  13300. MOVL 24(SP)(R9*4), BX
  13301. INCL CX
  13302. MOVL CX, 24(SP)(R9*4)
  13303. CMPL (DX)(BX*1), SI
  13304. JEQ candidate_match_encodeSnappyBetterBlockAsm
  13305. DECL CX
  13306. MOVL DI, BX
  13307. candidate_match_encodeSnappyBetterBlockAsm:
  13308. MOVL 12(SP), SI
  13309. TESTL BX, BX
  13310. JZ match_extend_back_end_encodeSnappyBetterBlockAsm
  13311. match_extend_back_loop_encodeSnappyBetterBlockAsm:
  13312. CMPL CX, SI
  13313. JBE match_extend_back_end_encodeSnappyBetterBlockAsm
  13314. MOVB -1(DX)(BX*1), DI
  13315. MOVB -1(DX)(CX*1), R8
  13316. CMPB DI, R8
  13317. JNE match_extend_back_end_encodeSnappyBetterBlockAsm
  13318. LEAL -1(CX), CX
  13319. DECL BX
  13320. JZ match_extend_back_end_encodeSnappyBetterBlockAsm
  13321. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
  13322. match_extend_back_end_encodeSnappyBetterBlockAsm:
  13323. MOVL CX, SI
  13324. SUBL 12(SP), SI
  13325. LEAQ 5(AX)(SI*1), SI
  13326. CMPQ SI, (SP)
  13327. JB match_dst_size_check_encodeSnappyBetterBlockAsm
  13328. MOVQ $0x00000000, ret+48(FP)
  13329. RET
  13330. match_dst_size_check_encodeSnappyBetterBlockAsm:
  13331. MOVL CX, SI
  13332. ADDL $0x04, CX
  13333. ADDL $0x04, BX
  13334. MOVQ src_len+32(FP), DI
  13335. SUBL CX, DI
  13336. LEAQ (DX)(CX*1), R8
  13337. LEAQ (DX)(BX*1), R9
  13338. // matchLen
  13339. XORL R11, R11
  13340. matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
  13341. CMPL DI, $0x10
  13342. JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
  13343. MOVQ (R8)(R11*1), R10
  13344. MOVQ 8(R8)(R11*1), R12
  13345. XORQ (R9)(R11*1), R10
  13346. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
  13347. XORQ 8(R9)(R11*1), R12
  13348. JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
  13349. LEAL -16(DI), DI
  13350. LEAL 16(R11), R11
  13351. JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm
  13352. matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
  13353. #ifdef GOAMD64_v3
  13354. TZCNTQ R12, R12
  13355. #else
  13356. BSFQ R12, R12
  13357. #endif
  13358. SARQ $0x03, R12
  13359. LEAL 8(R11)(R12*1), R11
  13360. JMP match_nolit_end_encodeSnappyBetterBlockAsm
  13361. matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
  13362. CMPL DI, $0x08
  13363. JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
  13364. MOVQ (R8)(R11*1), R10
  13365. XORQ (R9)(R11*1), R10
  13366. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
  13367. LEAL -8(DI), DI
  13368. LEAL 8(R11), R11
  13369. JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
  13370. matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
  13371. #ifdef GOAMD64_v3
  13372. TZCNTQ R10, R10
  13373. #else
  13374. BSFQ R10, R10
  13375. #endif
  13376. SARQ $0x03, R10
  13377. LEAL (R11)(R10*1), R11
  13378. JMP match_nolit_end_encodeSnappyBetterBlockAsm
  13379. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
  13380. CMPL DI, $0x04
  13381. JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
  13382. MOVL (R8)(R11*1), R10
  13383. CMPL (R9)(R11*1), R10
  13384. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
  13385. LEAL -4(DI), DI
  13386. LEAL 4(R11), R11
  13387. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
  13388. CMPL DI, $0x01
  13389. JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
  13390. JB match_nolit_end_encodeSnappyBetterBlockAsm
  13391. MOVW (R8)(R11*1), R10
  13392. CMPW (R9)(R11*1), R10
  13393. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
  13394. LEAL 2(R11), R11
  13395. SUBL $0x02, DI
  13396. JZ match_nolit_end_encodeSnappyBetterBlockAsm
  13397. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
  13398. MOVB (R8)(R11*1), R10
  13399. CMPB (R9)(R11*1), R10
  13400. JNE match_nolit_end_encodeSnappyBetterBlockAsm
  13401. LEAL 1(R11), R11
  13402. match_nolit_end_encodeSnappyBetterBlockAsm:
  13403. MOVL CX, DI
  13404. SUBL BX, DI
  13405. // Check if repeat
  13406. CMPL R11, $0x01
  13407. JA match_length_ok_encodeSnappyBetterBlockAsm
  13408. CMPL DI, $0x0000ffff
  13409. JBE match_length_ok_encodeSnappyBetterBlockAsm
  13410. MOVL 20(SP), CX
  13411. INCL CX
  13412. JMP search_loop_encodeSnappyBetterBlockAsm
  13413. match_length_ok_encodeSnappyBetterBlockAsm:
  13414. MOVL DI, 16(SP)
  13415. MOVL 12(SP), BX
  13416. CMPL BX, SI
  13417. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
  13418. MOVL SI, R8
  13419. MOVL SI, 12(SP)
  13420. LEAQ (DX)(BX*1), R9
  13421. SUBL BX, R8
  13422. LEAL -1(R8), BX
  13423. CMPL BX, $0x3c
  13424. JB one_byte_match_emit_encodeSnappyBetterBlockAsm
  13425. CMPL BX, $0x00000100
  13426. JB two_bytes_match_emit_encodeSnappyBetterBlockAsm
  13427. CMPL BX, $0x00010000
  13428. JB three_bytes_match_emit_encodeSnappyBetterBlockAsm
  13429. CMPL BX, $0x01000000
  13430. JB four_bytes_match_emit_encodeSnappyBetterBlockAsm
  13431. MOVB $0xfc, (AX)
  13432. MOVL BX, 1(AX)
  13433. ADDQ $0x05, AX
  13434. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  13435. four_bytes_match_emit_encodeSnappyBetterBlockAsm:
  13436. MOVL BX, R10
  13437. SHRL $0x10, R10
  13438. MOVB $0xf8, (AX)
  13439. MOVW BX, 1(AX)
  13440. MOVB R10, 3(AX)
  13441. ADDQ $0x04, AX
  13442. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  13443. three_bytes_match_emit_encodeSnappyBetterBlockAsm:
  13444. MOVB $0xf4, (AX)
  13445. MOVW BX, 1(AX)
  13446. ADDQ $0x03, AX
  13447. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  13448. two_bytes_match_emit_encodeSnappyBetterBlockAsm:
  13449. MOVB $0xf0, (AX)
  13450. MOVB BL, 1(AX)
  13451. ADDQ $0x02, AX
  13452. CMPL BX, $0x40
  13453. JB memmove_match_emit_encodeSnappyBetterBlockAsm
  13454. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  13455. one_byte_match_emit_encodeSnappyBetterBlockAsm:
  13456. SHLB $0x02, BL
  13457. MOVB BL, (AX)
  13458. ADDQ $0x01, AX
  13459. memmove_match_emit_encodeSnappyBetterBlockAsm:
  13460. LEAQ (AX)(R8*1), BX
  13461. // genMemMoveShort
  13462. CMPQ R8, $0x08
  13463. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
  13464. CMPQ R8, $0x10
  13465. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
  13466. CMPQ R8, $0x20
  13467. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
  13468. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
  13469. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
  13470. MOVQ (R9), R10
  13471. MOVQ R10, (AX)
  13472. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  13473. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
  13474. MOVQ (R9), R10
  13475. MOVQ -8(R9)(R8*1), R9
  13476. MOVQ R10, (AX)
  13477. MOVQ R9, -8(AX)(R8*1)
  13478. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  13479. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
  13480. MOVOU (R9), X0
  13481. MOVOU -16(R9)(R8*1), X1
  13482. MOVOU X0, (AX)
  13483. MOVOU X1, -16(AX)(R8*1)
  13484. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  13485. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
  13486. MOVOU (R9), X0
  13487. MOVOU 16(R9), X1
  13488. MOVOU -32(R9)(R8*1), X2
  13489. MOVOU -16(R9)(R8*1), X3
  13490. MOVOU X0, (AX)
  13491. MOVOU X1, 16(AX)
  13492. MOVOU X2, -32(AX)(R8*1)
  13493. MOVOU X3, -16(AX)(R8*1)
  13494. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
  13495. MOVQ BX, AX
  13496. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
  13497. memmove_long_match_emit_encodeSnappyBetterBlockAsm:
  13498. LEAQ (AX)(R8*1), BX
  13499. // genMemMoveLong
  13500. MOVOU (R9), X0
  13501. MOVOU 16(R9), X1
  13502. MOVOU -32(R9)(R8*1), X2
  13503. MOVOU -16(R9)(R8*1), X3
  13504. MOVQ R8, R12
  13505. SHRQ $0x05, R12
  13506. MOVQ AX, R10
  13507. ANDL $0x0000001f, R10
  13508. MOVQ $0x00000040, R13
  13509. SUBQ R10, R13
  13510. DECQ R12
  13511. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13512. LEAQ -32(R9)(R13*1), R10
  13513. LEAQ -32(AX)(R13*1), R14
  13514. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
  13515. MOVOU (R10), X4
  13516. MOVOU 16(R10), X5
  13517. MOVOA X4, (R14)
  13518. MOVOA X5, 16(R14)
  13519. ADDQ $0x20, R14
  13520. ADDQ $0x20, R10
  13521. ADDQ $0x20, R13
  13522. DECQ R12
  13523. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
  13524. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
  13525. MOVOU -32(R9)(R13*1), X4
  13526. MOVOU -16(R9)(R13*1), X5
  13527. MOVOA X4, -32(AX)(R13*1)
  13528. MOVOA X5, -16(AX)(R13*1)
  13529. ADDQ $0x20, R13
  13530. CMPQ R8, R13
  13531. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13532. MOVOU X0, (AX)
  13533. MOVOU X1, 16(AX)
  13534. MOVOU X2, -32(AX)(R8*1)
  13535. MOVOU X3, -16(AX)(R8*1)
  13536. MOVQ BX, AX
  13537. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
  13538. ADDL R11, CX
  13539. ADDL $0x04, R11
  13540. MOVL CX, 12(SP)
  13541. // emitCopy
  13542. CMPL DI, $0x00010000
  13543. JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
  13544. four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
  13545. CMPL R11, $0x40
  13546. JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
  13547. MOVB $0xff, (AX)
  13548. MOVL DI, 1(AX)
  13549. LEAL -64(R11), R11
  13550. ADDQ $0x05, AX
  13551. CMPL R11, $0x04
  13552. JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
  13553. JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
  13554. four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
  13555. TESTL R11, R11
  13556. JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13557. XORL BX, BX
  13558. LEAL -1(BX)(R11*4), R11
  13559. MOVB R11, (AX)
  13560. MOVL DI, 1(AX)
  13561. ADDQ $0x05, AX
  13562. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13563. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
  13564. CMPL R11, $0x40
  13565. JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
  13566. MOVB $0xee, (AX)
  13567. MOVW DI, 1(AX)
  13568. LEAL -60(R11), R11
  13569. ADDQ $0x03, AX
  13570. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
  13571. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
  13572. MOVL R11, BX
  13573. SHLL $0x02, BX
  13574. CMPL R11, $0x0c
  13575. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
  13576. CMPL DI, $0x00000800
  13577. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
  13578. LEAL -15(BX), BX
  13579. MOVB DI, 1(AX)
  13580. SHRL $0x08, DI
  13581. SHLL $0x05, DI
  13582. ORL DI, BX
  13583. MOVB BL, (AX)
  13584. ADDQ $0x02, AX
  13585. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13586. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
  13587. LEAL -2(BX), BX
  13588. MOVB BL, (AX)
  13589. MOVW DI, 1(AX)
  13590. ADDQ $0x03, AX
  13591. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
  13592. CMPL CX, 8(SP)
  13593. JAE emit_remainder_encodeSnappyBetterBlockAsm
  13594. CMPQ AX, (SP)
  13595. JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm
  13596. MOVQ $0x00000000, ret+48(FP)
  13597. RET
  13598. match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
  13599. MOVQ $0x00cf1bbcdcbfa563, BX
  13600. MOVQ $0x9e3779b1, DI
  13601. LEAQ 1(SI), SI
  13602. LEAQ -2(CX), R8
  13603. MOVQ (DX)(SI*1), R9
  13604. MOVQ 1(DX)(SI*1), R10
  13605. MOVQ (DX)(R8*1), R11
  13606. MOVQ 1(DX)(R8*1), R12
  13607. SHLQ $0x08, R9
  13608. IMULQ BX, R9
  13609. SHRQ $0x2f, R9
  13610. SHLQ $0x20, R10
  13611. IMULQ DI, R10
  13612. SHRQ $0x32, R10
  13613. SHLQ $0x08, R11
  13614. IMULQ BX, R11
  13615. SHRQ $0x2f, R11
  13616. SHLQ $0x20, R12
  13617. IMULQ DI, R12
  13618. SHRQ $0x32, R12
  13619. LEAQ 1(SI), DI
  13620. LEAQ 1(R8), R13
  13621. MOVL SI, 24(SP)(R9*4)
  13622. MOVL R8, 24(SP)(R11*4)
  13623. MOVL DI, 524312(SP)(R10*4)
  13624. MOVL R13, 524312(SP)(R12*4)
  13625. LEAQ 1(R8)(SI*1), DI
  13626. SHRQ $0x01, DI
  13627. ADDQ $0x01, SI
  13628. SUBQ $0x01, R8
  13629. index_loop_encodeSnappyBetterBlockAsm:
  13630. CMPQ DI, R8
  13631. JAE search_loop_encodeSnappyBetterBlockAsm
  13632. MOVQ (DX)(SI*1), R9
  13633. MOVQ (DX)(DI*1), R10
  13634. SHLQ $0x08, R9
  13635. IMULQ BX, R9
  13636. SHRQ $0x2f, R9
  13637. SHLQ $0x08, R10
  13638. IMULQ BX, R10
  13639. SHRQ $0x2f, R10
  13640. MOVL SI, 24(SP)(R9*4)
  13641. MOVL DI, 24(SP)(R10*4)
  13642. ADDQ $0x02, SI
  13643. ADDQ $0x02, DI
  13644. JMP index_loop_encodeSnappyBetterBlockAsm
  13645. emit_remainder_encodeSnappyBetterBlockAsm:
  13646. MOVQ src_len+32(FP), CX
  13647. SUBL 12(SP), CX
  13648. LEAQ 5(AX)(CX*1), CX
  13649. CMPQ CX, (SP)
  13650. JB emit_remainder_ok_encodeSnappyBetterBlockAsm
  13651. MOVQ $0x00000000, ret+48(FP)
  13652. RET
  13653. emit_remainder_ok_encodeSnappyBetterBlockAsm:
  13654. MOVQ src_len+32(FP), CX
  13655. MOVL 12(SP), BX
  13656. CMPL BX, CX
  13657. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
  13658. MOVL CX, SI
  13659. MOVL CX, 12(SP)
  13660. LEAQ (DX)(BX*1), CX
  13661. SUBL BX, SI
  13662. LEAL -1(SI), DX
  13663. CMPL DX, $0x3c
  13664. JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm
  13665. CMPL DX, $0x00000100
  13666. JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13667. CMPL DX, $0x00010000
  13668. JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13669. CMPL DX, $0x01000000
  13670. JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13671. MOVB $0xfc, (AX)
  13672. MOVL DX, 1(AX)
  13673. ADDQ $0x05, AX
  13674. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13675. four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13676. MOVL DX, BX
  13677. SHRL $0x10, BX
  13678. MOVB $0xf8, (AX)
  13679. MOVW DX, 1(AX)
  13680. MOVB BL, 3(AX)
  13681. ADDQ $0x04, AX
  13682. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13683. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13684. MOVB $0xf4, (AX)
  13685. MOVW DX, 1(AX)
  13686. ADDQ $0x03, AX
  13687. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13688. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13689. MOVB $0xf0, (AX)
  13690. MOVB DL, 1(AX)
  13691. ADDQ $0x02, AX
  13692. CMPL DX, $0x40
  13693. JB memmove_emit_remainder_encodeSnappyBetterBlockAsm
  13694. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13695. one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
  13696. SHLB $0x02, DL
  13697. MOVB DL, (AX)
  13698. ADDQ $0x01, AX
  13699. memmove_emit_remainder_encodeSnappyBetterBlockAsm:
  13700. LEAQ (AX)(SI*1), DX
  13701. MOVL SI, BX
  13702. // genMemMoveShort
  13703. CMPQ BX, $0x03
  13704. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
  13705. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
  13706. CMPQ BX, $0x08
  13707. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
  13708. CMPQ BX, $0x10
  13709. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
  13710. CMPQ BX, $0x20
  13711. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
  13712. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
  13713. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
  13714. MOVB (CX), SI
  13715. MOVB -1(CX)(BX*1), CL
  13716. MOVB SI, (AX)
  13717. MOVB CL, -1(AX)(BX*1)
  13718. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13719. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
  13720. MOVW (CX), SI
  13721. MOVB 2(CX), CL
  13722. MOVW SI, (AX)
  13723. MOVB CL, 2(AX)
  13724. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13725. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
  13726. MOVL (CX), SI
  13727. MOVL -4(CX)(BX*1), CX
  13728. MOVL SI, (AX)
  13729. MOVL CX, -4(AX)(BX*1)
  13730. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13731. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
  13732. MOVQ (CX), SI
  13733. MOVQ -8(CX)(BX*1), CX
  13734. MOVQ SI, (AX)
  13735. MOVQ CX, -8(AX)(BX*1)
  13736. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13737. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
  13738. MOVOU (CX), X0
  13739. MOVOU -16(CX)(BX*1), X1
  13740. MOVOU X0, (AX)
  13741. MOVOU X1, -16(AX)(BX*1)
  13742. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13743. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
  13744. MOVOU (CX), X0
  13745. MOVOU 16(CX), X1
  13746. MOVOU -32(CX)(BX*1), X2
  13747. MOVOU -16(CX)(BX*1), X3
  13748. MOVOU X0, (AX)
  13749. MOVOU X1, 16(AX)
  13750. MOVOU X2, -32(AX)(BX*1)
  13751. MOVOU X3, -16(AX)(BX*1)
  13752. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
  13753. MOVQ DX, AX
  13754. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
  13755. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
  13756. LEAQ (AX)(SI*1), DX
  13757. MOVL SI, BX
  13758. // genMemMoveLong
  13759. MOVOU (CX), X0
  13760. MOVOU 16(CX), X1
  13761. MOVOU -32(CX)(BX*1), X2
  13762. MOVOU -16(CX)(BX*1), X3
  13763. MOVQ BX, DI
  13764. SHRQ $0x05, DI
  13765. MOVQ AX, SI
  13766. ANDL $0x0000001f, SI
  13767. MOVQ $0x00000040, R8
  13768. SUBQ SI, R8
  13769. DECQ DI
  13770. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13771. LEAQ -32(CX)(R8*1), SI
  13772. LEAQ -32(AX)(R8*1), R9
  13773. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
  13774. MOVOU (SI), X4
  13775. MOVOU 16(SI), X5
  13776. MOVOA X4, (R9)
  13777. MOVOA X5, 16(R9)
  13778. ADDQ $0x20, R9
  13779. ADDQ $0x20, SI
  13780. ADDQ $0x20, R8
  13781. DECQ DI
  13782. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
  13783. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
  13784. MOVOU -32(CX)(R8*1), X4
  13785. MOVOU -16(CX)(R8*1), X5
  13786. MOVOA X4, -32(AX)(R8*1)
  13787. MOVOA X5, -16(AX)(R8*1)
  13788. ADDQ $0x20, R8
  13789. CMPQ BX, R8
  13790. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13791. MOVOU X0, (AX)
  13792. MOVOU X1, 16(AX)
  13793. MOVOU X2, -32(AX)(BX*1)
  13794. MOVOU X3, -16(AX)(BX*1)
  13795. MOVQ DX, AX
  13796. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
  13797. MOVQ dst_base+0(FP), CX
  13798. SUBQ CX, AX
  13799. MOVQ AX, ret+48(FP)
  13800. RET
  13801. // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
  13802. // Requires: BMI, SSE2
  //
  // Encodes src into dst and returns the number of bytes written
  // (final write pointer minus dst_base). Returns 0 whenever the output
  // would reach the limit computed at entry, dst_base + src_len - 9 - (src_len >> 5).
  // The literal tags (len-1 << 2, 0xf0, 0xf4) and copy tags (low bits 1 and 2)
  // written below look like Snappy block elements - confirm against the Go stub.
  // Copy offsets are always stored as 16 bits here, so callers presumably keep
  // src within 64 KiB - confirm against the caller.
  //
  // Stack frame (327704 bytes):
  //   0(SP)       output limit pointer
  //   8(SP)       src_len - 8, bound for the search position
  //   12(SP)      src index of the first byte that has not been emitted yet
  //   16(SP)      offset of the last match (written, never read in this function)
  //   20(SP)      position to continue from when the current one yields no match
  //   24(SP)      1<<16 uint32 src positions, indexed by the 16-bit hash
  //   262168(SP)  1<<14 uint32 src positions, indexed by the 14-bit hash
  //
  // Main loop registers: AX = dst write pointer, DX = src base, CX = src index.
  13803. TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
  13804. MOVQ dst_base+0(FP), AX
  // Zero both tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
  13805. MOVQ $0x00000a00, CX
  13806. LEAQ 24(SP), DX
  13807. PXOR X0, X0
  13808. zero_loop_encodeSnappyBetterBlockAsm64K:
  13809. MOVOU X0, (DX)
  13810. MOVOU X0, 16(DX)
  13811. MOVOU X0, 32(DX)
  13812. MOVOU X0, 48(DX)
  13813. MOVOU X0, 64(DX)
  13814. MOVOU X0, 80(DX)
  13815. MOVOU X0, 96(DX)
  13816. MOVOU X0, 112(DX)
  13817. ADDQ $0x80, DX
  13818. DECQ CX
  13819. JNZ zero_loop_encodeSnappyBetterBlockAsm64K
  // Nothing emitted yet.
  13820. MOVL $0x00000000, 12(SP)
  // 8(SP) = src_len - 8, 0(SP) = dst + (src_len - 9 - (src_len >> 5)).
  13821. MOVQ src_len+32(FP), CX
  13822. LEAQ -9(CX), DX
  13823. LEAQ -8(CX), BX
  13824. MOVL BX, 8(SP)
  13825. SHRQ $0x05, CX
  13826. SUBL CX, DX
  13827. LEAQ (AX)(DX*1), DX
  13828. MOVQ DX, (SP)
  // Searching starts at src index 1. From here on DX is the src base pointer.
  13829. MOVL $0x00000001, CX
  13830. MOVL $0x00000000, 16(SP)
  13831. MOVQ src_base+24(FP), DX
  13832. search_loop_encodeSnappyBetterBlockAsm64K:
  // BX = CX + 1 + ((CX - 12(SP)) >> 7) is the position to try after a miss.
  // The step grows with the distance since the last emitted byte.
  // Stop searching once that position reaches src_len - 8.
  13833. MOVL CX, BX
  13834. SUBL 12(SP), BX
  13835. SHRL $0x07, BX
  13836. LEAL 1(CX)(BX*1), BX
  13837. CMPL BX, 8(SP)
  13838. JAE emit_remainder_encodeSnappyBetterBlockAsm64K
  // SI = 8 bytes at src[CX]. Remember the fallback position in 20(SP).
  13839. MOVQ (DX)(CX*1), SI
  13840. MOVL BX, 20(SP)
  // R9  = ((SI << 8)  * 0x00cf1bbcdcbfa563) >> 48 : 16-bit hash of the low 7 bytes.
  // R10 = ((SI << 32) * 0x9e3779b1) >> 50         : 14-bit hash of the low 4 bytes.
  13841. MOVQ $0x00cf1bbcdcbfa563, R8
  13842. MOVQ $0x9e3779b1, BX
  13843. MOVQ SI, R9
  13844. MOVQ SI, R10
  13845. SHLQ $0x08, R9
  13846. IMULQ R8, R9
  13847. SHRQ $0x30, R9
  13848. SHLQ $0x20, R10
  13849. IMULQ BX, R10
  13850. SHRQ $0x32, R10
  // Fetch the positions previously stored under both hashes (BX from the
  // 16-bit table, DI from the 14-bit table), then store CX in both slots.
  13851. MOVL 24(SP)(R9*4), BX
  13852. MOVL 262168(SP)(R10*4), DI
  13853. MOVL CX, 24(SP)(R9*4)
  13854. MOVL CX, 262168(SP)(R10*4)
  // Compare all 8 bytes at each candidate with SI, BX candidate first.
  13855. MOVQ (DX)(BX*1), R9
  13856. MOVQ (DX)(DI*1), R10
  13857. CMPQ R9, SI
  13858. JEQ candidate_match_encodeSnappyBetterBlockAsm64K
  13859. CMPQ R10, SI
  13860. JNE no_short_found_encodeSnappyBetterBlockAsm64K
  13861. MOVL DI, BX
  13862. JMP candidate_match_encodeSnappyBetterBlockAsm64K
  13863. no_short_found_encodeSnappyBetterBlockAsm64K:
  // No 8-byte match: accept a match of the low 4 bytes only.
  13864. CMPL R9, SI
  13865. JEQ candidate_match_encodeSnappyBetterBlockAsm64K
  13866. CMPL R10, SI
  13867. JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
  // Miss: continue from the fallback position saved in 20(SP).
  13868. MOVL 20(SP), CX
  13869. JMP search_loop_encodeSnappyBetterBlockAsm64K
  13870. candidateS_match_encodeSnappyBetterBlockAsm64K:
  // Only the 14-bit-table candidate (DI) matched 4 bytes. Before using it, try
  // position CX+1: SI >>= 8 leaves the bytes from src[CX+1], which are hashed
  // into the 16-bit table. The old entry is read into BX and CX+1 is stored.
  13871. SHRQ $0x08, SI
  13872. MOVQ SI, R9
  13873. SHLQ $0x08, R9
  13874. IMULQ R8, R9
  13875. SHRQ $0x30, R9
  13876. MOVL 24(SP)(R9*4), BX
  13877. INCL CX
  13878. MOVL CX, 24(SP)(R9*4)
  // If 4 bytes at that entry equal the low 4 bytes of SI, keep CX+1 and BX.
  // Otherwise undo the increment and use the DI candidate.
  13879. CMPL (DX)(BX*1), SI
  13880. JEQ candidate_match_encodeSnappyBetterBlockAsm64K
  13881. DECL CX
  13882. MOVL DI, BX
  13883. candidate_match_encodeSnappyBetterBlockAsm64K:
  // Extend the match backwards while CX > 12(SP), BX > 0 and the bytes just
  // before both positions are equal.
  13884. MOVL 12(SP), SI
  13885. TESTL BX, BX
  13886. JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13887. match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
  13888. CMPL CX, SI
  13889. JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13890. MOVB -1(DX)(BX*1), DI
  13891. MOVB -1(DX)(CX*1), R8
  13892. CMPB DI, R8
  13893. JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13894. LEAL -1(CX), CX
  13895. DECL BX
  13896. JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13897. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
  13898. match_extend_back_end_encodeSnappyBetterBlockAsm64K:
  // Return 0 if dst + 3 + (CX - 12(SP)) reaches the output limit in 0(SP).
  13899. MOVL CX, SI
  13900. SUBL 12(SP), SI
  13901. LEAQ 3(AX)(SI*1), SI
  13902. CMPQ SI, (SP)
  13903. JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
  13904. MOVQ $0x00000000, ret+48(FP)
  13905. RET
  13906. match_dst_size_check_encodeSnappyBetterBlockAsm64K:
  // SI = match start. Step CX and BX 4 bytes forward and count how many more
  // bytes agree: DI = src_len - CX bytes are left, R8 and R9 point at the two
  // positions, R11 is the running count.
  13907. MOVL CX, SI
  13908. ADDL $0x04, CX
  13909. ADDL $0x04, BX
  13910. MOVQ src_len+32(FP), DI
  13911. SUBL CX, DI
  13912. LEAQ (DX)(CX*1), R8
  13913. LEAQ (DX)(BX*1), R9
  13914. // matchLen
  13915. XORL R11, R11
  13916. matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
  // 16 bytes per iteration while at least 16 remain. A non-zero XOR marks the
  // 8-byte word that contains the first difference.
  13917. CMPL DI, $0x10
  13918. JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
  13919. MOVQ (R8)(R11*1), R10
  13920. MOVQ 8(R8)(R11*1), R12
  13921. XORQ (R9)(R11*1), R10
  13922. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
  13923. XORQ 8(R9)(R11*1), R12
  13924. JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
  13925. LEAL -16(DI), DI
  13926. LEAL 16(R11), R11
  13927. JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K
  13928. matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
  // Difference in the second word: trailing zero bits / 8 = equal bytes in that
  // word, plus the 8 bytes of the first word.
  13929. #ifdef GOAMD64_v3
  13930. TZCNTQ R12, R12
  13931. #else
  13932. BSFQ R12, R12
  13933. #endif
  13934. SARQ $0x03, R12
  13935. LEAL 8(R11)(R12*1), R11
  13936. JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
  13937. matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
  // Fewer than 16 bytes left: one 8-byte step if possible.
  13938. CMPL DI, $0x08
  13939. JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
  13940. MOVQ (R8)(R11*1), R10
  13941. XORQ (R9)(R11*1), R10
  13942. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
  13943. LEAL -8(DI), DI
  13944. LEAL 8(R11), R11
  13945. JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
  13946. matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
  // Difference in the word held in R10: add trailing zero bits / 8 to R11.
  13947. #ifdef GOAMD64_v3
  13948. TZCNTQ R10, R10
  13949. #else
  13950. BSFQ R10, R10
  13951. #endif
  13952. SARQ $0x03, R10
  13953. LEAL (R11)(R10*1), R11
  13954. JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
  13955. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
  // Tail: 4-byte, then 2-byte, then 1-byte compares on what is left.
  13956. CMPL DI, $0x04
  13957. JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
  13958. MOVL (R8)(R11*1), R10
  13959. CMPL (R9)(R11*1), R10
  13960. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
  13961. LEAL -4(DI), DI
  13962. LEAL 4(R11), R11
  13963. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
  // DI == 1 goes straight to the byte compare, DI == 0 ends the count.
  13964. CMPL DI, $0x01
  13965. JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
  13966. JB match_nolit_end_encodeSnappyBetterBlockAsm64K
  13967. MOVW (R8)(R11*1), R10
  13968. CMPW (R9)(R11*1), R10
  13969. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
  13970. LEAL 2(R11), R11
  13971. SUBL $0x02, DI
  13972. JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
  13973. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
  13974. MOVB (R8)(R11*1), R10
  13975. CMPB (R9)(R11*1), R10
  13976. JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
  13977. LEAL 1(R11), R11
  13978. match_nolit_end_encodeSnappyBetterBlockAsm64K:
  // DI = CX - BX is the match offset (both were advanced by 4, so the
  // difference is unchanged).
  13979. MOVL CX, DI
  13980. SUBL BX, DI
  13981. // Check if repeat
  // The offset is only recorded in 16(SP). Nothing in this function reads it back.
  13982. MOVL DI, 16(SP)
  // Emit src[12(SP):SI] as a literal run unless it is empty.
  // R8 = run length, R9 = pointer to its first byte, BX = length - 1.
  13983. MOVL 12(SP), BX
  13984. CMPL BX, SI
  13985. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
  13986. MOVL SI, R8
  13987. MOVL SI, 12(SP)
  13988. LEAQ (DX)(BX*1), R9
  13989. SUBL BX, R8
  13990. LEAL -1(R8), BX
  // Header size by length-1: below 60 -> 1 byte, below 256 -> 2 bytes, else 3 bytes.
  13991. CMPL BX, $0x3c
  13992. JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
  13993. CMPL BX, $0x00000100
  13994. JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
  // NOTE(review): the next branch targets the label that follows it, so it has
  // no effect on control flow.
  13995. JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K
  13996. three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
  // Tag 0xf4 followed by length-1 as 16 bits.
  13997. MOVB $0xf4, (AX)
  13998. MOVW BX, 1(AX)
  13999. ADDQ $0x03, AX
  14000. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
  14001. two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
  // Tag 0xf0 followed by length-1 as one byte. Runs of up to 64 bytes use the
  // short copy, longer ones the long copy.
  14002. MOVB $0xf0, (AX)
  14003. MOVB BL, 1(AX)
  14004. ADDQ $0x02, AX
  14005. CMPL BX, $0x40
  14006. JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
  14007. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
  14008. one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
  // Single tag byte: (length-1) << 2.
  14009. SHLB $0x02, BL
  14010. MOVB BL, (AX)
  14011. ADDQ $0x01, AX
  14012. memmove_match_emit_encodeSnappyBetterBlockAsm64K:
  // BX = dst pointer after the run. AX is set to it once the copy is done.
  14013. LEAQ (AX)(R8*1), BX
  14014. // genMemMoveShort
  // Dispatch on length R8 (1..64): up to 8, 9..16, 17..32, 33..64.
  14015. CMPQ R8, $0x08
  14016. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
  14017. CMPQ R8, $0x10
  14018. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
  14019. CMPQ R8, $0x20
  14020. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
  14021. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
  14022. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
  // Always moves a full 8 bytes, even for shorter runs, so up to 7 extra bytes
  // are read from src and written to dst.
  // NOTE(review): presumably covered by the slack in the 0(SP) and 8(SP) limits - confirm.
  14023. MOVQ (R9), R10
  14024. MOVQ R10, (AX)
  14025. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  14026. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
  // First 8 and last 8 bytes (the two may overlap). Both loads precede the stores.
  14027. MOVQ (R9), R10
  14028. MOVQ -8(R9)(R8*1), R9
  14029. MOVQ R10, (AX)
  14030. MOVQ R9, -8(AX)(R8*1)
  14031. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  14032. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
  // First 16 and last 16 bytes.
  14033. MOVOU (R9), X0
  14034. MOVOU -16(R9)(R8*1), X1
  14035. MOVOU X0, (AX)
  14036. MOVOU X1, -16(AX)(R8*1)
  14037. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  14038. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
  // First 32 and last 32 bytes.
  14039. MOVOU (R9), X0
  14040. MOVOU 16(R9), X1
  14041. MOVOU -32(R9)(R8*1), X2
  14042. MOVOU -16(R9)(R8*1), X3
  14043. MOVOU X0, (AX)
  14044. MOVOU X1, 16(AX)
  14045. MOVOU X2, -32(AX)(R8*1)
  14046. MOVOU X3, -16(AX)(R8*1)
  14047. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
  14048. MOVQ BX, AX
  14049. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
  14050. memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
  14051. LEAQ (AX)(R8*1), BX
  14052. // genMemMoveLong
  // Keep the first and last 32 bytes of the run in X0-X3. They are stored last,
  // after the middle part has been copied in 32-byte steps.
  14053. MOVOU (R9), X0
  14054. MOVOU 16(R9), X1
  14055. MOVOU -32(R9)(R8*1), X2
  14056. MOVOU -16(R9)(R8*1), X3
  // R12 = length >> 5. R13 = 64 - (dst & 31), so dst + R13 - 32 is 32-byte
  // aligned and the MOVOA stores below hit aligned addresses.
  14057. MOVQ R8, R12
  14058. SHRQ $0x05, R12
  14059. MOVQ AX, R10
  14060. ANDL $0x0000001f, R10
  14061. MOVQ $0x00000040, R13
  14062. SUBQ R10, R13
  // NOTE(review): JA tests ZF from DECQ together with CF, which DECQ leaves as
  // set by the SUBQ above - verify the intended condition against the generator.
  14063. DECQ R12
  14064. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  14065. LEAQ -32(R9)(R13*1), R10
  14066. LEAQ -32(AX)(R13*1), R14
  14067. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
  14068. MOVOU (R10), X4
  14069. MOVOU 16(R10), X5
  14070. MOVOA X4, (R14)
  14071. MOVOA X5, 16(R14)
  14072. ADDQ $0x20, R14
  14073. ADDQ $0x20, R10
  14074. ADDQ $0x20, R13
  14075. DECQ R12
  14076. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
  14077. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
  // Copy 32 bytes ending at offset R13, advance R13 by 32, repeat while the
  // advanced R13 is still <= length.
  14078. MOVOU -32(R9)(R13*1), X4
  14079. MOVOU -16(R9)(R13*1), X5
  14080. MOVOA X4, -32(AX)(R13*1)
  14081. MOVOA X5, -16(AX)(R13*1)
  14082. ADDQ $0x20, R13
  14083. CMPQ R8, R13
  14084. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  // Store the saved head and tail with unaligned moves, then advance AX.
  14085. MOVOU X0, (AX)
  14086. MOVOU X1, 16(AX)
  14087. MOVOU X2, -32(AX)(R8*1)
  14088. MOVOU X3, -16(AX)(R8*1)
  14089. MOVQ BX, AX
  14090. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
  // CX moves to the end of the match and becomes the new 12(SP).
  // R11 += 4 gives the full match length including the 4 bytes skipped earlier.
  14091. ADDL R11, CX
  14092. ADDL $0x04, R11
  14093. MOVL CX, 12(SP)
  14094. // emitCopy
  14095. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
  // While more than 64 bytes remain: write tag 0xee plus the 16-bit offset
  // (3 bytes) and take 60 off the remaining length.
  14096. CMPL R11, $0x40
  14097. JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
  14098. MOVB $0xee, (AX)
  14099. MOVW DI, 1(AX)
  14100. LEAL -60(R11), R11
  14101. ADDQ $0x03, AX
  14102. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
  14103. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
  // BX = length << 2. Lengths of 12 or more, or offsets of 2048 or more, take
  // the 3-byte form. Everything else uses the 2-byte form.
  14104. MOVL R11, BX
  14105. SHLL $0x02, BX
  14106. CMPL R11, $0x0c
  14107. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
  14108. CMPL DI, $0x00000800
  14109. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
  // 2-byte form: tag = ((length << 2) - 15) | ((offset >> 8) << 5), then the
  // low offset byte. The low byte is stored before DI is shifted.
  14110. LEAL -15(BX), BX
  14111. MOVB DI, 1(AX)
  14112. SHRL $0x08, DI
  14113. SHLL $0x05, DI
  14114. ORL DI, BX
  14115. MOVB BL, (AX)
  14116. ADDQ $0x02, AX
  14117. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
  14118. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
  // 3-byte form: tag = (length << 2) - 2, then the offset as 16 bits.
  14119. LEAL -2(BX), BX
  14120. MOVB BL, (AX)
  14121. MOVW DI, 1(AX)
  14122. ADDQ $0x03, AX
  14123. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
  // Past the search bound: flush what is left. Output at the limit: return 0.
  14124. CMPL CX, 8(SP)
  14125. JAE emit_remainder_encodeSnappyBetterBlockAsm64K
  14126. CMPQ AX, (SP)
  14127. JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
  14128. MOVQ $0x00000000, ret+48(FP)
  14129. RET
  14130. match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
  // Index positions around the match just emitted. With SI = match start + 1
  // and R8 = CX - 2: SI and R8 go into the 16-bit table, SI + 1 and R8 + 1
  // into the 14-bit table, using the same two hashes as the search loop.
  14131. MOVQ $0x00cf1bbcdcbfa563, BX
  14132. MOVQ $0x9e3779b1, DI
  14133. LEAQ 1(SI), SI
  14134. LEAQ -2(CX), R8
  14135. MOVQ (DX)(SI*1), R9
  14136. MOVQ 1(DX)(SI*1), R10
  14137. MOVQ (DX)(R8*1), R11
  14138. MOVQ 1(DX)(R8*1), R12
  14139. SHLQ $0x08, R9
  14140. IMULQ BX, R9
  14141. SHRQ $0x30, R9
  14142. SHLQ $0x20, R10
  14143. IMULQ DI, R10
  14144. SHRQ $0x32, R10
  14145. SHLQ $0x08, R11
  14146. IMULQ BX, R11
  14147. SHRQ $0x30, R11
  14148. SHLQ $0x20, R12
  14149. IMULQ DI, R12
  14150. SHRQ $0x32, R12
  14151. LEAQ 1(SI), DI
  14152. LEAQ 1(R8), R13
  14153. MOVL SI, 24(SP)(R9*4)
  14154. MOVL R8, 24(SP)(R11*4)
  14155. MOVL DI, 262168(SP)(R10*4)
  14156. MOVL R13, 262168(SP)(R12*4)
  // DI = (SI + R8 + 1) >> 1 is the midpoint. Then SI += 1 and R8 -= 1 before
  // the loop below.
  14157. LEAQ 1(R8)(SI*1), DI
  14158. SHRQ $0x01, DI
  14159. ADDQ $0x01, SI
  14160. SUBQ $0x01, R8
  14161. index_loop_encodeSnappyBetterBlockAsm64K:
  // While DI < R8: store SI and DI in the 16-bit table under the hash of the
  // bytes at each position, then advance both by 2. Afterwards resume searching.
  14162. CMPQ DI, R8
  14163. JAE search_loop_encodeSnappyBetterBlockAsm64K
  14164. MOVQ (DX)(SI*1), R9
  14165. MOVQ (DX)(DI*1), R10
  14166. SHLQ $0x08, R9
  14167. IMULQ BX, R9
  14168. SHRQ $0x30, R9
  14169. SHLQ $0x08, R10
  14170. IMULQ BX, R10
  14171. SHRQ $0x30, R10
  14172. MOVL SI, 24(SP)(R9*4)
  14173. MOVL DI, 24(SP)(R10*4)
  14174. ADDQ $0x02, SI
  14175. ADDQ $0x02, DI
  14176. JMP index_loop_encodeSnappyBetterBlockAsm64K
  14177. emit_remainder_encodeSnappyBetterBlockAsm64K:
  // Return 0 if dst + 3 + (src_len - 12(SP)) reaches the output limit.
  14178. MOVQ src_len+32(FP), CX
  14179. SUBL 12(SP), CX
  14180. LEAQ 3(AX)(CX*1), CX
  14181. CMPQ CX, (SP)
  14182. JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
  14183. MOVQ $0x00000000, ret+48(FP)
  14184. RET
  14185. emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
  // Emit src[12(SP):src_len] as one final literal run, if any bytes are left.
  // Register roles change here: CX = pointer to the run, SI = its length,
  // DX = length - 1 (the src base in DX is no longer needed).
  14186. MOVQ src_len+32(FP), CX
  14187. MOVL 12(SP), BX
  14188. CMPL BX, CX
  14189. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
  14190. MOVL CX, SI
  14191. MOVL CX, 12(SP)
  14192. LEAQ (DX)(BX*1), CX
  14193. SUBL BX, SI
  14194. LEAL -1(SI), DX
  // Same header selection as for the literal run before a match.
  14195. CMPL DX, $0x3c
  14196. JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
  14197. CMPL DX, $0x00000100
  14198. JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
  // NOTE(review): the next branch targets the label that follows it, so it has
  // no effect on control flow.
  14199. JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
  14200. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
  14201. MOVB $0xf4, (AX)
  14202. MOVW DX, 1(AX)
  14203. ADDQ $0x03, AX
  14204. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
  14205. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
  14206. MOVB $0xf0, (AX)
  14207. MOVB DL, 1(AX)
  14208. ADDQ $0x02, AX
  14209. CMPL DX, $0x40
  14210. JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
  14211. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
  14212. one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
  14213. SHLB $0x02, DL
  14214. MOVB DL, (AX)
  14215. ADDQ $0x01, AX
  14216. memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
  // DX = dst pointer after the run, BX = length.
  14217. LEAQ (AX)(SI*1), DX
  14218. MOVL SI, BX
  14219. // genMemMoveShort
  // Unlike the copy before a match, every size class here moves exactly BX
  // bytes: 1..2, 3, 4..7, 8..16, 17..32, 33..64.
  14220. CMPQ BX, $0x03
  14221. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
  14222. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
  14223. CMPQ BX, $0x08
  14224. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
  14225. CMPQ BX, $0x10
  14226. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
  14227. CMPQ BX, $0x20
  14228. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
  14229. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
  14230. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
  // First and last byte (the same byte when BX is 1). CX is overwritten by the
  // second load, after its last use as a pointer.
  14231. MOVB (CX), SI
  14232. MOVB -1(CX)(BX*1), CL
  14233. MOVB SI, (AX)
  14234. MOVB CL, -1(AX)(BX*1)
  14235. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  14236. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
  // Two bytes plus one byte.
  14237. MOVW (CX), SI
  14238. MOVB 2(CX), CL
  14239. MOVW SI, (AX)
  14240. MOVB CL, 2(AX)
  14241. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  14242. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
  // First 4 and last 4 bytes, possibly overlapping.
  14243. MOVL (CX), SI
  14244. MOVL -4(CX)(BX*1), CX
  14245. MOVL SI, (AX)
  14246. MOVL CX, -4(AX)(BX*1)
  14247. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  14248. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
  // First 8 and last 8 bytes, possibly overlapping.
  14249. MOVQ (CX), SI
  14250. MOVQ -8(CX)(BX*1), CX
  14251. MOVQ SI, (AX)
  14252. MOVQ CX, -8(AX)(BX*1)
  14253. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  14254. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
  // First 16 and last 16 bytes.
  14255. MOVOU (CX), X0
  14256. MOVOU -16(CX)(BX*1), X1
  14257. MOVOU X0, (AX)
  14258. MOVOU X1, -16(AX)(BX*1)
  14259. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  14260. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
  // First 32 and last 32 bytes.
  14261. MOVOU (CX), X0
  14262. MOVOU 16(CX), X1
  14263. MOVOU -32(CX)(BX*1), X2
  14264. MOVOU -16(CX)(BX*1), X3
  14265. MOVOU X0, (AX)
  14266. MOVOU X1, 16(AX)
  14267. MOVOU X2, -32(AX)(BX*1)
  14268. MOVOU X3, -16(AX)(BX*1)
  14269. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
  14270. MOVQ DX, AX
  14271. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
  14272. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
  14273. LEAQ (AX)(SI*1), DX
  14274. MOVL SI, BX
  14275. // genMemMoveLong
  // Same scheme as the long copy before a match, with other registers:
  // head and tail saved in X0-X3, DI = length >> 5, R8 = 64 - (dst & 31).
  14276. MOVOU (CX), X0
  14277. MOVOU 16(CX), X1
  14278. MOVOU -32(CX)(BX*1), X2
  14279. MOVOU -16(CX)(BX*1), X3
  14280. MOVQ BX, DI
  14281. SHRQ $0x05, DI
  14282. MOVQ AX, SI
  14283. ANDL $0x0000001f, SI
  14284. MOVQ $0x00000040, R8
  14285. SUBQ SI, R8
  // NOTE(review): JA tests ZF from DECQ together with CF, which DECQ leaves as
  // set by the SUBQ above - verify the intended condition against the generator.
  14286. DECQ DI
  14287. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  14288. LEAQ -32(CX)(R8*1), SI
  14289. LEAQ -32(AX)(R8*1), R9
  14290. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
  14291. MOVOU (SI), X4
  14292. MOVOU 16(SI), X5
  14293. MOVOA X4, (R9)
  14294. MOVOA X5, 16(R9)
  14295. ADDQ $0x20, R9
  14296. ADDQ $0x20, SI
  14297. ADDQ $0x20, R8
  14298. DECQ DI
  14299. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
  14300. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
  // Copy 32 bytes ending at offset R8, advance R8 by 32, repeat while the
  // advanced R8 is still <= length.
  14301. MOVOU -32(CX)(R8*1), X4
  14302. MOVOU -16(CX)(R8*1), X5
  14303. MOVOA X4, -32(AX)(R8*1)
  14304. MOVOA X5, -16(AX)(R8*1)
  14305. ADDQ $0x20, R8
  14306. CMPQ BX, R8
  14307. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  // Store the saved head and tail, then advance AX past the run.
  14308. MOVOU X0, (AX)
  14309. MOVOU X1, 16(AX)
  14310. MOVOU X2, -32(AX)(BX*1)
  14311. MOVOU X3, -16(AX)(BX*1)
  14312. MOVQ DX, AX
  14313. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
  // Result = bytes written = AX - dst_base.
  14314. MOVQ dst_base+0(FP), CX
  14315. SUBQ CX, AX
  14316. MOVQ AX, ret+48(FP)
  14317. RET
  14318. // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
  14319. // Requires: BMI, SSE2
         //
         // Compresses src into dst using two hash tables kept in the stack frame and
         // returns the number of bytes written, or 0 when the output would reach the
         // dst limit computed below.
         //
         // Frame layout as used by this routine:
         //   (SP)       64-bit dst pointer limit: dst_base + (src_len - 9 - (src_len >> 5))
         //   8(SP)      32-bit src_len - 8; searching stops once a position reaches it
         //   12(SP)     32-bit src offset of the first byte not yet emitted
         //   16(SP)     32-bit offset of the most recent match (stored, never read back here)
         //   20(SP)     32-bit position at which the search resumes when nothing matches
         //   24(SP)     16384 x 4-byte table, indexed by a 14-bit hash of 6 input bytes
         //   65560(SP)  4096 x 4-byte table, indexed by a 12-bit hash of 4 input bytes
         //
         // Register roles in the main loop: AX = dst write pointer, DX = src base,
         // CX = current src position.
  14320. TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
  14321. MOVQ dst_base+0(FP), AX
  14322. MOVQ $0x00000280, CX
  14323. LEAQ 24(SP), DX
  14324. PXOR X0, X0
         // Clear both tables: 0x280 iterations x 128 bytes = 81920 bytes from 24(SP).
  14325. zero_loop_encodeSnappyBetterBlockAsm12B:
  14326. MOVOU X0, (DX)
  14327. MOVOU X0, 16(DX)
  14328. MOVOU X0, 32(DX)
  14329. MOVOU X0, 48(DX)
  14330. MOVOU X0, 64(DX)
  14331. MOVOU X0, 80(DX)
  14332. MOVOU X0, 96(DX)
  14333. MOVOU X0, 112(DX)
  14334. ADDQ $0x80, DX
  14335. DECQ CX
  14336. JNZ zero_loop_encodeSnappyBetterBlockAsm12B
         // Initialise the stack slots: nothing emitted yet, search limit, dst limit.
  14337. MOVL $0x00000000, 12(SP)
  14338. MOVQ src_len+32(FP), CX
  14339. LEAQ -9(CX), DX
  14340. LEAQ -8(CX), BX
  14341. MOVL BX, 8(SP)
  14342. SHRQ $0x05, CX
  14343. SUBL CX, DX
  14344. LEAQ (AX)(DX*1), DX
  14345. MOVQ DX, (SP)
         // Start searching at src position 1; DX becomes the src base pointer.
  14346. MOVL $0x00000001, CX
  14347. MOVL $0x00000000, 16(SP)
  14348. MOVQ src_base+24(FP), DX
         // Main search loop. The next position to try is
         // CX + 1 + ((CX - pending start) >> 6), so the stride grows while no match
         // is found. Leave for the remainder path once it reaches the 8(SP) limit.
  14349. search_loop_encodeSnappyBetterBlockAsm12B:
  14350. MOVL CX, BX
  14351. SUBL 12(SP), BX
  14352. SHRL $0x06, BX
  14353. LEAL 1(CX)(BX*1), BX
  14354. CMPL BX, 8(SP)
  14355. JAE emit_remainder_encodeSnappyBetterBlockAsm12B
         // SI = 8 bytes at the current position; remember the next position in 20(SP).
  14356. MOVQ (DX)(CX*1), SI
  14357. MOVL BX, 20(SP)
         // R9 = 14-bit hash of the low 6 bytes (shift out the top 2, multiply, keep the
         // top 14 bits); R10 = 12-bit hash of the low 4 bytes.
  14358. MOVQ $0x0000cf1bbcdcbf9b, R8
  14359. MOVQ $0x9e3779b1, BX
  14360. MOVQ SI, R9
  14361. MOVQ SI, R10
  14362. SHLQ $0x10, R9
  14363. IMULQ R8, R9
  14364. SHRQ $0x32, R9
  14365. SHLQ $0x20, R10
  14366. IMULQ BX, R10
  14367. SHRQ $0x34, R10
         // Fetch the previous occupants (BX from the 6-byte table, DI from the 4-byte
         // table) and record the current position in both slots.
  14368. MOVL 24(SP)(R9*4), BX
  14369. MOVL 65560(SP)(R10*4), DI
  14370. MOVL CX, 24(SP)(R9*4)
  14371. MOVL CX, 65560(SP)(R10*4)
         // Prefer a candidate whose full 8 bytes equal the current 8 bytes; the
         // 6-byte-table candidate is checked first.
  14372. MOVQ (DX)(BX*1), R9
  14373. MOVQ (DX)(DI*1), R10
  14374. CMPQ R9, SI
  14375. JEQ candidate_match_encodeSnappyBetterBlockAsm12B
  14376. CMPQ R10, SI
  14377. JNE no_short_found_encodeSnappyBetterBlockAsm12B
  14378. MOVL DI, BX
  14379. JMP candidate_match_encodeSnappyBetterBlockAsm12B
         // No 8-byte match: fall back to comparing only the low 4 bytes. If neither
         // candidate matches, advance to the position saved in 20(SP) and retry.
  14380. no_short_found_encodeSnappyBetterBlockAsm12B:
  14381. CMPL R9, SI
  14382. JEQ candidate_match_encodeSnappyBetterBlockAsm12B
  14383. CMPL R10, SI
  14384. JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
  14385. MOVL 20(SP), CX
  14386. JMP search_loop_encodeSnappyBetterBlockAsm12B
         // Only the 4-byte-table candidate matched. Before accepting it, probe the
         // 6-byte table for position CX+1 (SI shifted right by one byte) and store
         // CX+1 there; if that candidate matches 4 bytes, use it with CX+1.
         // Otherwise undo the increment and use the 4-byte-table candidate in DI.
  14387. candidateS_match_encodeSnappyBetterBlockAsm12B:
  14388. SHRQ $0x08, SI
  14389. MOVQ SI, R9
  14390. SHLQ $0x10, R9
  14391. IMULQ R8, R9
  14392. SHRQ $0x32, R9
  14393. MOVL 24(SP)(R9*4), BX
  14394. INCL CX
  14395. MOVL CX, 24(SP)(R9*4)
  14396. CMPL (DX)(BX*1), SI
  14397. JEQ candidate_match_encodeSnappyBetterBlockAsm12B
  14398. DECL CX
  14399. MOVL DI, BX
         // A candidate (BX) has been chosen for position CX. Extend the match
         // backwards one byte at a time while the preceding bytes are equal, without
         // moving CX below the pending start (SI) or BX below 0.
  14400. candidate_match_encodeSnappyBetterBlockAsm12B:
  14401. MOVL 12(SP), SI
  14402. TESTL BX, BX
  14403. JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
  14404. match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
  14405. CMPL CX, SI
  14406. JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B
  14407. MOVB -1(DX)(BX*1), DI
  14408. MOVB -1(DX)(CX*1), R8
  14409. CMPB DI, R8
  14410. JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
  14411. LEAL -1(CX), CX
  14412. DECL BX
  14413. JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
  14414. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
         // Output-space check: AX + 3 + (number of pending bytes) must stay below the
         // dst limit in (SP); otherwise give up and return 0.
  14415. match_extend_back_end_encodeSnappyBetterBlockAsm12B:
  14416. MOVL CX, SI
  14417. SUBL 12(SP), SI
  14418. LEAQ 3(AX)(SI*1), SI
  14419. CMPQ SI, (SP)
  14420. JB match_dst_size_check_encodeSnappyBetterBlockAsm12B
  14421. MOVQ $0x00000000, ret+48(FP)
  14422. RET
         // SI keeps the match start. Skip the 4 bytes already known to match on both
         // sides, then set DI = bytes left in src, R8 = &src[CX], R9 = &src[BX].
  14423. match_dst_size_check_encodeSnappyBetterBlockAsm12B:
  14424. MOVL CX, SI
  14425. ADDL $0x04, CX
  14426. ADDL $0x04, BX
  14427. MOVQ src_len+32(FP), DI
  14428. SUBL CX, DI
  14429. LEAQ (DX)(CX*1), R8
  14430. LEAQ (DX)(BX*1), R9
  14431. // matchLen
         // R11 counts further matching bytes. Compare 16 bytes per iteration while at
         // least 16 remain, then 8, 4, 2 and 1 bytes.
  14432. XORL R11, R11
  14433. matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
  14434. CMPL DI, $0x10
  14435. JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B
  14436. MOVQ (R8)(R11*1), R10
  14437. MOVQ 8(R8)(R11*1), R12
  14438. XORQ (R9)(R11*1), R10
  14439. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
  14440. XORQ 8(R9)(R11*1), R12
  14441. JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
  14442. LEAL -16(DI), DI
  14443. LEAL 16(R11), R11
  14444. JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B
         // Mismatch in the second 8 bytes: the index of the lowest set bit of the XOR,
         // divided by 8, is the number of equal bytes; add it plus the first 8.
  14445. matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
  14446. #ifdef GOAMD64_v3
  14447. TZCNTQ R12, R12
  14448. #else
  14449. BSFQ R12, R12
  14450. #endif
  14451. SARQ $0x03, R12
  14452. LEAL 8(R11)(R12*1), R11
  14453. JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
  14454. matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
  14455. CMPL DI, $0x08
  14456. JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
  14457. MOVQ (R8)(R11*1), R10
  14458. XORQ (R9)(R11*1), R10
  14459. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
  14460. LEAL -8(DI), DI
  14461. LEAL 8(R11), R11
  14462. JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
         // Mismatch within an 8-byte word: lowest set bit of the XOR / 8 = equal bytes.
  14463. matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
  14464. #ifdef GOAMD64_v3
  14465. TZCNTQ R10, R10
  14466. #else
  14467. BSFQ R10, R10
  14468. #endif
  14469. SARQ $0x03, R10
  14470. LEAL (R11)(R10*1), R11
  14471. JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
  14472. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
  14473. CMPL DI, $0x04
  14474. JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
  14475. MOVL (R8)(R11*1), R10
  14476. CMPL (R9)(R11*1), R10
  14477. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
  14478. LEAL -4(DI), DI
  14479. LEAL 4(R11), R11
         // Fewer than 4 bytes left or a mismatch within 4: exactly 1 byte left goes to
         // the byte compare, 0 bytes left ends the scan, otherwise compare 2 bytes.
  14480. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
  14481. CMPL DI, $0x01
  14482. JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
  14483. JB match_nolit_end_encodeSnappyBetterBlockAsm12B
  14484. MOVW (R8)(R11*1), R10
  14485. CMPW (R9)(R11*1), R10
  14486. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
  14487. LEAL 2(R11), R11
  14488. SUBL $0x02, DI
  14489. JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
  14490. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
  14491. MOVB (R8)(R11*1), R10
  14492. CMPB (R9)(R11*1), R10
  14493. JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
  14494. LEAL 1(R11), R11
         // DI = match offset (current position minus candidate position).
  14495. match_nolit_end_encodeSnappyBetterBlockAsm12B:
  14496. MOVL CX, DI
  14497. SUBL BX, DI
  14498. // Check if repeat
         // The offset is only saved to 16(SP) here; no comparison against a previous
         // offset is made in this routine.
  14499. MOVL DI, 16(SP)
         // Emit the pending literal bytes src[12(SP) : match start], if there are any.
         // R9 = source pointer, R8 = literal length, BX = length - 1.
  14500. MOVL 12(SP), BX
  14501. CMPL BX, SI
  14502. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
  14503. MOVL SI, R8
  14504. MOVL SI, 12(SP)
  14505. LEAQ (DX)(BX*1), R9
  14506. SUBL BX, R8
  14507. LEAL -1(R8), BX
         // Literal header size depends on length-1: below 60 -> 1 byte, below 256 ->
         // 2 bytes, otherwise 3 bytes. The second JB tests the same flags as the first
         // and is never taken; execution falls through to the 3-byte case.
  14508. CMPL BX, $0x3c
  14509. JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B
  14510. CMPL BX, $0x00000100
  14511. JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
  14512. JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B
         // 3-byte header: 0xf4 followed by the low 16 bits of length-1.
  14513. three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
  14514. MOVB $0xf4, (AX)
  14515. MOVW BX, 1(AX)
  14516. ADDQ $0x03, AX
  14517. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
         // 2-byte header: 0xf0 followed by length-1. Lengths with length-1 < 64 use
         // the short copy, longer ones the long copy.
  14518. two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
  14519. MOVB $0xf0, (AX)
  14520. MOVB BL, 1(AX)
  14521. ADDQ $0x02, AX
  14522. CMPL BX, $0x40
  14523. JB memmove_match_emit_encodeSnappyBetterBlockAsm12B
  14524. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
         // 1-byte header: (length-1) << 2.
  14525. one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
  14526. SHLB $0x02, BL
  14527. MOVB BL, (AX)
  14528. ADDQ $0x01, AX
         // Short literal copy of R8 bytes from R9 to AX; BX = dst pointer after the copy.
  14529. memmove_match_emit_encodeSnappyBetterBlockAsm12B:
  14530. LEAQ (AX)(R8*1), BX
  14531. // genMemMoveShort
  14532. CMPQ R8, $0x08
  14533. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
  14534. CMPQ R8, $0x10
  14535. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
  14536. CMPQ R8, $0x20
  14537. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
  14538. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
         // Lengths up to 8 always move a full 8 bytes; AX still advances by R8 only.
         // NOTE(review): presumably safe because of the margins kept by the 8(SP) and
         // (SP) limits - confirm against the generator.
  14539. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
  14540. MOVQ (R9), R10
  14541. MOVQ R10, (AX)
  14542. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
         // 9..16 bytes: first 8 and last 8 bytes, overlapping as needed.
  14543. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
  14544. MOVQ (R9), R10
  14545. MOVQ -8(R9)(R8*1), R9
  14546. MOVQ R10, (AX)
  14547. MOVQ R9, -8(AX)(R8*1)
  14548. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
         // 17..32 bytes: first 16 and last 16 bytes, overlapping as needed.
  14549. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
  14550. MOVOU (R9), X0
  14551. MOVOU -16(R9)(R8*1), X1
  14552. MOVOU X0, (AX)
  14553. MOVOU X1, -16(AX)(R8*1)
  14554. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
         // 33..64 bytes: first 32 and last 32 bytes, overlapping as needed.
  14555. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
  14556. MOVOU (R9), X0
  14557. MOVOU 16(R9), X1
  14558. MOVOU -32(R9)(R8*1), X2
  14559. MOVOU -16(R9)(R8*1), X3
  14560. MOVOU X0, (AX)
  14561. MOVOU X1, 16(AX)
  14562. MOVOU X2, -32(AX)(R8*1)
  14563. MOVOU X3, -16(AX)(R8*1)
  14564. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
  14565. MOVQ BX, AX
  14566. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
         // Long literal copy of R8 bytes from R9 to AX. The first and last 32 bytes are
         // loaded into X0-X3 up front and stored (unaligned) after the loop; the loop
         // fills the middle in 32-byte steps.
  14567. memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
  14568. LEAQ (AX)(R8*1), BX
  14569. // genMemMoveLong
  14570. MOVOU (R9), X0
  14571. MOVOU 16(R9), X1
  14572. MOVOU -32(R9)(R8*1), X2
  14573. MOVOU -16(R9)(R8*1), X3
         // R12 = length / 32. R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte
         // aligned, which is what the MOVOA stores below rely on.
  14574. MOVQ R8, R12
  14575. SHRQ $0x05, R12
  14576. MOVQ AX, R10
  14577. ANDL $0x0000001f, R10
  14578. MOVQ $0x00000040, R13
  14579. SUBQ R10, R13
         // NOTE(review): DECQ leaves CF as set by the SUBQ above, so JA is taken
         // whenever R12-1 is non-zero. Every caller reaches here with length >= 65,
         // so the big_loop_back block looks unused - confirm.
  14580. DECQ R12
  14581. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
  14582. LEAQ -32(R9)(R13*1), R10
  14583. LEAQ -32(AX)(R13*1), R14
  14584. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
  14585. MOVOU (R10), X4
  14586. MOVOU 16(R10), X5
  14587. MOVOA X4, (R14)
  14588. MOVOA X5, 16(R14)
  14589. ADDQ $0x20, R14
  14590. ADDQ $0x20, R10
  14591. ADDQ $0x20, R13
  14592. DECQ R12
  14593. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
         // Copy 32 bytes ending at offset R13 per iteration (unaligned loads, aligned
         // stores) and continue while length >= R13.
  14594. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
  14595. MOVOU -32(R9)(R13*1), X4
  14596. MOVOU -16(R9)(R13*1), X5
  14597. MOVOA X4, -32(AX)(R13*1)
  14598. MOVOA X5, -16(AX)(R13*1)
  14599. ADDQ $0x20, R13
  14600. CMPQ R8, R13
  14601. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
         // Store the saved head and tail, then advance the dst pointer past the literal.
  14602. MOVOU X0, (AX)
  14603. MOVOU X1, 16(AX)
  14604. MOVOU X2, -32(AX)(R8*1)
  14605. MOVOU X3, -16(AX)(R8*1)
  14606. MOVQ BX, AX
         // Literals are out. Move CX to the end of the match, make R11 the full match
         // length (including the first 4 bytes), and mark everything up to CX as emitted.
  14607. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
  14608. ADDL R11, CX
  14609. ADDL $0x04, R11
  14610. MOVL CX, 12(SP)
  14611. // emitCopy
         // While more than 64 bytes remain, emit the 3-byte sequence 0xee + 16-bit
         // offset and subtract 60 from the remaining length.
  14612. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
  14613. CMPL R11, $0x40
  14614. JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
  14615. MOVB $0xee, (AX)
  14616. MOVW DI, 1(AX)
  14617. LEAL -60(R11), R11
  14618. ADDQ $0x03, AX
  14619. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
         // Final copy (length <= 64). With length < 12 and offset < 2048 emit 2 bytes:
         // (length*4 - 15) | ((offset >> 8) << 5), then the low offset byte.
  14620. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
  14621. MOVL R11, BX
  14622. SHLL $0x02, BX
  14623. CMPL R11, $0x0c
  14624. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
  14625. CMPL DI, $0x00000800
  14626. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
  14627. LEAL -15(BX), BX
  14628. MOVB DI, 1(AX)
  14629. SHRL $0x08, DI
  14630. SHLL $0x05, DI
  14631. ORL DI, BX
  14632. MOVB BL, (AX)
  14633. ADDQ $0x02, AX
  14634. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
         // Otherwise emit 3 bytes: (length*4 - 2) followed by the 16-bit offset.
  14635. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
  14636. LEAL -2(BX), BX
  14637. MOVB BL, (AX)
  14638. MOVW DI, 1(AX)
  14639. ADDQ $0x03, AX
         // Stop searching once CX reaches the 8(SP) limit; return 0 if the dst pointer
         // has reached the limit in (SP).
  14640. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
  14641. CMPL CX, 8(SP)
  14642. JAE emit_remainder_encodeSnappyBetterBlockAsm12B
  14643. CMPQ AX, (SP)
  14644. JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
  14645. MOVQ $0x00000000, ret+48(FP)
  14646. RET
         // Index positions inside the match just emitted (SI = match start, CX = match
         // end): start+1 and end-2 go into the 6-byte table, start+2 and end-1 into
         // the 4-byte table.
  14647. match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
  14648. MOVQ $0x0000cf1bbcdcbf9b, BX
  14649. MOVQ $0x9e3779b1, DI
  14650. LEAQ 1(SI), SI
  14651. LEAQ -2(CX), R8
  14652. MOVQ (DX)(SI*1), R9
  14653. MOVQ 1(DX)(SI*1), R10
  14654. MOVQ (DX)(R8*1), R11
  14655. MOVQ 1(DX)(R8*1), R12
  14656. SHLQ $0x10, R9
  14657. IMULQ BX, R9
  14658. SHRQ $0x32, R9
  14659. SHLQ $0x20, R10
  14660. IMULQ DI, R10
  14661. SHRQ $0x34, R10
  14662. SHLQ $0x10, R11
  14663. IMULQ BX, R11
  14664. SHRQ $0x32, R11
  14665. SHLQ $0x20, R12
  14666. IMULQ DI, R12
  14667. SHRQ $0x34, R12
  14668. LEAQ 1(SI), DI
  14669. LEAQ 1(R8), R13
  14670. MOVL SI, 24(SP)(R9*4)
  14671. MOVL R8, 24(SP)(R11*4)
  14672. MOVL DI, 65560(SP)(R10*4)
  14673. MOVL R13, 65560(SP)(R12*4)
         // Two cursors walk the rest of the match in steps of 2: SI from start+2 and
         // DI from the midpoint (start+1 + end-2 + 1) / 2, up to R8 = end-3.
  14674. LEAQ 1(R8)(SI*1), DI
  14675. SHRQ $0x01, DI
  14676. ADDQ $0x01, SI
  14677. SUBQ $0x01, R8
         // Each iteration stores both cursor positions in the 6-byte table only.
  14678. index_loop_encodeSnappyBetterBlockAsm12B:
  14679. CMPQ DI, R8
  14680. JAE search_loop_encodeSnappyBetterBlockAsm12B
  14681. MOVQ (DX)(SI*1), R9
  14682. MOVQ (DX)(DI*1), R10
  14683. SHLQ $0x10, R9
  14684. IMULQ BX, R9
  14685. SHRQ $0x32, R9
  14686. SHLQ $0x10, R10
  14687. IMULQ BX, R10
  14688. SHRQ $0x32, R10
  14689. MOVL SI, 24(SP)(R9*4)
  14690. MOVL DI, 24(SP)(R10*4)
  14691. ADDQ $0x02, SI
  14692. ADDQ $0x02, DI
  14693. JMP index_loop_encodeSnappyBetterBlockAsm12B
         // End of input: check that AX + 3 + (src_len - pending start) stays below the
         // dst limit, otherwise return 0.
  14694. emit_remainder_encodeSnappyBetterBlockAsm12B:
  14695. MOVQ src_len+32(FP), CX
  14696. SUBL 12(SP), CX
  14697. LEAQ 3(AX)(CX*1), CX
  14698. CMPQ CX, (SP)
  14699. JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B
  14700. MOVQ $0x00000000, ret+48(FP)
  14701. RET
         // Emit src[12(SP) : src_len] as one final literal unless it is empty.
         // CX = source pointer, SI = length, DX = length - 1 (the src base in DX is
         // no longer needed from here on).
  14702. emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
  14703. MOVQ src_len+32(FP), CX
  14704. MOVL 12(SP), BX
  14705. CMPL BX, CX
  14706. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
  14707. MOVL CX, SI
  14708. MOVL CX, 12(SP)
  14709. LEAQ (DX)(BX*1), CX
  14710. SUBL BX, SI
  14711. LEAL -1(SI), DX
         // Same header selection as for match literals; the second JB is never taken.
  14712. CMPL DX, $0x3c
  14713. JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
  14714. CMPL DX, $0x00000100
  14715. JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
  14716. JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
  14717. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14718. MOVB $0xf4, (AX)
  14719. MOVW DX, 1(AX)
  14720. ADDQ $0x03, AX
  14721. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
  14722. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14723. MOVB $0xf0, (AX)
  14724. MOVB DL, 1(AX)
  14725. ADDQ $0x02, AX
  14726. CMPL DX, $0x40
  14727. JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
  14728. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
  14729. one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14730. SHLB $0x02, DL
  14731. MOVB DL, (AX)
  14732. ADDQ $0x01, AX
         // Short copy of BX bytes from CX to AX; DX = dst pointer after the copy.
         // Unlike the match-literal copy above, every size class here writes exactly
         // BX bytes.
  14733. memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14734. LEAQ (AX)(SI*1), DX
  14735. MOVL SI, BX
  14736. // genMemMoveShort
  14737. CMPQ BX, $0x03
  14738. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
  14739. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
  14740. CMPQ BX, $0x08
  14741. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
  14742. CMPQ BX, $0x10
  14743. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
  14744. CMPQ BX, $0x20
  14745. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
  14746. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
         // 1 or 2 bytes: first byte and last byte (the same byte when BX is 1).
  14747. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
  14748. MOVB (CX), SI
  14749. MOVB -1(CX)(BX*1), CL
  14750. MOVB SI, (AX)
  14751. MOVB CL, -1(AX)(BX*1)
  14752. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
         // Exactly 3 bytes: one 16-bit move plus one byte.
  14753. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
  14754. MOVW (CX), SI
  14755. MOVB 2(CX), CL
  14756. MOVW SI, (AX)
  14757. MOVB CL, 2(AX)
  14758. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
         // 4..7 bytes: first 4 and last 4 bytes, overlapping as needed.
  14759. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
  14760. MOVL (CX), SI
  14761. MOVL -4(CX)(BX*1), CX
  14762. MOVL SI, (AX)
  14763. MOVL CX, -4(AX)(BX*1)
  14764. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
         // 8..16 bytes: first 8 and last 8 bytes, overlapping as needed.
  14765. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
  14766. MOVQ (CX), SI
  14767. MOVQ -8(CX)(BX*1), CX
  14768. MOVQ SI, (AX)
  14769. MOVQ CX, -8(AX)(BX*1)
  14770. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
         // 17..32 bytes: first 16 and last 16 bytes, overlapping as needed.
  14771. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
  14772. MOVOU (CX), X0
  14773. MOVOU -16(CX)(BX*1), X1
  14774. MOVOU X0, (AX)
  14775. MOVOU X1, -16(AX)(BX*1)
  14776. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
         // 33..64 bytes: first 32 and last 32 bytes, overlapping as needed.
  14777. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
  14778. MOVOU (CX), X0
  14779. MOVOU 16(CX), X1
  14780. MOVOU -32(CX)(BX*1), X2
  14781. MOVOU -16(CX)(BX*1), X3
  14782. MOVOU X0, (AX)
  14783. MOVOU X1, 16(AX)
  14784. MOVOU X2, -32(AX)(BX*1)
  14785. MOVOU X3, -16(AX)(BX*1)
  14786. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14787. MOVQ DX, AX
  14788. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
         // Long copy of BX bytes from CX to AX, structured like the match-literal long
         // copy: head and tail held in X0-X3, middle copied in 32-byte steps with
         // aligned stores.
  14789. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14790. LEAQ (AX)(SI*1), DX
  14791. MOVL SI, BX
  14792. // genMemMoveLong
  14793. MOVOU (CX), X0
  14794. MOVOU 16(CX), X1
  14795. MOVOU -32(CX)(BX*1), X2
  14796. MOVOU -16(CX)(BX*1), X3
         // DI = length / 32; R8 = 64 - (AX & 31), making AX + R8 - 32 32-byte aligned.
  14797. MOVQ BX, DI
  14798. SHRQ $0x05, DI
  14799. MOVQ AX, SI
  14800. ANDL $0x0000001f, SI
  14801. MOVQ $0x00000040, R8
  14802. SUBQ SI, R8
  14803. DECQ DI
  14804. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
  14805. LEAQ -32(CX)(R8*1), SI
  14806. LEAQ -32(AX)(R8*1), R9
  14807. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
  14808. MOVOU (SI), X4
  14809. MOVOU 16(SI), X5
  14810. MOVOA X4, (R9)
  14811. MOVOA X5, 16(R9)
  14812. ADDQ $0x20, R9
  14813. ADDQ $0x20, SI
  14814. ADDQ $0x20, R8
  14815. DECQ DI
  14816. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
         // Copy 32 bytes ending at offset R8 per iteration while length >= R8.
  14817. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
  14818. MOVOU -32(CX)(R8*1), X4
  14819. MOVOU -16(CX)(R8*1), X5
  14820. MOVOA X4, -32(AX)(R8*1)
  14821. MOVOA X5, -16(AX)(R8*1)
  14822. ADDQ $0x20, R8
  14823. CMPQ BX, R8
  14824. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
         // Store the saved head and tail, then advance the dst pointer.
  14825. MOVOU X0, (AX)
  14826. MOVOU X1, 16(AX)
  14827. MOVOU X2, -32(AX)(BX*1)
  14828. MOVOU X3, -16(AX)(BX*1)
  14829. MOVQ DX, AX
         // Return value = dst write pointer minus dst_base, i.e. bytes written.
  14830. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14831. MOVQ dst_base+0(FP), CX
  14832. SUBQ CX, AX
  14833. MOVQ AX, ret+48(FP)
  14834. RET
  14835. // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
  14836. // Requires: BMI, SSE2
  14837. TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
  14838. MOVQ dst_base+0(FP), AX
  14839. MOVQ $0x000000a0, CX
  14840. LEAQ 24(SP), DX
  14841. PXOR X0, X0
  14842. zero_loop_encodeSnappyBetterBlockAsm10B:
  14843. MOVOU X0, (DX)
  14844. MOVOU X0, 16(DX)
  14845. MOVOU X0, 32(DX)
  14846. MOVOU X0, 48(DX)
  14847. MOVOU X0, 64(DX)
  14848. MOVOU X0, 80(DX)
  14849. MOVOU X0, 96(DX)
  14850. MOVOU X0, 112(DX)
  14851. ADDQ $0x80, DX
  14852. DECQ CX
  14853. JNZ zero_loop_encodeSnappyBetterBlockAsm10B
  14854. MOVL $0x00000000, 12(SP)
  14855. MOVQ src_len+32(FP), CX
  14856. LEAQ -9(CX), DX
  14857. LEAQ -8(CX), BX
  14858. MOVL BX, 8(SP)
  14859. SHRQ $0x05, CX
  14860. SUBL CX, DX
  14861. LEAQ (AX)(DX*1), DX
  14862. MOVQ DX, (SP)
  14863. MOVL $0x00000001, CX
  14864. MOVL $0x00000000, 16(SP)
  14865. MOVQ src_base+24(FP), DX
  14866. search_loop_encodeSnappyBetterBlockAsm10B:
  14867. MOVL CX, BX
  14868. SUBL 12(SP), BX
  14869. SHRL $0x05, BX
  14870. LEAL 1(CX)(BX*1), BX
  14871. CMPL BX, 8(SP)
  14872. JAE emit_remainder_encodeSnappyBetterBlockAsm10B
  14873. MOVQ (DX)(CX*1), SI
  14874. MOVL BX, 20(SP)
  14875. MOVQ $0x0000cf1bbcdcbf9b, R8
  14876. MOVQ $0x9e3779b1, BX
  14877. MOVQ SI, R9
  14878. MOVQ SI, R10
  14879. SHLQ $0x10, R9
  14880. IMULQ R8, R9
  14881. SHRQ $0x34, R9
  14882. SHLQ $0x20, R10
  14883. IMULQ BX, R10
  14884. SHRQ $0x36, R10
  14885. MOVL 24(SP)(R9*4), BX
  14886. MOVL 16408(SP)(R10*4), DI
  14887. MOVL CX, 24(SP)(R9*4)
  14888. MOVL CX, 16408(SP)(R10*4)
  14889. MOVQ (DX)(BX*1), R9
  14890. MOVQ (DX)(DI*1), R10
  14891. CMPQ R9, SI
  14892. JEQ candidate_match_encodeSnappyBetterBlockAsm10B
  14893. CMPQ R10, SI
  14894. JNE no_short_found_encodeSnappyBetterBlockAsm10B
  14895. MOVL DI, BX
  14896. JMP candidate_match_encodeSnappyBetterBlockAsm10B
  14897. no_short_found_encodeSnappyBetterBlockAsm10B:
  14898. CMPL R9, SI
  14899. JEQ candidate_match_encodeSnappyBetterBlockAsm10B
  14900. CMPL R10, SI
  14901. JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
  14902. MOVL 20(SP), CX
  14903. JMP search_loop_encodeSnappyBetterBlockAsm10B
  14904. candidateS_match_encodeSnappyBetterBlockAsm10B:
  14905. SHRQ $0x08, SI
  14906. MOVQ SI, R9
  14907. SHLQ $0x10, R9
  14908. IMULQ R8, R9
  14909. SHRQ $0x34, R9
  14910. MOVL 24(SP)(R9*4), BX
  14911. INCL CX
  14912. MOVL CX, 24(SP)(R9*4)
  14913. CMPL (DX)(BX*1), SI
  14914. JEQ candidate_match_encodeSnappyBetterBlockAsm10B
  14915. DECL CX
  14916. MOVL DI, BX
  14917. candidate_match_encodeSnappyBetterBlockAsm10B:
  14918. MOVL 12(SP), SI
  14919. TESTL BX, BX
  14920. JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
  14921. match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
  14922. CMPL CX, SI
  14923. JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
  14924. MOVB -1(DX)(BX*1), DI
  14925. MOVB -1(DX)(CX*1), R8
  14926. CMPB DI, R8
  14927. JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
  14928. LEAL -1(CX), CX
  14929. DECL BX
  14930. JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
  14931. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
  14932. match_extend_back_end_encodeSnappyBetterBlockAsm10B:
  14933. MOVL CX, SI
  14934. SUBL 12(SP), SI
  14935. LEAQ 3(AX)(SI*1), SI
  14936. CMPQ SI, (SP)
  14937. JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
  14938. MOVQ $0x00000000, ret+48(FP)
  14939. RET
  14940. match_dst_size_check_encodeSnappyBetterBlockAsm10B:
  14941. MOVL CX, SI
  14942. ADDL $0x04, CX
  14943. ADDL $0x04, BX
  14944. MOVQ src_len+32(FP), DI
  14945. SUBL CX, DI
  14946. LEAQ (DX)(CX*1), R8
  14947. LEAQ (DX)(BX*1), R9
  14948. // matchLen
  14949. XORL R11, R11
  14950. matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
  14951. CMPL DI, $0x10
  14952. JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
  14953. MOVQ (R8)(R11*1), R10
  14954. MOVQ 8(R8)(R11*1), R12
  14955. XORQ (R9)(R11*1), R10
  14956. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
  14957. XORQ 8(R9)(R11*1), R12
  14958. JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
  14959. LEAL -16(DI), DI
  14960. LEAL 16(R11), R11
  14961. JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B
  14962. matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
  14963. #ifdef GOAMD64_v3
  14964. TZCNTQ R12, R12
  14965. #else
  14966. BSFQ R12, R12
  14967. #endif
  14968. SARQ $0x03, R12
  14969. LEAL 8(R11)(R12*1), R11
  14970. JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
  14971. matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
  14972. CMPL DI, $0x08
  14973. JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
  14974. MOVQ (R8)(R11*1), R10
  14975. XORQ (R9)(R11*1), R10
  14976. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
  14977. LEAL -8(DI), DI
  14978. LEAL 8(R11), R11
  14979. JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
  14980. matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
  14981. #ifdef GOAMD64_v3
  14982. TZCNTQ R10, R10
  14983. #else
  14984. BSFQ R10, R10
  14985. #endif
  14986. SARQ $0x03, R10
  14987. LEAL (R11)(R10*1), R11
  14988. JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
  14989. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
  14990. CMPL DI, $0x04
  14991. JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
  14992. MOVL (R8)(R11*1), R10
  14993. CMPL (R9)(R11*1), R10
  14994. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
  14995. LEAL -4(DI), DI
  14996. LEAL 4(R11), R11
  14997. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
  14998. CMPL DI, $0x01
  14999. JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
  15000. JB match_nolit_end_encodeSnappyBetterBlockAsm10B
  15001. MOVW (R8)(R11*1), R10
  15002. CMPW (R9)(R11*1), R10
  15003. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
  15004. LEAL 2(R11), R11
  15005. SUBL $0x02, DI
  15006. JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
  15007. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
  15008. MOVB (R8)(R11*1), R10
  15009. CMPB (R9)(R11*1), R10
  15010. JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
  15011. LEAL 1(R11), R11
  15012. match_nolit_end_encodeSnappyBetterBlockAsm10B:
  15013. MOVL CX, DI
  15014. SUBL BX, DI
  15015. // Check if repeat
  15016. MOVL DI, 16(SP)
  15017. MOVL 12(SP), BX
  15018. CMPL BX, SI
  15019. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
  15020. MOVL SI, R8
  15021. MOVL SI, 12(SP)
  15022. LEAQ (DX)(BX*1), R9
  15023. SUBL BX, R8
  15024. LEAL -1(R8), BX
  15025. CMPL BX, $0x3c
  15026. JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
  15027. CMPL BX, $0x00000100
  15028. JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
  15029. JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B
  15030. three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
  15031. MOVB $0xf4, (AX)
  15032. MOVW BX, 1(AX)
  15033. ADDQ $0x03, AX
  15034. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
  15035. two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
  15036. MOVB $0xf0, (AX)
  15037. MOVB BL, 1(AX)
  15038. ADDQ $0x02, AX
  15039. CMPL BX, $0x40
  15040. JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
  15041. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
  15042. one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
  15043. SHLB $0x02, BL
  15044. MOVB BL, (AX)
  15045. ADDQ $0x01, AX
  15046. memmove_match_emit_encodeSnappyBetterBlockAsm10B:
  15047. LEAQ (AX)(R8*1), BX
  15048. // genMemMoveShort
  15049. CMPQ R8, $0x08
  15050. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
  15051. CMPQ R8, $0x10
  15052. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
  15053. CMPQ R8, $0x20
  15054. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
  15055. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
  15056. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
  15057. MOVQ (R9), R10
  15058. MOVQ R10, (AX)
  15059. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  15060. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
  15061. MOVQ (R9), R10
  15062. MOVQ -8(R9)(R8*1), R9
  15063. MOVQ R10, (AX)
  15064. MOVQ R9, -8(AX)(R8*1)
  15065. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  15066. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
  15067. MOVOU (R9), X0
  15068. MOVOU -16(R9)(R8*1), X1
  15069. MOVOU X0, (AX)
  15070. MOVOU X1, -16(AX)(R8*1)
  15071. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  15072. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
  15073. MOVOU (R9), X0
  15074. MOVOU 16(R9), X1
  15075. MOVOU -32(R9)(R8*1), X2
  15076. MOVOU -16(R9)(R8*1), X3
  15077. MOVOU X0, (AX)
  15078. MOVOU X1, 16(AX)
  15079. MOVOU X2, -32(AX)(R8*1)
  15080. MOVOU X3, -16(AX)(R8*1)
  15081. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
  15082. MOVQ BX, AX
  15083. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
  15084. memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
  15085. LEAQ (AX)(R8*1), BX
  15086. // genMemMoveLong
  15087. MOVOU (R9), X0
  15088. MOVOU 16(R9), X1
  15089. MOVOU -32(R9)(R8*1), X2
  15090. MOVOU -16(R9)(R8*1), X3
  15091. MOVQ R8, R12
  15092. SHRQ $0x05, R12
  15093. MOVQ AX, R10
  15094. ANDL $0x0000001f, R10
  15095. MOVQ $0x00000040, R13
  15096. SUBQ R10, R13
  15097. DECQ R12
  15098. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  15099. LEAQ -32(R9)(R13*1), R10
  15100. LEAQ -32(AX)(R13*1), R14
  15101. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
  15102. MOVOU (R10), X4
  15103. MOVOU 16(R10), X5
  15104. MOVOA X4, (R14)
  15105. MOVOA X5, 16(R14)
  15106. ADDQ $0x20, R14
  15107. ADDQ $0x20, R10
  15108. ADDQ $0x20, R13
  15109. DECQ R12
  15110. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
  15111. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
  15112. MOVOU -32(R9)(R13*1), X4
  15113. MOVOU -16(R9)(R13*1), X5
  15114. MOVOA X4, -32(AX)(R13*1)
  15115. MOVOA X5, -16(AX)(R13*1)
  15116. ADDQ $0x20, R13
  15117. CMPQ R8, R13
  15118. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  15119. MOVOU X0, (AX)
  15120. MOVOU X1, 16(AX)
  15121. MOVOU X2, -32(AX)(R8*1)
  15122. MOVOU X3, -16(AX)(R8*1)
  15123. MOVQ BX, AX
  15124. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
  15125. ADDL R11, CX
  15126. ADDL $0x04, R11
  15127. MOVL CX, 12(SP)
  15128. // emitCopy
  15129. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
  15130. CMPL R11, $0x40
  15131. JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
  15132. MOVB $0xee, (AX)
  15133. MOVW DI, 1(AX)
  15134. LEAL -60(R11), R11
  15135. ADDQ $0x03, AX
  15136. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
  15137. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
  15138. MOVL R11, BX
  15139. SHLL $0x02, BX
  15140. CMPL R11, $0x0c
  15141. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
  15142. CMPL DI, $0x00000800
  15143. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
  15144. LEAL -15(BX), BX
  15145. MOVB DI, 1(AX)
  15146. SHRL $0x08, DI
  15147. SHLL $0x05, DI
  15148. ORL DI, BX
  15149. MOVB BL, (AX)
  15150. ADDQ $0x02, AX
  15151. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
  15152. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
  15153. LEAL -2(BX), BX
  15154. MOVB BL, (AX)
  15155. MOVW DI, 1(AX)
  15156. ADDQ $0x03, AX
  15157. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
  15158. CMPL CX, 8(SP)
  15159. JAE emit_remainder_encodeSnappyBetterBlockAsm10B
  15160. CMPQ AX, (SP)
  15161. JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
  15162. MOVQ $0x00000000, ret+48(FP)
  15163. RET
  15164. match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
  15165. MOVQ $0x0000cf1bbcdcbf9b, BX
  15166. MOVQ $0x9e3779b1, DI
  15167. LEAQ 1(SI), SI
  15168. LEAQ -2(CX), R8
  15169. MOVQ (DX)(SI*1), R9
  15170. MOVQ 1(DX)(SI*1), R10
  15171. MOVQ (DX)(R8*1), R11
  15172. MOVQ 1(DX)(R8*1), R12
  15173. SHLQ $0x10, R9
  15174. IMULQ BX, R9
  15175. SHRQ $0x34, R9
  15176. SHLQ $0x20, R10
  15177. IMULQ DI, R10
  15178. SHRQ $0x36, R10
  15179. SHLQ $0x10, R11
  15180. IMULQ BX, R11
  15181. SHRQ $0x34, R11
  15182. SHLQ $0x20, R12
  15183. IMULQ DI, R12
  15184. SHRQ $0x36, R12
  15185. LEAQ 1(SI), DI
  15186. LEAQ 1(R8), R13
  15187. MOVL SI, 24(SP)(R9*4)
  15188. MOVL R8, 24(SP)(R11*4)
  15189. MOVL DI, 16408(SP)(R10*4)
  15190. MOVL R13, 16408(SP)(R12*4)
  15191. LEAQ 1(R8)(SI*1), DI
  15192. SHRQ $0x01, DI
  15193. ADDQ $0x01, SI
  15194. SUBQ $0x01, R8
  15195. index_loop_encodeSnappyBetterBlockAsm10B:
  15196. CMPQ DI, R8
  15197. JAE search_loop_encodeSnappyBetterBlockAsm10B
  15198. MOVQ (DX)(SI*1), R9
  15199. MOVQ (DX)(DI*1), R10
  15200. SHLQ $0x10, R9
  15201. IMULQ BX, R9
  15202. SHRQ $0x34, R9
  15203. SHLQ $0x10, R10
  15204. IMULQ BX, R10
  15205. SHRQ $0x34, R10
  15206. MOVL SI, 24(SP)(R9*4)
  15207. MOVL DI, 24(SP)(R10*4)
  15208. ADDQ $0x02, SI
  15209. ADDQ $0x02, DI
  15210. JMP index_loop_encodeSnappyBetterBlockAsm10B
  15211. emit_remainder_encodeSnappyBetterBlockAsm10B:
  15212. MOVQ src_len+32(FP), CX
  15213. SUBL 12(SP), CX
  15214. LEAQ 3(AX)(CX*1), CX
  15215. CMPQ CX, (SP)
  15216. JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
  15217. MOVQ $0x00000000, ret+48(FP)
  15218. RET
  15219. emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
  15220. MOVQ src_len+32(FP), CX
  15221. MOVL 12(SP), BX
  15222. CMPL BX, CX
  15223. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
  15224. MOVL CX, SI
  15225. MOVL CX, 12(SP)
  15226. LEAQ (DX)(BX*1), CX
  15227. SUBL BX, SI
  15228. LEAL -1(SI), DX
  15229. CMPL DX, $0x3c
  15230. JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
  15231. CMPL DX, $0x00000100
  15232. JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
  15233. JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
  15234. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15235. MOVB $0xf4, (AX)
  15236. MOVW DX, 1(AX)
  15237. ADDQ $0x03, AX
  15238. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
  15239. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15240. MOVB $0xf0, (AX)
  15241. MOVB DL, 1(AX)
  15242. ADDQ $0x02, AX
  15243. CMPL DX, $0x40
  15244. JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
  15245. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
  15246. one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15247. SHLB $0x02, DL
  15248. MOVB DL, (AX)
  15249. ADDQ $0x01, AX
  15250. memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15251. LEAQ (AX)(SI*1), DX
  15252. MOVL SI, BX
  15253. // genMemMoveShort
  15254. CMPQ BX, $0x03
  15255. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
  15256. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
  15257. CMPQ BX, $0x08
  15258. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
  15259. CMPQ BX, $0x10
  15260. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
  15261. CMPQ BX, $0x20
  15262. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
  15263. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
  15264. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
  15265. MOVB (CX), SI
  15266. MOVB -1(CX)(BX*1), CL
  15267. MOVB SI, (AX)
  15268. MOVB CL, -1(AX)(BX*1)
  15269. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  15270. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
  15271. MOVW (CX), SI
  15272. MOVB 2(CX), CL
  15273. MOVW SI, (AX)
  15274. MOVB CL, 2(AX)
  15275. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  15276. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
  15277. MOVL (CX), SI
  15278. MOVL -4(CX)(BX*1), CX
  15279. MOVL SI, (AX)
  15280. MOVL CX, -4(AX)(BX*1)
  15281. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  15282. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
  15283. MOVQ (CX), SI
  15284. MOVQ -8(CX)(BX*1), CX
  15285. MOVQ SI, (AX)
  15286. MOVQ CX, -8(AX)(BX*1)
  15287. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  15288. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
  15289. MOVOU (CX), X0
  15290. MOVOU -16(CX)(BX*1), X1
  15291. MOVOU X0, (AX)
  15292. MOVOU X1, -16(AX)(BX*1)
  15293. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  15294. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
  15295. MOVOU (CX), X0
  15296. MOVOU 16(CX), X1
  15297. MOVOU -32(CX)(BX*1), X2
  15298. MOVOU -16(CX)(BX*1), X3
  15299. MOVOU X0, (AX)
  15300. MOVOU X1, 16(AX)
  15301. MOVOU X2, -32(AX)(BX*1)
  15302. MOVOU X3, -16(AX)(BX*1)
  15303. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15304. MOVQ DX, AX
  15305. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
  15306. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15307. LEAQ (AX)(SI*1), DX
  15308. MOVL SI, BX
  15309. // genMemMoveLong
  15310. MOVOU (CX), X0
  15311. MOVOU 16(CX), X1
  15312. MOVOU -32(CX)(BX*1), X2
  15313. MOVOU -16(CX)(BX*1), X3
  15314. MOVQ BX, DI
  15315. SHRQ $0x05, DI
  15316. MOVQ AX, SI
  15317. ANDL $0x0000001f, SI
  15318. MOVQ $0x00000040, R8
  15319. SUBQ SI, R8
  15320. DECQ DI
  15321. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  15322. LEAQ -32(CX)(R8*1), SI
  15323. LEAQ -32(AX)(R8*1), R9
  15324. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
  15325. MOVOU (SI), X4
  15326. MOVOU 16(SI), X5
  15327. MOVOA X4, (R9)
  15328. MOVOA X5, 16(R9)
  15329. ADDQ $0x20, R9
  15330. ADDQ $0x20, SI
  15331. ADDQ $0x20, R8
  15332. DECQ DI
  15333. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
  15334. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
  15335. MOVOU -32(CX)(R8*1), X4
  15336. MOVOU -16(CX)(R8*1), X5
  15337. MOVOA X4, -32(AX)(R8*1)
  15338. MOVOA X5, -16(AX)(R8*1)
  15339. ADDQ $0x20, R8
  15340. CMPQ BX, R8
  15341. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  15342. MOVOU X0, (AX)
  15343. MOVOU X1, 16(AX)
  15344. MOVOU X2, -32(AX)(BX*1)
  15345. MOVOU X3, -16(AX)(BX*1)
  15346. MOVQ DX, AX
  15347. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
  15348. MOVQ dst_base+0(FP), CX
  15349. SUBQ CX, AX
  15350. MOVQ AX, ret+48(FP)
  15351. RET
  15352. // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
  15353. // Requires: BMI, SSE2
  //
  // Encodes src into dst and returns the number of bytes written to dst, or 0
  // when one of the output-size checks below fails.
  //
  // Stack frame layout (5144 bytes):
  //   0(SP)     8 B     dst limit = dst_base + src_len - 9 - (src_len >> 5)
  //   8(SP)     4 B     search limit = src_len - 8
  //   12(SP)    4 B     src index of the first byte not yet emitted
  //   16(SP)    4 B     offset of the most recent match (written, never read here)
  //   20(SP)    4 B     src index to continue from when the current one has no match
  //   24(SP)    4096 B  table A: 1024 x uint32 src indexes, keyed by a 10-bit hash of 6 bytes
  //   4120(SP)  1024 B  table B: 256 x uint32 src indexes, keyed by an 8-bit hash of 4 bytes
  //
  // Long-lived registers: AX = dst write pointer, DX = src base pointer,
  // CX = current src index. DX and CX are repurposed in the final remainder section.
  15354. TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
  15355. MOVQ dst_base+0(FP), AX // AX = dst write pointer
  15356. MOVQ $0x00000028, CX // 40 iterations x 128 bytes = 5120 bytes (both tables)
  15357. LEAQ 24(SP), DX // DX = start of table A
  15358. PXOR X0, X0 // X0 = 0
  15359. zero_loop_encodeSnappyBetterBlockAsm8B: // clear 128 bytes of table memory per iteration
  15360. MOVOU X0, (DX)
  15361. MOVOU X0, 16(DX)
  15362. MOVOU X0, 32(DX)
  15363. MOVOU X0, 48(DX)
  15364. MOVOU X0, 64(DX)
  15365. MOVOU X0, 80(DX)
  15366. MOVOU X0, 96(DX)
  15367. MOVOU X0, 112(DX)
  15368. ADDQ $0x80, DX
  15369. DECQ CX
  15370. JNZ zero_loop_encodeSnappyBetterBlockAsm8B
  15371. MOVL $0x00000000, 12(SP) // nothing emitted yet: next literal starts at src index 0
  15372. MOVQ src_len+32(FP), CX
  15373. LEAQ -9(CX), DX // DX = src_len - 9
  15374. LEAQ -8(CX), BX // BX = src_len - 8
  15375. MOVL BX, 8(SP) // 8(SP) = search limit
  15376. SHRQ $0x05, CX // CX = src_len >> 5
  15377. SUBL CX, DX // DX = src_len - 9 - (src_len >> 5)
  15378. LEAQ (AX)(DX*1), DX // convert to an absolute dst address
  15379. MOVQ DX, (SP) // (SP) = dst limit; output must stay below it
  15380. MOVL $0x00000001, CX // search starts at src index 1
  15381. MOVL $0x00000000, 16(SP) // last match offset = 0
  15382. MOVQ src_base+24(FP), DX // DX = src base pointer
  15383. search_loop_encodeSnappyBetterBlockAsm8B: // look for a match at src index CX
  15384. MOVL CX, BX
  15385. SUBL 12(SP), BX // BX = bytes since the last emitted position
  15386. SHRL $0x04, BX // step grows by 1 for every 16 unmatched bytes
  15387. LEAL 1(CX)(BX*1), BX // BX = next index to try = CX + 1 + step
  15388. CMPL BX, 8(SP)
  15389. JAE emit_remainder_encodeSnappyBetterBlockAsm8B // next index >= search limit: stop searching
  15390. MOVQ (DX)(CX*1), SI // SI = 8 bytes of src at CX
  15391. MOVL BX, 20(SP) // remember the next index to try
  15392. MOVQ $0x0000cf1bbcdcbf9b, R8 // multiplier for the table A hash
  15393. MOVQ $0x9e3779b1, BX // multiplier for the table B hash
  15394. MOVQ SI, R9
  15395. MOVQ SI, R10
  15396. SHLQ $0x10, R9 // drop the top 2 bytes: hash the low 6 bytes
  15397. IMULQ R8, R9
  15398. SHRQ $0x36, R9 // keep the top 10 bits: table A index
  15399. SHLQ $0x20, R10 // drop the top 4 bytes: hash the low 4 bytes
  15400. IMULQ BX, R10
  15401. SHRQ $0x38, R10 // keep the top 8 bits: table B index
  15402. MOVL 24(SP)(R9*4), BX // BX = candidate index from table A
  15403. MOVL 4120(SP)(R10*4), DI // DI = candidate index from table B
  15404. MOVL CX, 24(SP)(R9*4) // record the current index in both tables
  15405. MOVL CX, 4120(SP)(R10*4)
  15406. MOVQ (DX)(BX*1), R9 // R9 = 8 bytes at the table A candidate
  15407. MOVQ (DX)(DI*1), R10 // R10 = 8 bytes at the table B candidate
  15408. CMPQ R9, SI
  15409. JEQ candidate_match_encodeSnappyBetterBlockAsm8B // all 8 bytes equal at the table A candidate
  15410. CMPQ R10, SI
  15411. JNE no_short_found_encodeSnappyBetterBlockAsm8B
  15412. MOVL DI, BX // all 8 bytes equal at the table B candidate: use it
  15413. JMP candidate_match_encodeSnappyBetterBlockAsm8B
  15414. no_short_found_encodeSnappyBetterBlockAsm8B: // no 8-byte match; fall back to 4-byte comparisons
  15415. CMPL R9, SI
  15416. JEQ candidate_match_encodeSnappyBetterBlockAsm8B // low 4 bytes equal at the table A candidate
  15417. CMPL R10, SI
  15418. JEQ candidateS_match_encodeSnappyBetterBlockAsm8B // low 4 bytes equal at the table B candidate
  15419. MOVL 20(SP), CX // no match: advance to the saved next index
  15420. JMP search_loop_encodeSnappyBetterBlockAsm8B
  15421. candidateS_match_encodeSnappyBetterBlockAsm8B: // 4-byte match via table B; first try table A at CX+1
  15422. SHRQ $0x08, SI // SI = the 7 src bytes starting at CX+1
  15423. MOVQ SI, R9
  15424. SHLQ $0x10, R9
  15425. IMULQ R8, R9
  15426. SHRQ $0x36, R9 // table A index for position CX+1
  15427. MOVL 24(SP)(R9*4), BX // BX = table A candidate for CX+1
  15428. INCL CX
  15429. MOVL CX, 24(SP)(R9*4) // record CX+1 in table A
  15430. CMPL (DX)(BX*1), SI
  15431. JEQ candidate_match_encodeSnappyBetterBlockAsm8B // 4 bytes equal: match at CX+1 against BX
  15432. DECL CX // otherwise restore CX ...
  15433. MOVL DI, BX // ... and use the table B candidate
  15434. candidate_match_encodeSnappyBetterBlockAsm8B: // here CX = match position, BX = candidate position
  15435. MOVL 12(SP), SI // SI = first unemitted src index (lower bound for CX)
  15436. TESTL BX, BX
  15437. JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B // candidate at index 0 cannot move backwards
  15438. match_extend_back_loop_encodeSnappyBetterBlockAsm8B: // grow the match backwards while preceding bytes agree
  15439. CMPL CX, SI
  15440. JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B // do not go before the unemitted start
  15441. MOVB -1(DX)(BX*1), DI
  15442. MOVB -1(DX)(CX*1), R8
  15443. CMPB DI, R8
  15444. JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
  15445. LEAL -1(CX), CX
  15446. DECL BX
  15447. JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B // candidate reached index 0
  15448. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
  15449. match_extend_back_end_encodeSnappyBetterBlockAsm8B: // check that pending literals fit in dst
  15450. MOVL CX, SI
  15451. SUBL 12(SP), SI // SI = number of pending literal bytes
  15452. LEAQ 3(AX)(SI*1), SI // dst pointer after those bytes plus 3 more
  15453. CMPQ SI, (SP)
  15454. JB match_dst_size_check_encodeSnappyBetterBlockAsm8B
  15455. MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
  15456. RET
  15457. match_dst_size_check_encodeSnappyBetterBlockAsm8B:
  15458. MOVL CX, SI // SI = match start (end of pending literals)
  15459. ADDL $0x04, CX // skip 4 bytes on both sides before measuring the match
  15460. ADDL $0x04, BX
  15461. MOVQ src_len+32(FP), DI
  15462. SUBL CX, DI // DI = bytes remaining in src from CX
  15463. LEAQ (DX)(CX*1), R8 // R8 = pointer to src[CX]
  15464. LEAQ (DX)(BX*1), R9 // R9 = pointer to src[BX]
  15465. // matchLen: count equal leading bytes of R8 and R9 into R11, bounded by DI
  15466. XORL R11, R11 // R11 = matched length so far
  15467. matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: // compare 16 bytes per iteration
  15468. CMPL DI, $0x10
  15469. JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
  15470. MOVQ (R8)(R11*1), R10
  15471. MOVQ 8(R8)(R11*1), R12
  15472. XORQ (R9)(R11*1), R10 // non-zero result marks differing bits in the first 8 bytes
  15473. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
  15474. XORQ 8(R9)(R11*1), R12 // same for the second 8 bytes
  15475. JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
  15476. LEAL -16(DI), DI
  15477. LEAL 16(R11), R11
  15478. JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B
  15479. matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: // mismatch inside the second 8 bytes
  15480. #ifdef GOAMD64_v3
  15481. TZCNTQ R12, R12 // index of the lowest differing bit
  15482. #else
  15483. BSFQ R12, R12 // index of the lowest differing bit
  15484. #endif
  15485. SARQ $0x03, R12 // bit index -> byte index
  15486. LEAL 8(R11)(R12*1), R11 // add the 8 equal bytes before it
  15487. JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
  15488. matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: // fewer than 16 bytes left: try 8
  15489. CMPL DI, $0x08
  15490. JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
  15491. MOVQ (R8)(R11*1), R10
  15492. XORQ (R9)(R11*1), R10
  15493. JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
  15494. LEAL -8(DI), DI
  15495. LEAL 8(R11), R11
  15496. JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
  15497. matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: // mismatch inside the 8 bytes held in R10
  15498. #ifdef GOAMD64_v3
  15499. TZCNTQ R10, R10 // index of the lowest differing bit
  15500. #else
  15501. BSFQ R10, R10 // index of the lowest differing bit
  15502. #endif
  15503. SARQ $0x03, R10 // bit index -> byte index
  15504. LEAL (R11)(R10*1), R11
  15505. JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
  15506. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: // fewer than 8 bytes left: try 4
  15507. CMPL DI, $0x04
  15508. JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
  15509. MOVL (R8)(R11*1), R10
  15510. CMPL (R9)(R11*1), R10
  15511. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
  15512. LEAL -4(DI), DI
  15513. LEAL 4(R11), R11
  15514. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: // try 2 bytes, or go straight to 1 / done
  15515. CMPL DI, $0x01
  15516. JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B // exactly one byte left
  15517. JB match_nolit_end_encodeSnappyBetterBlockAsm8B // no bytes left
  15518. MOVW (R8)(R11*1), R10
  15519. CMPW (R9)(R11*1), R10
  15520. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
  15521. LEAL 2(R11), R11
  15522. SUBL $0x02, DI
  15523. JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
  15524. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: // compare a single byte
  15525. MOVB (R8)(R11*1), R10
  15526. CMPB (R9)(R11*1), R10
  15527. JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
  15528. LEAL 1(R11), R11
  15529. match_nolit_end_encodeSnappyBetterBlockAsm8B: // R11 = equal bytes found beyond the 4 skipped above
  15530. MOVL CX, DI
  15531. SUBL BX, DI // DI = match offset (distance back to the candidate)
  15532. // Check if repeat (this variant performs no check; it only records the offset)
  15533. MOVL DI, 16(SP)
  15534. MOVL 12(SP), BX // BX = first unemitted src index
  15535. CMPL BX, SI
  15536. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B // no literal bytes before the match
  15537. MOVL SI, R8
  15538. MOVL SI, 12(SP) // literals up to the match start are now accounted for
  15539. LEAQ (DX)(BX*1), R9 // R9 = pointer to the first literal byte
  15540. SUBL BX, R8 // R8 = literal length
  15541. LEAL -1(R8), BX // BX = literal length - 1, the value stored in the header
  15542. CMPL BX, $0x3c
  15543. JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B // length-1 < 60: single header byte
  15544. CMPL BX, $0x00000100
  15545. JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B // length-1 < 256: header byte + 1 length byte
  15546. JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B // never taken (flags unchanged); target is the next line anyway
  15547. three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: // header 0xf4 followed by 16-bit length-1
  15548. MOVB $0xf4, (AX)
  15549. MOVW BX, 1(AX)
  15550. ADDQ $0x03, AX
  15551. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
  15552. two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: // header 0xf0 followed by 8-bit length-1
  15553. MOVB $0xf0, (AX)
  15554. MOVB BL, 1(AX)
  15555. ADDQ $0x02, AX
  15556. CMPL BX, $0x40
  15557. JB memmove_match_emit_encodeSnappyBetterBlockAsm8B // up to 64 bytes: short copy
  15558. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
  15559. one_byte_match_emit_encodeSnappyBetterBlockAsm8B: // header = (length-1) << 2
  15560. SHLB $0x02, BL
  15561. MOVB BL, (AX)
  15562. ADDQ $0x01, AX
  15563. memmove_match_emit_encodeSnappyBetterBlockAsm8B: // copy R8 (<= 64) literal bytes from R9 to AX
  15564. LEAQ (AX)(R8*1), BX // BX = dst pointer after the literal bytes
  15565. // genMemMoveShort: pick a copy routine by length; larger sizes use overlapping head/tail moves
  15566. CMPQ R8, $0x08
  15567. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
  15568. CMPQ R8, $0x10
  15569. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
  15570. CMPQ R8, $0x20
  15571. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
  15572. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
  15573. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: // length <= 8: always moves a full 8 bytes
  15574. MOVQ (R9), R10 // NOTE(review): reads/writes 8 bytes even for shorter literals; presumably safe due to slack in src/dst - confirm
  15575. MOVQ R10, (AX)
  15576. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  15577. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
  15578. MOVQ (R9), R10
  15579. MOVQ -8(R9)(R8*1), R9
  15580. MOVQ R10, (AX)
  15581. MOVQ R9, -8(AX)(R8*1)
  15582. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  15583. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
  15584. MOVOU (R9), X0
  15585. MOVOU -16(R9)(R8*1), X1
  15586. MOVOU X0, (AX)
  15587. MOVOU X1, -16(AX)(R8*1)
  15588. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  15589. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
  15590. MOVOU (R9), X0
  15591. MOVOU 16(R9), X1
  15592. MOVOU -32(R9)(R8*1), X2
  15593. MOVOU -16(R9)(R8*1), X3
  15594. MOVOU X0, (AX)
  15595. MOVOU X1, 16(AX)
  15596. MOVOU X2, -32(AX)(R8*1)
  15597. MOVOU X3, -16(AX)(R8*1)
  15598. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
  15599. MOVQ BX, AX // advance dst by exactly the literal length
  15600. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
  15601. memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: // copy R8 (>= 65 on this path) literal bytes from R9 to AX
  15602. LEAQ (AX)(R8*1), BX // BX = dst pointer after the literal bytes
  15603. // genMemMoveLong: save head and tail, copy the middle with aligned stores, then write head and tail
  15604. MOVOU (R9), X0 // X0:X1 = first 32 source bytes
  15605. MOVOU 16(R9), X1
  15606. MOVOU -32(R9)(R8*1), X2 // X2:X3 = last 32 source bytes
  15607. MOVOU -16(R9)(R8*1), X3
  15608. MOVQ R8, R12
  15609. SHRQ $0x05, R12 // R12 = length / 32
  15610. MOVQ AX, R10
  15611. ANDL $0x0000001f, R10 // R10 = dst address mod 32
  15612. MOVQ $0x00000040, R13
  15613. SUBQ R10, R13 // R13 = 64 - (dst mod 32), so AX+R13 is a multiple of 32
  15614. DECQ R12
  15615. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): with length >= 65, R12 is non-zero after DECQ, so this looks always taken - confirm
  15616. LEAQ -32(R9)(R13*1), R10
  15617. LEAQ -32(AX)(R13*1), R14
  15618. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: // 32 bytes per pass via running pointers R10 (src) / R14 (dst)
  15619. MOVOU (R10), X4
  15620. MOVOU 16(R10), X5
  15621. MOVOA X4, (R14)
  15622. MOVOA X5, 16(R14)
  15623. ADDQ $0x20, R14
  15624. ADDQ $0x20, R10
  15625. ADDQ $0x20, R13
  15626. DECQ R12
  15627. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
  15628. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: // copy 32 bytes ending at offset R13
  15629. MOVOU -32(R9)(R13*1), X4
  15630. MOVOU -16(R9)(R13*1), X5
  15631. MOVOA X4, -32(AX)(R13*1) // aligned store: AX+R13 is a multiple of 32
  15632. MOVOA X5, -16(AX)(R13*1)
  15633. ADDQ $0x20, R13
  15634. CMPQ R8, R13
  15635. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // continue while length >= R13
  15636. MOVOU X0, (AX) // write the saved first 32 bytes (covers the unaligned start)
  15637. MOVOU X1, 16(AX)
  15638. MOVOU X2, -32(AX)(R8*1) // write the saved last 32 bytes (covers the tail)
  15639. MOVOU X3, -16(AX)(R8*1)
  15640. MOVQ BX, AX // advance dst by the literal length
  15641. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: // literals are written; now emit the match
  15642. ADDL R11, CX // CX = src index just past the match
  15643. ADDL $0x04, R11 // R11 = full match length, including the 4 bytes skipped before matchLen
  15644. MOVL CX, 12(SP) // everything up to CX is now emitted
  15645. // emitCopy: encode a copy of R11 bytes at offset DI
  15646. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: // while more than 64 bytes remain, emit a 60-byte chunk
  15647. CMPL R11, $0x40
  15648. JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
  15649. MOVB $0xee, (AX) // fixed header byte used for each 60-byte chunk
  15650. MOVW DI, 1(AX) // 16-bit offset
  15651. LEAL -60(R11), R11 // 60 bytes of the match are now encoded
  15652. ADDQ $0x03, AX
  15653. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
  15654. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: // 64 bytes or fewer remain
  15655. MOVL R11, BX
  15656. SHLL $0x02, BX // BX = length * 4
  15657. CMPL R11, $0x0c
  15658. JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B // length >= 12: three-byte form
  15659. LEAL -15(BX), BX // two-byte form: header starts as length*4 - 15
  15660. MOVB DI, 1(AX) // second byte = low 8 bits of the offset
  15661. SHRL $0x08, DI
  15662. SHLL $0x05, DI // remaining offset bits move to header bits 5 and up
  15663. ORL DI, BX
  15664. MOVB BL, (AX)
  15665. ADDQ $0x02, AX
  15666. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
  15667. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: // three-byte form: header = length*4 - 2, then 16-bit offset
  15668. LEAL -2(BX), BX
  15669. MOVB BL, (AX)
  15670. MOVW DI, 1(AX)
  15671. ADDQ $0x03, AX
  15672. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
  15673. CMPL CX, 8(SP)
  15674. JAE emit_remainder_encodeSnappyBetterBlockAsm8B // reached the search limit: flush the rest as literals
  15675. CMPQ AX, (SP)
  15676. JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
  15677. MOVQ $0x00000000, ret+48(FP) // dst pointer reached the limit: return 0
  15678. RET
  15679. match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: // add positions inside the match to the tables
  15680. MOVQ $0x0000cf1bbcdcbf9b, BX // table A multiplier
  15681. MOVQ $0x9e3779b1, DI // table B multiplier
  15682. LEAQ 1(SI), SI // SI = match start + 1
  15683. LEAQ -2(CX), R8 // R8 = match end - 2
  15684. MOVQ (DX)(SI*1), R9 // bytes at SI (table A key)
  15685. MOVQ 1(DX)(SI*1), R10 // bytes at SI+1 (table B key)
  15686. MOVQ (DX)(R8*1), R11 // bytes at R8 (table A key)
  15687. MOVQ 1(DX)(R8*1), R12 // bytes at R8+1 (table B key)
  15688. SHLQ $0x10, R9
  15689. IMULQ BX, R9
  15690. SHRQ $0x36, R9
  15691. SHLQ $0x20, R10
  15692. IMULQ DI, R10
  15693. SHRQ $0x38, R10
  15694. SHLQ $0x10, R11
  15695. IMULQ BX, R11
  15696. SHRQ $0x36, R11
  15697. SHLQ $0x20, R12
  15698. IMULQ DI, R12
  15699. SHRQ $0x38, R12
  15700. LEAQ 1(SI), DI // DI = SI + 1 (multiplier in DI is no longer needed)
  15701. LEAQ 1(R8), R13 // R13 = R8 + 1
  15702. MOVL SI, 24(SP)(R9*4) // table A entries for SI and R8
  15703. MOVL R8, 24(SP)(R11*4)
  15704. MOVL DI, 4120(SP)(R10*4) // table B entries for SI+1 and R8+1
  15705. MOVL R13, 4120(SP)(R12*4)
  15706. LEAQ 1(R8)(SI*1), DI
  15707. SHRQ $0x01, DI // DI = (SI + R8 + 1) / 2, the midpoint
  15708. ADDQ $0x01, SI
  15709. SUBQ $0x01, R8
  15710. index_loop_encodeSnappyBetterBlockAsm8B: // fill table A from SI and from the midpoint DI, 2 positions per step each
  15711. CMPQ DI, R8
  15712. JAE search_loop_encodeSnappyBetterBlockAsm8B // midpoint cursor reached the end: resume searching at CX
  15713. MOVQ (DX)(SI*1), R9
  15714. MOVQ (DX)(DI*1), R10
  15715. SHLQ $0x10, R9
  15716. IMULQ BX, R9
  15717. SHRQ $0x36, R9
  15718. SHLQ $0x10, R10
  15719. IMULQ BX, R10
  15720. SHRQ $0x36, R10
  15721. MOVL SI, 24(SP)(R9*4)
  15722. MOVL DI, 24(SP)(R10*4)
  15723. ADDQ $0x02, SI
  15724. ADDQ $0x02, DI
  15725. JMP index_loop_encodeSnappyBetterBlockAsm8B
  15726. emit_remainder_encodeSnappyBetterBlockAsm8B: // emit all remaining src bytes as one literal
  15727. MOVQ src_len+32(FP), CX
  15728. SUBL 12(SP), CX // CX = number of unemitted bytes
  15729. LEAQ 3(AX)(CX*1), CX // dst pointer after those bytes plus 3 more
  15730. CMPQ CX, (SP)
  15731. JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B
  15732. MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
  15733. RET
  15734. emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
  15735. MOVQ src_len+32(FP), CX
  15736. MOVL 12(SP), BX // BX = first unemitted src index
  15737. CMPL BX, CX
  15738. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B // nothing left to emit
  15739. MOVL CX, SI
  15740. MOVL CX, 12(SP)
  15741. LEAQ (DX)(BX*1), CX // CX = pointer to the first remaining byte
  15742. SUBL BX, SI // SI = literal length
  15743. LEAL -1(SI), DX // DX = literal length - 1 (src base is no longer needed)
  15744. CMPL DX, $0x3c
  15745. JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B // length-1 < 60: single header byte
  15746. CMPL DX, $0x00000100
  15747. JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B // length-1 < 256: header byte + 1 length byte
  15748. JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B // never taken (flags unchanged); target is the next line anyway
  15749. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: // header 0xf4 followed by 16-bit length-1
  15750. MOVB $0xf4, (AX)
  15751. MOVW DX, 1(AX)
  15752. ADDQ $0x03, AX
  15753. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
  15754. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: // header 0xf0 followed by 8-bit length-1
  15755. MOVB $0xf0, (AX)
  15756. MOVB DL, 1(AX)
  15757. ADDQ $0x02, AX
  15758. CMPL DX, $0x40
  15759. JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B // up to 64 bytes: short copy
  15760. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
  15761. one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: // header = (length-1) << 2
  15762. SHLB $0x02, DL
  15763. MOVB DL, (AX)
  15764. ADDQ $0x01, AX
  15765. memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: // copy SI (<= 64) bytes from CX to AX
  15766. LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
  15767. MOVL SI, BX // BX = length
  15768. // genMemMoveShort: exact-length copy using overlapping head/tail moves
  15769. CMPQ BX, $0x03
  15770. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
  15771. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
  15772. CMPQ BX, $0x08
  15773. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
  15774. CMPQ BX, $0x10
  15775. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
  15776. CMPQ BX, $0x20
  15777. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
  15778. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
  15779. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: // first byte + last byte (same byte when length is 1)
  15780. MOVB (CX), SI
  15781. MOVB -1(CX)(BX*1), CL
  15782. MOVB SI, (AX)
  15783. MOVB CL, -1(AX)(BX*1)
  15784. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15785. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: // 2 bytes + 1 byte
  15786. MOVW (CX), SI
  15787. MOVB 2(CX), CL
  15788. MOVW SI, (AX)
  15789. MOVB CL, 2(AX)
  15790. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15791. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: // first 4 + last 4 bytes (may overlap)
  15792. MOVL (CX), SI
  15793. MOVL -4(CX)(BX*1), CX
  15794. MOVL SI, (AX)
  15795. MOVL CX, -4(AX)(BX*1)
  15796. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15797. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
  15798. MOVQ (CX), SI
  15799. MOVQ -8(CX)(BX*1), CX
  15800. MOVQ SI, (AX)
  15801. MOVQ CX, -8(AX)(BX*1)
  15802. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15803. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
  15804. MOVOU (CX), X0
  15805. MOVOU -16(CX)(BX*1), X1
  15806. MOVOU X0, (AX)
  15807. MOVOU X1, -16(AX)(BX*1)
  15808. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15809. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
  15810. MOVOU (CX), X0
  15811. MOVOU 16(CX), X1
  15812. MOVOU -32(CX)(BX*1), X2
  15813. MOVOU -16(CX)(BX*1), X3
  15814. MOVOU X0, (AX)
  15815. MOVOU X1, 16(AX)
  15816. MOVOU X2, -32(AX)(BX*1)
  15817. MOVOU X3, -16(AX)(BX*1)
  15818. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15819. MOVQ DX, AX // advance dst by the literal length
  15820. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
  15821. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: // copy SI (>= 65 on this path) bytes from CX to AX
  15822. LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
  15823. MOVL SI, BX // BX = length
  15824. // genMemMoveLong: save head and tail, copy the middle with aligned stores, then write head and tail
  15825. MOVOU (CX), X0 // X0:X1 = first 32 source bytes
  15826. MOVOU 16(CX), X1
  15827. MOVOU -32(CX)(BX*1), X2 // X2:X3 = last 32 source bytes
  15828. MOVOU -16(CX)(BX*1), X3
  15829. MOVQ BX, DI
  15830. SHRQ $0x05, DI // DI = length / 32
  15831. MOVQ AX, SI
  15832. ANDL $0x0000001f, SI // SI = dst address mod 32
  15833. MOVQ $0x00000040, R8
  15834. SUBQ SI, R8 // R8 = 64 - (dst mod 32), so AX+R8 is a multiple of 32
  15835. DECQ DI
  15836. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): with length >= 65, DI is non-zero after DECQ, so this looks always taken - confirm
  15837. LEAQ -32(CX)(R8*1), SI
  15838. LEAQ -32(AX)(R8*1), R9
  15839. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: // 32 bytes per pass via running pointers SI (src) / R9 (dst)
  15840. MOVOU (SI), X4
  15841. MOVOU 16(SI), X5
  15842. MOVOA X4, (R9)
  15843. MOVOA X5, 16(R9)
  15844. ADDQ $0x20, R9
  15845. ADDQ $0x20, SI
  15846. ADDQ $0x20, R8
  15847. DECQ DI
  15848. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
  15849. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: // copy 32 bytes ending at offset R8
  15850. MOVOU -32(CX)(R8*1), X4
  15851. MOVOU -16(CX)(R8*1), X5
  15852. MOVOA X4, -32(AX)(R8*1) // aligned store: AX+R8 is a multiple of 32
  15853. MOVOA X5, -16(AX)(R8*1)
  15854. ADDQ $0x20, R8
  15855. CMPQ BX, R8
  15856. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 // continue while length >= R8
  15857. MOVOU X0, (AX) // write the saved first 32 bytes (covers the unaligned start)
  15858. MOVOU X1, 16(AX)
  15859. MOVOU X2, -32(AX)(BX*1) // write the saved last 32 bytes (covers the tail)
  15860. MOVOU X3, -16(AX)(BX*1)
  15861. MOVQ DX, AX // advance dst by the literal length
  15862. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: // return the number of bytes written
  15863. MOVQ dst_base+0(FP), CX
  15864. SUBQ CX, AX // AX = dst write pointer - dst base
  15865. MOVQ AX, ret+48(FP)
  15866. RET
  15867. // func calcBlockSize(src []byte) int
  15868. // Requires: BMI, SSE2
  15869. TEXT ·calcBlockSize(SB), $32792-32
  15870. XORQ AX, AX
  15871. MOVQ $0x00000100, CX
  15872. LEAQ 24(SP), DX
  15873. PXOR X0, X0
  15874. zero_loop_calcBlockSize:
  15875. MOVOU X0, (DX)
  15876. MOVOU X0, 16(DX)
  15877. MOVOU X0, 32(DX)
  15878. MOVOU X0, 48(DX)
  15879. MOVOU X0, 64(DX)
  15880. MOVOU X0, 80(DX)
  15881. MOVOU X0, 96(DX)
  15882. MOVOU X0, 112(DX)
  15883. ADDQ $0x80, DX
  15884. DECQ CX
  15885. JNZ zero_loop_calcBlockSize
  15886. MOVL $0x00000000, 12(SP)
  15887. MOVQ src_len+8(FP), CX
  15888. LEAQ -9(CX), DX
  15889. LEAQ -8(CX), BX
  15890. MOVL BX, 8(SP)
  15891. SHRQ $0x05, CX
  15892. SUBL CX, DX
  15893. LEAQ (AX)(DX*1), DX
  15894. MOVQ DX, (SP)
  15895. MOVL $0x00000001, CX
  15896. MOVL CX, 16(SP)
  15897. MOVQ src_base+0(FP), DX
  15898. search_loop_calcBlockSize:
  15899. MOVL CX, BX
  15900. SUBL 12(SP), BX
  15901. SHRL $0x05, BX
  15902. LEAL 4(CX)(BX*1), BX
  15903. CMPL BX, 8(SP)
  15904. JAE emit_remainder_calcBlockSize
  15905. MOVQ (DX)(CX*1), SI
  15906. MOVL BX, 20(SP)
  15907. MOVQ $0x0000cf1bbcdcbf9b, R8
  15908. MOVQ SI, R9
  15909. MOVQ SI, R10
  15910. SHRQ $0x08, R10
  15911. SHLQ $0x10, R9
  15912. IMULQ R8, R9
  15913. SHRQ $0x33, R9
  15914. SHLQ $0x10, R10
  15915. IMULQ R8, R10
  15916. SHRQ $0x33, R10
  15917. MOVL 24(SP)(R9*4), BX
  15918. MOVL 24(SP)(R10*4), DI
  15919. MOVL CX, 24(SP)(R9*4)
  15920. LEAL 1(CX), R9
  15921. MOVL R9, 24(SP)(R10*4)
  15922. MOVQ SI, R9
  15923. SHRQ $0x10, R9
  15924. SHLQ $0x10, R9
  15925. IMULQ R8, R9
  15926. SHRQ $0x33, R9
  15927. MOVL CX, R8
  15928. SUBL 16(SP), R8
  15929. MOVL 1(DX)(R8*1), R10
  15930. MOVQ SI, R8
  15931. SHRQ $0x08, R8
  15932. CMPL R8, R10
  15933. JNE no_repeat_found_calcBlockSize
  15934. LEAL 1(CX), SI
  15935. MOVL 12(SP), BX
  15936. MOVL SI, DI
  15937. SUBL 16(SP), DI
  15938. JZ repeat_extend_back_end_calcBlockSize
  15939. repeat_extend_back_loop_calcBlockSize:
  15940. CMPL SI, BX
  15941. JBE repeat_extend_back_end_calcBlockSize
  15942. MOVB -1(DX)(DI*1), R8
  15943. MOVB -1(DX)(SI*1), R9
  15944. CMPB R8, R9
  15945. JNE repeat_extend_back_end_calcBlockSize
  15946. LEAL -1(SI), SI
  15947. DECL DI
  15948. JNZ repeat_extend_back_loop_calcBlockSize
  15949. repeat_extend_back_end_calcBlockSize:
  15950. MOVL 12(SP), BX
  15951. CMPL BX, SI
  15952. JEQ emit_literal_done_repeat_emit_calcBlockSize
  15953. MOVL SI, DI
  15954. MOVL SI, 12(SP)
  15955. LEAQ (DX)(BX*1), R8
  15956. SUBL BX, DI
  15957. LEAL -1(DI), BX
  15958. CMPL BX, $0x3c
  15959. JB one_byte_repeat_emit_calcBlockSize
  15960. CMPL BX, $0x00000100
  15961. JB two_bytes_repeat_emit_calcBlockSize
  15962. CMPL BX, $0x00010000
  15963. JB three_bytes_repeat_emit_calcBlockSize
  15964. CMPL BX, $0x01000000
  15965. JB four_bytes_repeat_emit_calcBlockSize
  15966. ADDQ $0x05, AX
  15967. JMP memmove_long_repeat_emit_calcBlockSize
  15968. four_bytes_repeat_emit_calcBlockSize:
  15969. ADDQ $0x04, AX
  15970. JMP memmove_long_repeat_emit_calcBlockSize
  15971. three_bytes_repeat_emit_calcBlockSize:
  15972. ADDQ $0x03, AX
  15973. JMP memmove_long_repeat_emit_calcBlockSize
  15974. two_bytes_repeat_emit_calcBlockSize:
  15975. ADDQ $0x02, AX
  15976. CMPL BX, $0x40
  15977. JB memmove_repeat_emit_calcBlockSize
  15978. JMP memmove_long_repeat_emit_calcBlockSize
  15979. one_byte_repeat_emit_calcBlockSize:
  15980. ADDQ $0x01, AX
  15981. memmove_repeat_emit_calcBlockSize:
  15982. LEAQ (AX)(DI*1), AX
  15983. JMP emit_literal_done_repeat_emit_calcBlockSize
  15984. memmove_long_repeat_emit_calcBlockSize:
  15985. LEAQ (AX)(DI*1), AX
  15986. emit_literal_done_repeat_emit_calcBlockSize:
  15987. ADDL $0x05, CX
  15988. MOVL CX, BX
  15989. SUBL 16(SP), BX
  15990. MOVQ src_len+8(FP), DI
  15991. SUBL CX, DI
  15992. LEAQ (DX)(CX*1), R8
  15993. LEAQ (DX)(BX*1), BX
  15994. // matchLen
  15995. XORL R10, R10
  15996. matchlen_loopback_16_repeat_extend_calcBlockSize:
  15997. CMPL DI, $0x10
  15998. JB matchlen_match8_repeat_extend_calcBlockSize
  15999. MOVQ (R8)(R10*1), R9
  16000. MOVQ 8(R8)(R10*1), R11
  16001. XORQ (BX)(R10*1), R9
  16002. JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
  16003. XORQ 8(BX)(R10*1), R11
  16004. JNZ matchlen_bsf_16repeat_extend_calcBlockSize
  16005. LEAL -16(DI), DI
  16006. LEAL 16(R10), R10
  16007. JMP matchlen_loopback_16_repeat_extend_calcBlockSize
  16008. matchlen_bsf_16repeat_extend_calcBlockSize:
  16009. #ifdef GOAMD64_v3
  16010. TZCNTQ R11, R11
  16011. #else
  16012. BSFQ R11, R11
  16013. #endif
  16014. SARQ $0x03, R11
  16015. LEAL 8(R10)(R11*1), R10
  16016. JMP repeat_extend_forward_end_calcBlockSize
  16017. matchlen_match8_repeat_extend_calcBlockSize:
  16018. CMPL DI, $0x08
  16019. JB matchlen_match4_repeat_extend_calcBlockSize
  16020. MOVQ (R8)(R10*1), R9
  16021. XORQ (BX)(R10*1), R9
  16022. JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
  16023. LEAL -8(DI), DI
  16024. LEAL 8(R10), R10
  16025. JMP matchlen_match4_repeat_extend_calcBlockSize
  16026. matchlen_bsf_8_repeat_extend_calcBlockSize:
  16027. #ifdef GOAMD64_v3
  16028. TZCNTQ R9, R9
  16029. #else
  16030. BSFQ R9, R9
  16031. #endif
  16032. SARQ $0x03, R9
  16033. LEAL (R10)(R9*1), R10
  16034. JMP repeat_extend_forward_end_calcBlockSize
  16035. matchlen_match4_repeat_extend_calcBlockSize:
  16036. CMPL DI, $0x04
  16037. JB matchlen_match2_repeat_extend_calcBlockSize
  16038. MOVL (R8)(R10*1), R9
  16039. CMPL (BX)(R10*1), R9
  16040. JNE matchlen_match2_repeat_extend_calcBlockSize
  16041. LEAL -4(DI), DI
  16042. LEAL 4(R10), R10
  16043. matchlen_match2_repeat_extend_calcBlockSize:
  16044. CMPL DI, $0x01
  16045. JE matchlen_match1_repeat_extend_calcBlockSize
  16046. JB repeat_extend_forward_end_calcBlockSize
  16047. MOVW (R8)(R10*1), R9
  16048. CMPW (BX)(R10*1), R9
  16049. JNE matchlen_match1_repeat_extend_calcBlockSize
  16050. LEAL 2(R10), R10
  16051. SUBL $0x02, DI
  16052. JZ repeat_extend_forward_end_calcBlockSize
  16053. matchlen_match1_repeat_extend_calcBlockSize:
  16054. MOVB (R8)(R10*1), R9
  16055. CMPB (BX)(R10*1), R9
  16056. JNE repeat_extend_forward_end_calcBlockSize
  16057. LEAL 1(R10), R10
  16058. repeat_extend_forward_end_calcBlockSize:
  16059. ADDL R10, CX
  16060. MOVL CX, BX
  16061. SUBL SI, BX
  16062. MOVL 16(SP), SI
  16063. // emitCopy
  16064. CMPL SI, $0x00010000
  16065. JB two_byte_offset_repeat_as_copy_calcBlockSize
  16066. four_bytes_loop_back_repeat_as_copy_calcBlockSize:
  16067. CMPL BX, $0x40
  16068. JBE four_bytes_remain_repeat_as_copy_calcBlockSize
  16069. LEAL -64(BX), BX
  16070. ADDQ $0x05, AX
  16071. CMPL BX, $0x04
  16072. JB four_bytes_remain_repeat_as_copy_calcBlockSize
  16073. JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
  16074. four_bytes_remain_repeat_as_copy_calcBlockSize:
  16075. TESTL BX, BX
  16076. JZ repeat_end_emit_calcBlockSize
  16077. XORL BX, BX
  16078. ADDQ $0x05, AX
  16079. JMP repeat_end_emit_calcBlockSize
  16080. two_byte_offset_repeat_as_copy_calcBlockSize:
  16081. CMPL BX, $0x40
  16082. JBE two_byte_offset_short_repeat_as_copy_calcBlockSize
  16083. LEAL -60(BX), BX
  16084. ADDQ $0x03, AX
  16085. JMP two_byte_offset_repeat_as_copy_calcBlockSize
  16086. two_byte_offset_short_repeat_as_copy_calcBlockSize:
  16087. MOVL BX, DI
  16088. SHLL $0x02, DI
  16089. CMPL BX, $0x0c
  16090. JAE emit_copy_three_repeat_as_copy_calcBlockSize
  16091. CMPL SI, $0x00000800
  16092. JAE emit_copy_three_repeat_as_copy_calcBlockSize
  16093. ADDQ $0x02, AX
  16094. JMP repeat_end_emit_calcBlockSize
  16095. emit_copy_three_repeat_as_copy_calcBlockSize:
  16096. ADDQ $0x03, AX
  16097. repeat_end_emit_calcBlockSize:
  16098. MOVL CX, 12(SP)
  16099. JMP search_loop_calcBlockSize
  16100. no_repeat_found_calcBlockSize:
  16101. CMPL (DX)(BX*1), SI
  16102. JEQ candidate_match_calcBlockSize
  16103. SHRQ $0x08, SI
  16104. MOVL 24(SP)(R9*4), BX
  16105. LEAL 2(CX), R8
  16106. CMPL (DX)(DI*1), SI
  16107. JEQ candidate2_match_calcBlockSize
  16108. MOVL R8, 24(SP)(R9*4)
  16109. SHRQ $0x08, SI
  16110. CMPL (DX)(BX*1), SI
  16111. JEQ candidate3_match_calcBlockSize
  16112. MOVL 20(SP), CX
  16113. JMP search_loop_calcBlockSize
  16114. candidate3_match_calcBlockSize:
  16115. ADDL $0x02, CX
  16116. JMP candidate_match_calcBlockSize
  16117. candidate2_match_calcBlockSize:
  16118. MOVL R8, 24(SP)(R9*4)
  16119. INCL CX
  16120. MOVL DI, BX
  16121. candidate_match_calcBlockSize:
  16122. MOVL 12(SP), SI
  16123. TESTL BX, BX
  16124. JZ match_extend_back_end_calcBlockSize
  16125. match_extend_back_loop_calcBlockSize:
  16126. CMPL CX, SI
  16127. JBE match_extend_back_end_calcBlockSize
  16128. MOVB -1(DX)(BX*1), DI
  16129. MOVB -1(DX)(CX*1), R8
  16130. CMPB DI, R8
  16131. JNE match_extend_back_end_calcBlockSize
  16132. LEAL -1(CX), CX
  16133. DECL BX
  16134. JZ match_extend_back_end_calcBlockSize
  16135. JMP match_extend_back_loop_calcBlockSize
  16136. match_extend_back_end_calcBlockSize:
  16137. MOVL CX, SI
  16138. SUBL 12(SP), SI
  16139. LEAQ 5(AX)(SI*1), SI
  16140. CMPQ SI, (SP)
  16141. JB match_dst_size_check_calcBlockSize
  16142. MOVQ $0x00000000, ret+24(FP)
  16143. RET
  16144. match_dst_size_check_calcBlockSize:
  16145. MOVL CX, SI
  16146. MOVL 12(SP), DI
  16147. CMPL DI, SI
  16148. JEQ emit_literal_done_match_emit_calcBlockSize
  16149. MOVL SI, R8
  16150. MOVL SI, 12(SP)
  16151. LEAQ (DX)(DI*1), SI
  16152. SUBL DI, R8
  16153. LEAL -1(R8), SI
  16154. CMPL SI, $0x3c
  16155. JB one_byte_match_emit_calcBlockSize
  16156. CMPL SI, $0x00000100
  16157. JB two_bytes_match_emit_calcBlockSize
  16158. CMPL SI, $0x00010000
  16159. JB three_bytes_match_emit_calcBlockSize
  16160. CMPL SI, $0x01000000
  16161. JB four_bytes_match_emit_calcBlockSize
  16162. ADDQ $0x05, AX
  16163. JMP memmove_long_match_emit_calcBlockSize
  16164. four_bytes_match_emit_calcBlockSize:
  16165. ADDQ $0x04, AX
  16166. JMP memmove_long_match_emit_calcBlockSize
  16167. three_bytes_match_emit_calcBlockSize:
  16168. ADDQ $0x03, AX
  16169. JMP memmove_long_match_emit_calcBlockSize
  16170. two_bytes_match_emit_calcBlockSize:
  16171. ADDQ $0x02, AX
  16172. CMPL SI, $0x40
  16173. JB memmove_match_emit_calcBlockSize
  16174. JMP memmove_long_match_emit_calcBlockSize
  16175. one_byte_match_emit_calcBlockSize:
  16176. ADDQ $0x01, AX
  16177. memmove_match_emit_calcBlockSize:
  16178. LEAQ (AX)(R8*1), AX
  16179. JMP emit_literal_done_match_emit_calcBlockSize
  16180. memmove_long_match_emit_calcBlockSize:
  16181. LEAQ (AX)(R8*1), AX
  16182. emit_literal_done_match_emit_calcBlockSize:
  16183. match_nolit_loop_calcBlockSize:
  16184. MOVL CX, SI
  16185. SUBL BX, SI
  16186. MOVL SI, 16(SP)
  16187. ADDL $0x04, CX
  16188. ADDL $0x04, BX
  16189. MOVQ src_len+8(FP), SI
  16190. SUBL CX, SI
  16191. LEAQ (DX)(CX*1), DI
  16192. LEAQ (DX)(BX*1), BX
  16193. // matchLen
  16194. XORL R9, R9
  16195. matchlen_loopback_16_match_nolit_calcBlockSize:
  16196. CMPL SI, $0x10
  16197. JB matchlen_match8_match_nolit_calcBlockSize
  16198. MOVQ (DI)(R9*1), R8
  16199. MOVQ 8(DI)(R9*1), R10
  16200. XORQ (BX)(R9*1), R8
  16201. JNZ matchlen_bsf_8_match_nolit_calcBlockSize
  16202. XORQ 8(BX)(R9*1), R10
  16203. JNZ matchlen_bsf_16match_nolit_calcBlockSize
  16204. LEAL -16(SI), SI
  16205. LEAL 16(R9), R9
  16206. JMP matchlen_loopback_16_match_nolit_calcBlockSize
  16207. matchlen_bsf_16match_nolit_calcBlockSize:
  16208. #ifdef GOAMD64_v3
  16209. TZCNTQ R10, R10
  16210. #else
  16211. BSFQ R10, R10
  16212. #endif
  16213. SARQ $0x03, R10
  16214. LEAL 8(R9)(R10*1), R9
  16215. JMP match_nolit_end_calcBlockSize
  16216. matchlen_match8_match_nolit_calcBlockSize:
  16217. CMPL SI, $0x08
  16218. JB matchlen_match4_match_nolit_calcBlockSize
  16219. MOVQ (DI)(R9*1), R8
  16220. XORQ (BX)(R9*1), R8
  16221. JNZ matchlen_bsf_8_match_nolit_calcBlockSize
  16222. LEAL -8(SI), SI
  16223. LEAL 8(R9), R9
  16224. JMP matchlen_match4_match_nolit_calcBlockSize
  16225. matchlen_bsf_8_match_nolit_calcBlockSize:
  16226. #ifdef GOAMD64_v3
  16227. TZCNTQ R8, R8
  16228. #else
  16229. BSFQ R8, R8
  16230. #endif
  16231. SARQ $0x03, R8
  16232. LEAL (R9)(R8*1), R9
  16233. JMP match_nolit_end_calcBlockSize
  16234. matchlen_match4_match_nolit_calcBlockSize:
  16235. CMPL SI, $0x04
  16236. JB matchlen_match2_match_nolit_calcBlockSize
  16237. MOVL (DI)(R9*1), R8
  16238. CMPL (BX)(R9*1), R8
  16239. JNE matchlen_match2_match_nolit_calcBlockSize
  16240. LEAL -4(SI), SI
  16241. LEAL 4(R9), R9
  16242. matchlen_match2_match_nolit_calcBlockSize:
  16243. CMPL SI, $0x01
  16244. JE matchlen_match1_match_nolit_calcBlockSize
  16245. JB match_nolit_end_calcBlockSize
  16246. MOVW (DI)(R9*1), R8
  16247. CMPW (BX)(R9*1), R8
  16248. JNE matchlen_match1_match_nolit_calcBlockSize
  16249. LEAL 2(R9), R9
  16250. SUBL $0x02, SI
  16251. JZ match_nolit_end_calcBlockSize
  16252. matchlen_match1_match_nolit_calcBlockSize:
  16253. MOVB (DI)(R9*1), R8
  16254. CMPB (BX)(R9*1), R8
  16255. JNE match_nolit_end_calcBlockSize
  16256. LEAL 1(R9), R9
  16257. match_nolit_end_calcBlockSize:
  16258. ADDL R9, CX
  16259. MOVL 16(SP), BX
  16260. ADDL $0x04, R9
  16261. MOVL CX, 12(SP)
  16262. // emitCopy
  16263. CMPL BX, $0x00010000
  16264. JB two_byte_offset_match_nolit_calcBlockSize
  16265. four_bytes_loop_back_match_nolit_calcBlockSize:
  16266. CMPL R9, $0x40
  16267. JBE four_bytes_remain_match_nolit_calcBlockSize
  16268. LEAL -64(R9), R9
  16269. ADDQ $0x05, AX
  16270. CMPL R9, $0x04
  16271. JB four_bytes_remain_match_nolit_calcBlockSize
  16272. JMP four_bytes_loop_back_match_nolit_calcBlockSize
  16273. four_bytes_remain_match_nolit_calcBlockSize:
  16274. TESTL R9, R9
  16275. JZ match_nolit_emitcopy_end_calcBlockSize
  16276. XORL BX, BX
  16277. ADDQ $0x05, AX
  16278. JMP match_nolit_emitcopy_end_calcBlockSize
  16279. two_byte_offset_match_nolit_calcBlockSize:
  16280. CMPL R9, $0x40
  16281. JBE two_byte_offset_short_match_nolit_calcBlockSize
  16282. LEAL -60(R9), R9
  16283. ADDQ $0x03, AX
  16284. JMP two_byte_offset_match_nolit_calcBlockSize
  16285. two_byte_offset_short_match_nolit_calcBlockSize:
  16286. MOVL R9, SI
  16287. SHLL $0x02, SI
  16288. CMPL R9, $0x0c
  16289. JAE emit_copy_three_match_nolit_calcBlockSize
  16290. CMPL BX, $0x00000800
  16291. JAE emit_copy_three_match_nolit_calcBlockSize
  16292. ADDQ $0x02, AX
  16293. JMP match_nolit_emitcopy_end_calcBlockSize
  16294. emit_copy_three_match_nolit_calcBlockSize:
  16295. ADDQ $0x03, AX
  16296. match_nolit_emitcopy_end_calcBlockSize:
  16297. CMPL CX, 8(SP)
  16298. JAE emit_remainder_calcBlockSize
  16299. MOVQ -2(DX)(CX*1), SI
  16300. CMPQ AX, (SP)
  16301. JB match_nolit_dst_ok_calcBlockSize
  16302. MOVQ $0x00000000, ret+24(FP)
  16303. RET
  16304. match_nolit_dst_ok_calcBlockSize:
  16305. MOVQ $0x0000cf1bbcdcbf9b, R8
  16306. MOVQ SI, DI
  16307. SHRQ $0x10, SI
  16308. MOVQ SI, BX
  16309. SHLQ $0x10, DI
  16310. IMULQ R8, DI
  16311. SHRQ $0x33, DI
  16312. SHLQ $0x10, BX
  16313. IMULQ R8, BX
  16314. SHRQ $0x33, BX
  16315. LEAL -2(CX), R8
  16316. LEAQ 24(SP)(BX*4), R9
  16317. MOVL (R9), BX
  16318. MOVL R8, 24(SP)(DI*4)
  16319. MOVL CX, (R9)
  16320. CMPL (DX)(BX*1), SI
  16321. JEQ match_nolit_loop_calcBlockSize
  16322. INCL CX
  16323. JMP search_loop_calcBlockSize
  16324. emit_remainder_calcBlockSize:
  16325. MOVQ src_len+8(FP), CX
  16326. SUBL 12(SP), CX
  16327. LEAQ 5(AX)(CX*1), CX
  16328. CMPQ CX, (SP)
  16329. JB emit_remainder_ok_calcBlockSize
  16330. MOVQ $0x00000000, ret+24(FP)
  16331. RET
  16332. emit_remainder_ok_calcBlockSize:
  16333. MOVQ src_len+8(FP), CX
  16334. MOVL 12(SP), BX
  16335. CMPL BX, CX
  16336. JEQ emit_literal_done_emit_remainder_calcBlockSize
  16337. MOVL CX, SI
  16338. MOVL CX, 12(SP)
  16339. LEAQ (DX)(BX*1), CX
  16340. SUBL BX, SI
  16341. LEAL -1(SI), CX
  16342. CMPL CX, $0x3c
  16343. JB one_byte_emit_remainder_calcBlockSize
  16344. CMPL CX, $0x00000100
  16345. JB two_bytes_emit_remainder_calcBlockSize
  16346. CMPL CX, $0x00010000
  16347. JB three_bytes_emit_remainder_calcBlockSize
  16348. CMPL CX, $0x01000000
  16349. JB four_bytes_emit_remainder_calcBlockSize
  16350. ADDQ $0x05, AX
  16351. JMP memmove_long_emit_remainder_calcBlockSize
  16352. four_bytes_emit_remainder_calcBlockSize:
  16353. ADDQ $0x04, AX
  16354. JMP memmove_long_emit_remainder_calcBlockSize
  16355. three_bytes_emit_remainder_calcBlockSize:
  16356. ADDQ $0x03, AX
  16357. JMP memmove_long_emit_remainder_calcBlockSize
  16358. two_bytes_emit_remainder_calcBlockSize:
  16359. ADDQ $0x02, AX
  16360. CMPL CX, $0x40
  16361. JB memmove_emit_remainder_calcBlockSize
  16362. JMP memmove_long_emit_remainder_calcBlockSize
  16363. one_byte_emit_remainder_calcBlockSize:
  16364. ADDQ $0x01, AX
  16365. memmove_emit_remainder_calcBlockSize:
  16366. LEAQ (AX)(SI*1), AX
  16367. JMP emit_literal_done_emit_remainder_calcBlockSize
  16368. memmove_long_emit_remainder_calcBlockSize:
  16369. LEAQ (AX)(SI*1), AX
  16370. emit_literal_done_emit_remainder_calcBlockSize:
  16371. MOVQ AX, ret+24(FP)
  16372. RET
  16373. // func calcBlockSizeSmall(src []byte) int
  16374. // Requires: BMI, SSE2
  //
  // Scans src for matches using a 512-entry hash table kept on the stack and
  // adds up, in AX, the number of bytes the literal headers, literal bytes and
  // copy elements would occupy. Nothing is written to an output buffer; the
  // only stores are to the stack frame and to the return slot.
  //
  // Returns the accumulated total, or 0 as soon as the running estimate is not
  // below the limit src_len - 9 - src_len/32 that is stored at (SP).
  //
  // Stack frame:
  //    0(SP)  size limit (8 bytes)
  //    8(SP)  search limit, src_len - 8
  //   12(SP)  position where the not-yet-counted literal run starts
  //   16(SP)  offset of the most recent match (tested first as a repeat)
  //   20(SP)  position to resume at when no candidate matches
  //   24(SP)  hash table: 512 entries x 4 bytes, each a source position
  //
  // Register roles that hold across the main loop:
  //   AX = running size total, CX = current source position, DX = src base.
  16375. TEXT ·calcBlockSizeSmall(SB), $2072-32
  // Running total starts at zero.
  16376. XORQ AX, AX
  // Clear the hash table: 16 iterations x 128 bytes = 2048 bytes from 24(SP).
  16377. MOVQ $0x00000010, CX
  16378. LEAQ 24(SP), DX
  16379. PXOR X0, X0
  16380. zero_loop_calcBlockSizeSmall:
  16381. MOVOU X0, (DX)
  16382. MOVOU X0, 16(DX)
  16383. MOVOU X0, 32(DX)
  16384. MOVOU X0, 48(DX)
  16385. MOVOU X0, 64(DX)
  16386. MOVOU X0, 80(DX)
  16387. MOVOU X0, 96(DX)
  16388. MOVOU X0, 112(DX)
  16389. ADDQ $0x80, DX
  16390. DECQ CX
  16391. JNZ zero_loop_calcBlockSizeSmall
  // Pending literal run starts at position 0.
  16392. MOVL $0x00000000, 12(SP)
  // 8(SP) = src_len - 8; (SP) = AX + src_len - 9 - src_len/32 (AX is 0 here).
  16393. MOVQ src_len+8(FP), CX
  16394. LEAQ -9(CX), DX
  16395. LEAQ -8(CX), BX
  16396. MOVL BX, 8(SP)
  16397. SHRQ $0x05, CX
  16398. SUBL CX, DX
  16399. LEAQ (AX)(DX*1), DX
  16400. MOVQ DX, (SP)
  // Start searching at position 1 with an initial repeat offset of 1.
  16401. MOVL $0x00000001, CX
  16402. MOVL CX, 16(SP)
  16403. MOVQ src_base+0(FP), DX
  16404. search_loop_calcBlockSizeSmall:
  // Next position = CX + 4 + (CX - literal start)/16, so the stride grows
  // while no match is found. Stop searching once it reaches the search limit.
  16405. MOVL CX, BX
  16406. SUBL 12(SP), BX
  16407. SHRL $0x04, BX
  16408. LEAL 4(CX)(BX*1), BX
  16409. CMPL BX, 8(SP)
  16410. JAE emit_remainder_calcBlockSizeSmall
  // SI = 8 source bytes at CX; remember the fallback next position.
  16411. MOVQ (DX)(CX*1), SI
  16412. MOVL BX, 20(SP)
  // Hash the 4 bytes at CX (R9) and at CX+1 (R10): shifting left by 32 drops
  // the upper four bytes, and the top 9 bits of the product index the table.
  16413. MOVQ $0x9e3779b1, R8
  16414. MOVQ SI, R9
  16415. MOVQ SI, R10
  16416. SHRQ $0x08, R10
  16417. SHLQ $0x20, R9
  16418. IMULQ R8, R9
  16419. SHRQ $0x37, R9
  16420. SHLQ $0x20, R10
  16421. IMULQ R8, R10
  16422. SHRQ $0x37, R10
  // Fetch both candidates (BX, DI), then record CX and CX+1 in their slots.
  16423. MOVL 24(SP)(R9*4), BX
  16424. MOVL 24(SP)(R10*4), DI
  16425. MOVL CX, 24(SP)(R9*4)
  16426. LEAL 1(CX), R9
  16427. MOVL R9, 24(SP)(R10*4)
  // R9 = hash of the 4 bytes at CX+2, used later for the third candidate.
  16428. MOVQ SI, R9
  16429. SHRQ $0x10, R9
  16430. SHLQ $0x20, R9
  16431. IMULQ R8, R9
  16432. SHRQ $0x37, R9
  // Repeat check: compare the 4 bytes at CX+1 with the 4 bytes one repeat
  // offset earlier.
  16433. MOVL CX, R8
  16434. SUBL 16(SP), R8
  16435. MOVL 1(DX)(R8*1), R10
  16436. MOVQ SI, R8
  16437. SHRQ $0x08, R8
  16438. CMPL R8, R10
  16439. JNE no_repeat_found_calcBlockSizeSmall
  // Repeat found. SI = match start (CX+1), DI = matching earlier position,
  // BX = start of the pending literal run.
  16440. LEAL 1(CX), SI
  16441. MOVL 12(SP), BX
  16442. MOVL SI, DI
  16443. SUBL 16(SP), DI
  16444. JZ repeat_extend_back_end_calcBlockSizeSmall
  // Extend the match backwards while the preceding bytes agree, without going
  // below the literal start or below position 0 of the earlier copy.
  16445. repeat_extend_back_loop_calcBlockSizeSmall:
  16446. CMPL SI, BX
  16447. JBE repeat_extend_back_end_calcBlockSizeSmall
  16448. MOVB -1(DX)(DI*1), R8
  16449. MOVB -1(DX)(SI*1), R9
  16450. CMPB R8, R9
  16451. JNE repeat_extend_back_end_calcBlockSizeSmall
  16452. LEAL -1(SI), SI
  16453. DECL DI
  16454. JNZ repeat_extend_back_loop_calcBlockSizeSmall
  16455. repeat_extend_back_end_calcBlockSizeSmall:
  // Account for the literal run [12(SP), SI); skipped when it is empty.
  16456. MOVL 12(SP), BX
  16457. CMPL BX, SI
  16458. JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall
  // DI = literal length, BX = length - 1. R8 (address of the run) is not read
  // again before it is overwritten.
  16459. MOVL SI, DI
  16460. MOVL SI, 12(SP)
  16461. LEAQ (DX)(BX*1), R8
  16462. SUBL BX, DI
  16463. LEAL -1(DI), BX
  // Header size: 1 byte when length-1 < 60, 2 bytes when < 256, otherwise 3.
  16464. CMPL BX, $0x3c
  16465. JB one_byte_repeat_emit_calcBlockSizeSmall
  16466. CMPL BX, $0x00000100
  16467. JB two_bytes_repeat_emit_calcBlockSizeSmall
  // This JB targets the very next label, so control reaches it either way.
  16468. JB three_bytes_repeat_emit_calcBlockSizeSmall
  16469. three_bytes_repeat_emit_calcBlockSizeSmall:
  16470. ADDQ $0x03, AX
  16471. JMP memmove_long_repeat_emit_calcBlockSizeSmall
  16472. two_bytes_repeat_emit_calcBlockSizeSmall:
  16473. ADDQ $0x02, AX
  16474. CMPL BX, $0x40
  16475. JB memmove_repeat_emit_calcBlockSizeSmall
  16476. JMP memmove_long_repeat_emit_calcBlockSizeSmall
  16477. one_byte_repeat_emit_calcBlockSizeSmall:
  16478. ADDQ $0x01, AX
  // Both "memmove" labels only add the literal length to the total here.
  16479. memmove_repeat_emit_calcBlockSizeSmall:
  16480. LEAQ (AX)(DI*1), AX
  16481. JMP emit_literal_done_repeat_emit_calcBlockSizeSmall
  16482. memmove_long_repeat_emit_calcBlockSizeSmall:
  16483. LEAQ (AX)(DI*1), AX
  16484. emit_literal_done_repeat_emit_calcBlockSizeSmall:
  // Skip the 4 verified bytes (CX+1..CX+4) and extend forward from CX+5.
  // DI = bytes left in src, R8 = &src[CX], BX = &src[CX - repeat offset].
  16485. ADDL $0x05, CX
  16486. MOVL CX, BX
  16487. SUBL 16(SP), BX
  16488. MOVQ src_len+8(FP), DI
  16489. SUBL CX, DI
  16490. LEAQ (DX)(CX*1), R8
  16491. LEAQ (DX)(BX*1), BX
  16492. // matchLen
  // R10 counts equal bytes. Compare 16 bytes per iteration while at least 16
  // remain, then fall through to 8-, 4-, 2- and 1-byte steps.
  16493. XORL R10, R10
  16494. matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
  16495. CMPL DI, $0x10
  16496. JB matchlen_match8_repeat_extend_calcBlockSizeSmall
  16497. MOVQ (R8)(R10*1), R9
  16498. MOVQ 8(R8)(R10*1), R11
  16499. XORQ (BX)(R10*1), R9
  16500. JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
  16501. XORQ 8(BX)(R10*1), R11
  16502. JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall
  16503. LEAL -16(DI), DI
  16504. LEAL 16(R10), R10
  16505. JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall
  16506. matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
  // Lowest set bit of the XOR difference, divided by 8, is the number of
  // equal bytes in the second quadword; add 8 for the first quadword.
  16507. #ifdef GOAMD64_v3
  16508. TZCNTQ R11, R11
  16509. #else
  16510. BSFQ R11, R11
  16511. #endif
  16512. SARQ $0x03, R11
  16513. LEAL 8(R10)(R11*1), R10
  16514. JMP repeat_extend_forward_end_calcBlockSizeSmall
  16515. matchlen_match8_repeat_extend_calcBlockSizeSmall:
  16516. CMPL DI, $0x08
  16517. JB matchlen_match4_repeat_extend_calcBlockSizeSmall
  16518. MOVQ (R8)(R10*1), R9
  16519. XORQ (BX)(R10*1), R9
  16520. JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
  16521. LEAL -8(DI), DI
  16522. LEAL 8(R10), R10
  16523. JMP matchlen_match4_repeat_extend_calcBlockSizeSmall
  16524. matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
  // Same bit-index-to-byte-count conversion for a difference in one quadword.
  16525. #ifdef GOAMD64_v3
  16526. TZCNTQ R9, R9
  16527. #else
  16528. BSFQ R9, R9
  16529. #endif
  16530. SARQ $0x03, R9
  16531. LEAL (R10)(R9*1), R10
  16532. JMP repeat_extend_forward_end_calcBlockSizeSmall
  16533. matchlen_match4_repeat_extend_calcBlockSizeSmall:
  16534. CMPL DI, $0x04
  16535. JB matchlen_match2_repeat_extend_calcBlockSizeSmall
  16536. MOVL (R8)(R10*1), R9
  16537. CMPL (BX)(R10*1), R9
  16538. JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
  16539. LEAL -4(DI), DI
  16540. LEAL 4(R10), R10
  16541. matchlen_match2_repeat_extend_calcBlockSizeSmall:
  // Exactly 1 byte left: try it alone; 0 bytes left: done; otherwise try 2.
  16542. CMPL DI, $0x01
  16543. JE matchlen_match1_repeat_extend_calcBlockSizeSmall
  16544. JB repeat_extend_forward_end_calcBlockSizeSmall
  16545. MOVW (R8)(R10*1), R9
  16546. CMPW (BX)(R10*1), R9
  16547. JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
  16548. LEAL 2(R10), R10
  16549. SUBL $0x02, DI
  16550. JZ repeat_extend_forward_end_calcBlockSizeSmall
  16551. matchlen_match1_repeat_extend_calcBlockSizeSmall:
  16552. MOVB (R8)(R10*1), R9
  16553. CMPB (BX)(R10*1), R9
  16554. JNE repeat_extend_forward_end_calcBlockSizeSmall
  16555. LEAL 1(R10), R10
  16556. repeat_extend_forward_end_calcBlockSizeSmall:
  // CX = end of match; BX = total match length measured from the
  // back-extended start in SI.
  16557. ADDL R10, CX
  16558. MOVL CX, BX
  16559. SUBL SI, BX
  16560. MOVL 16(SP), SI
  16561. // emitCopy
  // While more than 64 bytes remain, count 3 bytes and consume 60 of them.
  16562. two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
  16563. CMPL BX, $0x40
  16564. JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
  16565. LEAL -60(BX), BX
  16566. ADDQ $0x03, AX
  16567. JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall
  16568. two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
  // The value built in SI is not read before SI is overwritten. The final
  // element counts 3 bytes when the length is >= 12, otherwise 2.
  16569. MOVL BX, SI
  16570. SHLL $0x02, SI
  16571. CMPL BX, $0x0c
  16572. JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall
  16573. ADDQ $0x02, AX
  16574. JMP repeat_end_emit_calcBlockSizeSmall
  16575. emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
  16576. ADDQ $0x03, AX
  16577. repeat_end_emit_calcBlockSizeSmall:
  // Everything up to CX is accounted for; continue searching from CX.
  16578. MOVL CX, 12(SP)
  16579. JMP search_loop_calcBlockSizeSmall
  16580. no_repeat_found_calcBlockSizeSmall:
  // Candidate 1: position BX against the 4 bytes at CX.
  16581. CMPL (DX)(BX*1), SI
  16582. JEQ candidate_match_calcBlockSizeSmall
  // Candidate 2: position DI against the 4 bytes at CX+1. BX is reloaded with
  // the table entry for the CX+2 hash, and R8 = CX+2 is what gets stored there.
  16583. SHRQ $0x08, SI
  16584. MOVL 24(SP)(R9*4), BX
  16585. LEAL 2(CX), R8
  16586. CMPL (DX)(DI*1), SI
  16587. JEQ candidate2_match_calcBlockSizeSmall
  // Candidate 3: the reloaded BX against the 4 bytes at CX+2.
  16588. MOVL R8, 24(SP)(R9*4)
  16589. SHRQ $0x08, SI
  16590. CMPL (DX)(BX*1), SI
  16591. JEQ candidate3_match_calcBlockSizeSmall
  // No candidate matched: jump ahead to the precomputed next position.
  16592. MOVL 20(SP), CX
  16593. JMP search_loop_calcBlockSizeSmall
  16594. candidate3_match_calcBlockSizeSmall:
  16595. ADDL $0x02, CX
  16596. JMP candidate_match_calcBlockSizeSmall
  16597. candidate2_match_calcBlockSizeSmall:
  16598. MOVL R8, 24(SP)(R9*4)
  16599. INCL CX
  16600. MOVL DI, BX
  16601. candidate_match_calcBlockSizeSmall:
  // CX = match position, BX = candidate position. Extend backwards while the
  // preceding bytes agree, bounded by the literal start (SI) and by BX == 0.
  16602. MOVL 12(SP), SI
  16603. TESTL BX, BX
  16604. JZ match_extend_back_end_calcBlockSizeSmall
  16605. match_extend_back_loop_calcBlockSizeSmall:
  16606. CMPL CX, SI
  16607. JBE match_extend_back_end_calcBlockSizeSmall
  16608. MOVB -1(DX)(BX*1), DI
  16609. MOVB -1(DX)(CX*1), R8
  16610. CMPB DI, R8
  16611. JNE match_extend_back_end_calcBlockSizeSmall
  16612. LEAL -1(CX), CX
  16613. DECL BX
  16614. JZ match_extend_back_end_calcBlockSizeSmall
  16615. JMP match_extend_back_loop_calcBlockSizeSmall
  16616. match_extend_back_end_calcBlockSizeSmall:
  // Give up with 0 unless total + pending literal length + 3 is below the
  // limit at (SP).
  16617. MOVL CX, SI
  16618. SUBL 12(SP), SI
  16619. LEAQ 3(AX)(SI*1), SI
  16620. CMPQ SI, (SP)
  16621. JB match_dst_size_check_calcBlockSizeSmall
  16622. MOVQ $0x00000000, ret+24(FP)
  16623. RET
  16624. match_dst_size_check_calcBlockSizeSmall:
  // Account for the literal run [12(SP), CX); skipped when it is empty.
  // R8 = literal length, SI = length - 1 (the LEAQ result in SI is replaced
  // two instructions later without being read).
  16625. MOVL CX, SI
  16626. MOVL 12(SP), DI
  16627. CMPL DI, SI
  16628. JEQ emit_literal_done_match_emit_calcBlockSizeSmall
  16629. MOVL SI, R8
  16630. MOVL SI, 12(SP)
  16631. LEAQ (DX)(DI*1), SI
  16632. SUBL DI, R8
  16633. LEAL -1(R8), SI
  // Header size: 1 byte when length-1 < 60, 2 bytes when < 256, otherwise 3.
  16634. CMPL SI, $0x3c
  16635. JB one_byte_match_emit_calcBlockSizeSmall
  16636. CMPL SI, $0x00000100
  16637. JB two_bytes_match_emit_calcBlockSizeSmall
  // This JB targets the very next label, so control reaches it either way.
  16638. JB three_bytes_match_emit_calcBlockSizeSmall
  16639. three_bytes_match_emit_calcBlockSizeSmall:
  16640. ADDQ $0x03, AX
  16641. JMP memmove_long_match_emit_calcBlockSizeSmall
  16642. two_bytes_match_emit_calcBlockSizeSmall:
  16643. ADDQ $0x02, AX
  16644. CMPL SI, $0x40
  16645. JB memmove_match_emit_calcBlockSizeSmall
  16646. JMP memmove_long_match_emit_calcBlockSizeSmall
  16647. one_byte_match_emit_calcBlockSizeSmall:
  16648. ADDQ $0x01, AX
  // Both "memmove" labels only add the literal length to the total here.
  16649. memmove_match_emit_calcBlockSizeSmall:
  16650. LEAQ (AX)(R8*1), AX
  16651. JMP emit_literal_done_match_emit_calcBlockSizeSmall
  16652. memmove_long_match_emit_calcBlockSizeSmall:
  16653. LEAQ (AX)(R8*1), AX
  16654. emit_literal_done_match_emit_calcBlockSizeSmall:
  16655. match_nolit_loop_calcBlockSizeSmall:
  // Record the match offset (CX - BX) as the new repeat offset, skip the 4
  // bytes already known to match, then measure how far the match continues.
  // SI = bytes left in src, DI = &src[CX], BX = &src[candidate].
  16656. MOVL CX, SI
  16657. SUBL BX, SI
  16658. MOVL SI, 16(SP)
  16659. ADDL $0x04, CX
  16660. ADDL $0x04, BX
  16661. MOVQ src_len+8(FP), SI
  16662. SUBL CX, SI
  16663. LEAQ (DX)(CX*1), DI
  16664. LEAQ (DX)(BX*1), BX
  16665. // matchLen
  // R9 counts equal bytes, using the same 16/8/4/2/1-byte ladder as the
  // repeat path above.
  16666. XORL R9, R9
  16667. matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
  16668. CMPL SI, $0x10
  16669. JB matchlen_match8_match_nolit_calcBlockSizeSmall
  16670. MOVQ (DI)(R9*1), R8
  16671. MOVQ 8(DI)(R9*1), R10
  16672. XORQ (BX)(R9*1), R8
  16673. JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
  16674. XORQ 8(BX)(R9*1), R10
  16675. JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall
  16676. LEAL -16(SI), SI
  16677. LEAL 16(R9), R9
  16678. JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall
  16679. matchlen_bsf_16match_nolit_calcBlockSizeSmall:
  // Bit index of the first difference / 8 = equal bytes in the second
  // quadword; add 8 for the first quadword.
  16680. #ifdef GOAMD64_v3
  16681. TZCNTQ R10, R10
  16682. #else
  16683. BSFQ R10, R10
  16684. #endif
  16685. SARQ $0x03, R10
  16686. LEAL 8(R9)(R10*1), R9
  16687. JMP match_nolit_end_calcBlockSizeSmall
  16688. matchlen_match8_match_nolit_calcBlockSizeSmall:
  16689. CMPL SI, $0x08
  16690. JB matchlen_match4_match_nolit_calcBlockSizeSmall
  16691. MOVQ (DI)(R9*1), R8
  16692. XORQ (BX)(R9*1), R8
  16693. JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
  16694. LEAL -8(SI), SI
  16695. LEAL 8(R9), R9
  16696. JMP matchlen_match4_match_nolit_calcBlockSizeSmall
  16697. matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
  // Same conversion for a difference found in a single quadword.
  16698. #ifdef GOAMD64_v3
  16699. TZCNTQ R8, R8
  16700. #else
  16701. BSFQ R8, R8
  16702. #endif
  16703. SARQ $0x03, R8
  16704. LEAL (R9)(R8*1), R9
  16705. JMP match_nolit_end_calcBlockSizeSmall
  16706. matchlen_match4_match_nolit_calcBlockSizeSmall:
  16707. CMPL SI, $0x04
  16708. JB matchlen_match2_match_nolit_calcBlockSizeSmall
  16709. MOVL (DI)(R9*1), R8
  16710. CMPL (BX)(R9*1), R8
  16711. JNE matchlen_match2_match_nolit_calcBlockSizeSmall
  16712. LEAL -4(SI), SI
  16713. LEAL 4(R9), R9
  16714. matchlen_match2_match_nolit_calcBlockSizeSmall:
  // Exactly 1 byte left: try it alone; 0 bytes left: done; otherwise try 2.
  16715. CMPL SI, $0x01
  16716. JE matchlen_match1_match_nolit_calcBlockSizeSmall
  16717. JB match_nolit_end_calcBlockSizeSmall
  16718. MOVW (DI)(R9*1), R8
  16719. CMPW (BX)(R9*1), R8
  16720. JNE matchlen_match1_match_nolit_calcBlockSizeSmall
  16721. LEAL 2(R9), R9
  16722. SUBL $0x02, SI
  16723. JZ match_nolit_end_calcBlockSizeSmall
  16724. matchlen_match1_match_nolit_calcBlockSizeSmall:
  16725. MOVB (DI)(R9*1), R8
  16726. CMPB (BX)(R9*1), R8
  16727. JNE match_nolit_end_calcBlockSizeSmall
  16728. LEAL 1(R9), R9
  16729. match_nolit_end_calcBlockSizeSmall:
  // CX = end of match; R9 = full match length (extension + the initial 4).
  // The literal start moves to CX because the match covers everything before.
  16730. ADDL R9, CX
  16731. MOVL 16(SP), BX
  16732. ADDL $0x04, R9
  16733. MOVL CX, 12(SP)
  16734. // emitCopy
  // While more than 64 bytes remain, count 3 bytes and consume 60 of them.
  16735. two_byte_offset_match_nolit_calcBlockSizeSmall:
  16736. CMPL R9, $0x40
  16737. JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall
  16738. LEAL -60(R9), R9
  16739. ADDQ $0x03, AX
  16740. JMP two_byte_offset_match_nolit_calcBlockSizeSmall
  16741. two_byte_offset_short_match_nolit_calcBlockSizeSmall:
  // The value built in BX is not read before BX is overwritten. The final
  // element counts 3 bytes when the length is >= 12, otherwise 2.
  16742. MOVL R9, BX
  16743. SHLL $0x02, BX
  16744. CMPL R9, $0x0c
  16745. JAE emit_copy_three_match_nolit_calcBlockSizeSmall
  16746. ADDQ $0x02, AX
  16747. JMP match_nolit_emitcopy_end_calcBlockSizeSmall
  16748. emit_copy_three_match_nolit_calcBlockSizeSmall:
  16749. ADDQ $0x03, AX
  16750. match_nolit_emitcopy_end_calcBlockSizeSmall:
  // Past the search limit: only the trailing literals remain to be counted.
  16751. CMPL CX, 8(SP)
  16752. JAE emit_remainder_calcBlockSizeSmall
  // SI = 8 bytes starting at CX-2; return 0 if the total has reached the limit.
  16753. MOVQ -2(DX)(CX*1), SI
  16754. CMPQ AX, (SP)
  16755. JB match_nolit_dst_ok_calcBlockSizeSmall
  16756. MOVQ $0x00000000, ret+24(FP)
  16757. RET
  16758. match_nolit_dst_ok_calcBlockSizeSmall:
  // Hash the 4 bytes at CX-2 (DI) and at CX (BX). Store CX-2 and CX in the
  // table, keeping the previous entry for CX's hash as the next candidate.
  16759. MOVQ $0x9e3779b1, R8
  16760. MOVQ SI, DI
  16761. SHRQ $0x10, SI
  16762. MOVQ SI, BX
  16763. SHLQ $0x20, DI
  16764. IMULQ R8, DI
  16765. SHRQ $0x37, DI
  16766. SHLQ $0x20, BX
  16767. IMULQ R8, BX
  16768. SHRQ $0x37, BX
  16769. LEAL -2(CX), R8
  16770. LEAQ 24(SP)(BX*4), R9
  16771. MOVL (R9), BX
  16772. MOVL R8, 24(SP)(DI*4)
  16773. MOVL CX, (R9)
  // If that candidate matches the 4 bytes at CX, handle another match at once
  // with no literals in between; otherwise resume searching at CX+1.
  16774. CMPL (DX)(BX*1), SI
  16775. JEQ match_nolit_loop_calcBlockSizeSmall
  16776. INCL CX
  16777. JMP search_loop_calcBlockSizeSmall
  16778. emit_remainder_calcBlockSizeSmall:
  // Return 0 unless total + remaining literal length + 3 is below the limit.
  16779. MOVQ src_len+8(FP), CX
  16780. SUBL 12(SP), CX
  16781. LEAQ 3(AX)(CX*1), CX
  16782. CMPQ CX, (SP)
  16783. JB emit_remainder_ok_calcBlockSizeSmall
  16784. MOVQ $0x00000000, ret+24(FP)
  16785. RET
  16786. emit_remainder_ok_calcBlockSizeSmall:
  // Account for the trailing literal run [12(SP), src_len); skipped when
  // empty. SI = literal length, CX = length - 1 (the LEAQ result in CX is
  // replaced two instructions later without being read).
  16787. MOVQ src_len+8(FP), CX
  16788. MOVL 12(SP), BX
  16789. CMPL BX, CX
  16790. JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall
  16791. MOVL CX, SI
  16792. MOVL CX, 12(SP)
  16793. LEAQ (DX)(BX*1), CX
  16794. SUBL BX, SI
  16795. LEAL -1(SI), CX
  // Header size: 1 byte when length-1 < 60, 2 bytes when < 256, otherwise 3.
  16796. CMPL CX, $0x3c
  16797. JB one_byte_emit_remainder_calcBlockSizeSmall
  16798. CMPL CX, $0x00000100
  16799. JB two_bytes_emit_remainder_calcBlockSizeSmall
  // This JB targets the very next label, so control reaches it either way.
  16800. JB three_bytes_emit_remainder_calcBlockSizeSmall
  16801. three_bytes_emit_remainder_calcBlockSizeSmall:
  16802. ADDQ $0x03, AX
  16803. JMP memmove_long_emit_remainder_calcBlockSizeSmall
  16804. two_bytes_emit_remainder_calcBlockSizeSmall:
  16805. ADDQ $0x02, AX
  16806. CMPL CX, $0x40
  16807. JB memmove_emit_remainder_calcBlockSizeSmall
  16808. JMP memmove_long_emit_remainder_calcBlockSizeSmall
  16809. one_byte_emit_remainder_calcBlockSizeSmall:
  16810. ADDQ $0x01, AX
  // Both "memmove" labels only add the literal length to the total here.
  16811. memmove_emit_remainder_calcBlockSizeSmall:
  16812. LEAQ (AX)(SI*1), AX
  16813. JMP emit_literal_done_emit_remainder_calcBlockSizeSmall
  16814. memmove_long_emit_remainder_calcBlockSizeSmall:
  16815. LEAQ (AX)(SI*1), AX
  16816. emit_literal_done_emit_remainder_calcBlockSizeSmall:
  // Return the accumulated total.
  16817. MOVQ AX, ret+24(FP)
  16818. RET
  16819. // func emitLiteral(dst []byte, lit []byte) int
  16820. // Requires: SSE2
  //
  // Writes a literal element to dst: a 1- to 5-byte header that encodes
  // len(lit)-1, followed by a copy of lit. Returns the number of bytes written
  // (header size + len(lit)), or 0 when lit is empty, in which case nothing is
  // written.
  //
  // Only dst's base pointer is loaded; its length and capacity are never read,
  // so the caller is presumably responsible for providing enough room.
  //
  // Register roles: AX = write pointer, CX = lit base, DX = len(lit),
  // SI = len(lit)-1 while the header is chosen, BX = return value.
  16821. TEXT ·emitLiteral(SB), NOSPLIT, $0-56
  16822. MOVQ lit_len+32(FP), DX
  16823. MOVQ dst_base+0(FP), AX
  16824. MOVQ lit_base+24(FP), CX
  // Empty literal: return 0 without touching dst.
  16825. TESTQ DX, DX
  16826. JZ emit_literal_end_standalone_skip
  // BX starts as the low 32 bits of the length; the header size is added below.
  16827. MOVL DX, BX
  16828. LEAL -1(DX), SI
  // Choose the header form from length-1: < 60 fits in the tag byte itself;
  // < 2^8, < 2^16 and < 2^24 take 1, 2 and 3 extra length bytes; anything
  // larger takes 4.
  16829. CMPL SI, $0x3c
  16830. JB one_byte_standalone
  16831. CMPL SI, $0x00000100
  16832. JB two_bytes_standalone
  16833. CMPL SI, $0x00010000
  16834. JB three_bytes_standalone
  16835. CMPL SI, $0x01000000
  16836. JB four_bytes_standalone
  // 5-byte header: tag 0xfc followed by length-1 as 4 bytes.
  16837. MOVB $0xfc, (AX)
  16838. MOVL SI, 1(AX)
  16839. ADDQ $0x05, BX
  16840. ADDQ $0x05, AX
  16841. JMP memmove_long_standalone
  16842. four_bytes_standalone:
  // 4-byte header: tag 0xf8, low 16 bits of length-1, then bits 16-23.
  16843. MOVL SI, DI
  16844. SHRL $0x10, DI
  16845. MOVB $0xf8, (AX)
  16846. MOVW SI, 1(AX)
  16847. MOVB DI, 3(AX)
  16848. ADDQ $0x04, BX
  16849. ADDQ $0x04, AX
  16850. JMP memmove_long_standalone
  16851. three_bytes_standalone:
  // 3-byte header: tag 0xf4 followed by length-1 as 2 bytes.
  16852. MOVB $0xf4, (AX)
  16853. MOVW SI, 1(AX)
  16854. ADDQ $0x03, BX
  16855. ADDQ $0x03, AX
  16856. JMP memmove_long_standalone
  16857. two_bytes_standalone:
  // 2-byte header: tag 0xf0 followed by length-1 as 1 byte. Lengths with
  // length-1 < 64 still use the short copy; longer ones use the long copy.
  16858. MOVB $0xf0, (AX)
  16859. MOVB SI, 1(AX)
  16860. ADDQ $0x02, BX
  16861. ADDQ $0x02, AX
  16862. CMPL SI, $0x40
  16863. JB memmove_standalone
  16864. JMP memmove_long_standalone
  16865. one_byte_standalone:
  // 1-byte header: (length-1) << 2 stored as the tag byte.
  16866. SHLB $0x02, SI
  16867. MOVB SI, (AX)
  16868. ADDQ $0x01, BX
  16869. ADDQ $0x01, AX
  16870. memmove_standalone:
  16871. // genMemMoveShort
  // Dispatch on the length in DX. Each size class copies a chunk from the
  // start and a chunk from the end of lit; the two chunks may overlap and
  // together cover every length in the class.
  16872. CMPQ DX, $0x03
  16873. JB emit_lit_memmove_standalone_memmove_move_1or2
  16874. JE emit_lit_memmove_standalone_memmove_move_3
  16875. CMPQ DX, $0x08
  16876. JB emit_lit_memmove_standalone_memmove_move_4through7
  16877. CMPQ DX, $0x10
  16878. JBE emit_lit_memmove_standalone_memmove_move_8through16
  16879. CMPQ DX, $0x20
  16880. JBE emit_lit_memmove_standalone_memmove_move_17through32
  16881. JMP emit_lit_memmove_standalone_memmove_move_33through64
  16882. emit_lit_memmove_standalone_memmove_move_1or2:
  // First byte and last byte (the same byte when the length is 1).
  16883. MOVB (CX), SI
  16884. MOVB -1(CX)(DX*1), CL
  16885. MOVB SI, (AX)
  16886. MOVB CL, -1(AX)(DX*1)
  16887. JMP emit_literal_end_standalone
  16888. emit_lit_memmove_standalone_memmove_move_3:
  16889. MOVW (CX), SI
  16890. MOVB 2(CX), CL
  16891. MOVW SI, (AX)
  16892. MOVB CL, 2(AX)
  16893. JMP emit_literal_end_standalone
  16894. emit_lit_memmove_standalone_memmove_move_4through7:
  16895. MOVL (CX), SI
  16896. MOVL -4(CX)(DX*1), CX
  16897. MOVL SI, (AX)
  16898. MOVL CX, -4(AX)(DX*1)
  16899. JMP emit_literal_end_standalone
  16900. emit_lit_memmove_standalone_memmove_move_8through16:
  16901. MOVQ (CX), SI
  16902. MOVQ -8(CX)(DX*1), CX
  16903. MOVQ SI, (AX)
  16904. MOVQ CX, -8(AX)(DX*1)
  16905. JMP emit_literal_end_standalone
  16906. emit_lit_memmove_standalone_memmove_move_17through32:
  16907. MOVOU (CX), X0
  16908. MOVOU -16(CX)(DX*1), X1
  16909. MOVOU X0, (AX)
  16910. MOVOU X1, -16(AX)(DX*1)
  16911. JMP emit_literal_end_standalone
  16912. emit_lit_memmove_standalone_memmove_move_33through64:
  16913. MOVOU (CX), X0
  16914. MOVOU 16(CX), X1
  16915. MOVOU -32(CX)(DX*1), X2
  16916. MOVOU -16(CX)(DX*1), X3
  16917. MOVOU X0, (AX)
  16918. MOVOU X1, 16(AX)
  16919. MOVOU X2, -32(AX)(DX*1)
  16920. MOVOU X3, -16(AX)(DX*1)
  16921. JMP emit_literal_end_standalone
  // Unreachable: it directly follows an unconditional JMP.
  16922. JMP emit_literal_end_standalone
  16923. memmove_long_standalone:
  16924. // genMemMoveLong
  // Load the first 32 and last 32 bytes of lit up front (X0-X3); they are
  // stored last, with unaligned stores, to cover the head and tail around the
  // aligned middle section.
  16925. MOVOU (CX), X0
  16926. MOVOU 16(CX), X1
  16927. MOVOU -32(CX)(DX*1), X2
  16928. MOVOU -16(CX)(DX*1), X3
  // DI = length/32. R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned,
  // which is what lets the loops below use MOVOA for their stores.
  16929. MOVQ DX, DI
  16930. SHRQ $0x05, DI
  16931. MOVQ AX, SI
  16932. ANDL $0x0000001f, SI
  16933. MOVQ $0x00000040, R8
  16934. SUBQ SI, R8
  // NOTE(review): DECQ leaves CF unchanged, so this JA and the JNA below
  // test CF from an earlier instruction together with ZF from DECQ. Confirm
  // the intended branch behaviour against the generator.
  16935. DECQ DI
  16936. JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
  16937. LEAQ -32(CX)(R8*1), SI
  16938. LEAQ -32(AX)(R8*1), R9
  16939. emit_lit_memmove_long_standalonelarge_big_loop_back:
  // Copy 32 bytes per pass: unaligned loads from lit, aligned stores to dst.
  16940. MOVOU (SI), X4
  16941. MOVOU 16(SI), X5
  16942. MOVOA X4, (R9)
  16943. MOVOA X5, 16(R9)
  16944. ADDQ $0x20, R9
  16945. ADDQ $0x20, SI
  16946. ADDQ $0x20, R8
  16947. DECQ DI
  16948. JNA emit_lit_memmove_long_standalonelarge_big_loop_back
  16949. emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
  // Copy the 32-byte chunk ending at offset R8, then advance; repeat while
  // R8 <= length.
  16950. MOVOU -32(CX)(R8*1), X4
  16951. MOVOU -16(CX)(R8*1), X5
  16952. MOVOA X4, -32(AX)(R8*1)
  16953. MOVOA X5, -16(AX)(R8*1)
  16954. ADDQ $0x20, R8
  16955. CMPQ DX, R8
  16956. JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
  // Store the saved first and last 32 bytes.
  16957. MOVOU X0, (AX)
  16958. MOVOU X1, 16(AX)
  16959. MOVOU X2, -32(AX)(DX*1)
  16960. MOVOU X3, -16(AX)(DX*1)
  16961. JMP emit_literal_end_standalone
  // Unreachable: it directly follows an unconditional JMP.
  16962. JMP emit_literal_end_standalone
  16963. emit_literal_end_standalone_skip:
  // Empty input: the result is 0.
  16964. XORQ BX, BX
  16965. emit_literal_end_standalone:
  16966. MOVQ BX, ret+48(FP)
  16967. RET
  16968. // func emitRepeat(dst []byte, offset int, length int) int
  //
  // Writes a repeat code covering `length` bytes at the start of dst and
  // returns the number of bytes written (2, 3, 4, or 5 plus any further
  // chunks when the length has to be split).
  //
  // Registers: AX = write cursor into dst, BX = bytes written so far,
  // CX = offset, DX = remaining length (adjusted in place), SI = scratch.
  // dst_len is never read, so nothing here bounds-checks the writes;
  // presumably the caller guarantees enough room — TODO confirm.
  16969. TEXT ·emitRepeat(SB), NOSPLIT, $0-48
  16970. XORQ BX, BX // BX = 0 bytes written
  16971. MOVQ dst_base+0(FP), AX
  16972. MOVQ offset+24(FP), CX
  16973. MOVQ length+32(FP), DX
  16974. // emitRepeat
  16975. emit_repeat_again_standalone:
  16976. MOVL DX, SI // SI keeps the unadjusted length for the two checks below
  16977. LEAL -4(DX), DX // DX = length - 4
  16978. CMPL SI, $0x08
  16979. JBE repeat_two_standalone // length <= 8: two-byte form, no offset bits
  16980. CMPL SI, $0x0c
  16981. JAE cant_repeat_two_offset_standalone
  16982. CMPL CX, $0x00000800
  16983. JB repeat_two_offset_standalone // length < 12 and offset < 2048: two-byte form carrying the offset
  16984. cant_repeat_two_offset_standalone:
  16985. CMPL DX, $0x00000104
  16986. JB repeat_three_standalone // (length-4) < 260: one extra length byte
  16987. CMPL DX, $0x00010100
  16988. JB repeat_four_standalone // (length-4) < 65792: two extra length bytes
  16989. CMPL DX, $0x0100ffff
  16990. JB repeat_five_standalone // (length-4) < 0x0100ffff: three extra length bytes
  // Too long for a single code: emit the fixed 5 bytes 1d 00 fb ff ff,
  // take 16842747 (0x0100fffb) off the remaining length and go around again.
  16991. LEAL -16842747(DX), DX
  16992. MOVL $0xfffb001d, (AX)
  16993. MOVB $0xff, 4(AX)
  16994. ADDQ $0x05, AX
  16995. ADDQ $0x05, BX
  16996. JMP emit_repeat_again_standalone
  16997. repeat_five_standalone:
  16998. LEAL -65536(DX), DX // stored value = (length-4) - 65536
  16999. MOVL DX, CX // offset is no longer needed; CX becomes scratch
  17000. MOVW $0x001d, (AX) // bytes 0-1: 0x1d, 0x00
  17001. MOVW DX, 2(AX) // bytes 2-3: low 16 bits of the stored value
  17002. SARL $0x10, CX
  17003. MOVB CL, 4(AX) // byte 4: bits 16-23 of the stored value
  17004. ADDQ $0x05, BX
  17005. ADDQ $0x05, AX
  17006. JMP gen_emit_repeat_end
  17007. repeat_four_standalone:
  17008. LEAL -256(DX), DX // stored value = (length-4) - 256
  17009. MOVW $0x0019, (AX) // bytes 0-1: 0x19, 0x00
  17010. MOVW DX, 2(AX) // bytes 2-3: 16-bit stored value
  17011. ADDQ $0x04, BX
  17012. ADDQ $0x04, AX
  17013. JMP gen_emit_repeat_end
  17014. repeat_three_standalone:
  17015. LEAL -4(DX), DX // stored value = (length-4) - 4
  17016. MOVW $0x0015, (AX) // bytes 0-1: 0x15, 0x00
  17017. MOVB DL, 2(AX) // byte 2: 8-bit stored value
  17018. ADDQ $0x03, BX
  17019. ADDQ $0x03, AX
  17020. JMP gen_emit_repeat_end
  17021. repeat_two_standalone:
  17022. SHLL $0x02, DX
  17023. ORL $0x01, DX // DX = (length-4)<<2 | 1
  17024. MOVW DX, (AX) // written as a 16-bit word, so the second byte is the high byte of DX
  17025. ADDQ $0x02, BX
  17026. ADDQ $0x02, AX
  17027. JMP gen_emit_repeat_end
  17028. repeat_two_offset_standalone:
  17029. XORQ SI, SI
  17030. LEAL 1(SI)(DX*4), DX // DX = (length-4)*4 + 1
  17031. MOVB CL, 1(AX) // byte 1: low 8 bits of offset
  17032. SARL $0x08, CX
  17033. SHLL $0x05, CX // remaining offset bits moved to bit 5 and up
  17034. ORL CX, DX
  17035. MOVB DL, (AX) // byte 0: high offset bits | length bits | 1
  17036. ADDQ $0x02, BX
  17037. ADDQ $0x02, AX
  17038. gen_emit_repeat_end:
  17039. MOVQ BX, ret+40(FP) // return the byte count
  17040. RET
  17041. // func emitCopy(dst []byte, offset int, length int) int
  //
  // Writes a copy operation (offset, length) at the start of dst and returns
  // the number of bytes written. Lengths above 64 are emitted as one copy
  // followed by an inlined repeat code for the remainder.
  //
  // Registers: AX = write cursor into dst, BX = bytes written so far,
  // CX = offset, DX = remaining length, SI/DI = scratch.
  // dst_len is never read; presumably the caller guarantees enough room —
  // TODO confirm.
  17042. TEXT ·emitCopy(SB), NOSPLIT, $0-48
  17043. XORQ BX, BX // BX = 0 bytes written
  17044. MOVQ dst_base+0(FP), AX
  17045. MOVQ offset+24(FP), CX
  17046. MOVQ length+32(FP), DX
  17047. // emitCopy
  17048. CMPL CX, $0x00010000
  17049. JB two_byte_offset_standalone // offset fits in 16 bits
  // offset >= 65536: 5-byte form (tag byte + 32-bit offset).
  17050. CMPL DX, $0x40
  17051. JBE four_bytes_remain_standalone
  17052. MOVB $0xff, (AX) // 0xff = 64*4 - 1: tag byte for a 64-byte copy (same formula as four_bytes_remain)
  17053. MOVL CX, 1(AX) // 32-bit offset
  17054. LEAL -64(DX), DX // 64 bytes accounted for
  17055. ADDQ $0x05, BX
  17056. ADDQ $0x05, AX
  17057. CMPL DX, $0x04
  17058. JB four_bytes_remain_standalone // fewer than 4 left: emit them as another 5-byte copy
  17059. // emitRepeat
  // Inlined repeat emitter for the remaining length; every exit goes to
  // gen_emit_copy_end.
  17060. emit_repeat_again_standalone_emit_copy:
  17061. MOVL DX, SI
  17062. LEAL -4(DX), DX // DX = length - 4
  17063. CMPL SI, $0x08
  17064. JBE repeat_two_standalone_emit_copy
  17065. CMPL SI, $0x0c
  17066. JAE cant_repeat_two_offset_standalone_emit_copy
  17067. CMPL CX, $0x00000800
  17068. JB repeat_two_offset_standalone_emit_copy
  17069. cant_repeat_two_offset_standalone_emit_copy:
  17070. CMPL DX, $0x00000104
  17071. JB repeat_three_standalone_emit_copy
  17072. CMPL DX, $0x00010100
  17073. JB repeat_four_standalone_emit_copy
  17074. CMPL DX, $0x0100ffff
  17075. JB repeat_five_standalone_emit_copy
  17076. LEAL -16842747(DX), DX // too long for one code: emit 1d 00 fb ff ff and loop
  17077. MOVL $0xfffb001d, (AX)
  17078. MOVB $0xff, 4(AX)
  17079. ADDQ $0x05, AX
  17080. ADDQ $0x05, BX
  17081. JMP emit_repeat_again_standalone_emit_copy
  17082. repeat_five_standalone_emit_copy:
  17083. LEAL -65536(DX), DX
  17084. MOVL DX, CX
  17085. MOVW $0x001d, (AX)
  17086. MOVW DX, 2(AX)
  17087. SARL $0x10, CX
  17088. MOVB CL, 4(AX)
  17089. ADDQ $0x05, BX
  17090. ADDQ $0x05, AX
  17091. JMP gen_emit_copy_end
  17092. repeat_four_standalone_emit_copy:
  17093. LEAL -256(DX), DX
  17094. MOVW $0x0019, (AX)
  17095. MOVW DX, 2(AX)
  17096. ADDQ $0x04, BX
  17097. ADDQ $0x04, AX
  17098. JMP gen_emit_copy_end
  17099. repeat_three_standalone_emit_copy:
  17100. LEAL -4(DX), DX
  17101. MOVW $0x0015, (AX)
  17102. MOVB DL, 2(AX)
  17103. ADDQ $0x03, BX
  17104. ADDQ $0x03, AX
  17105. JMP gen_emit_copy_end
  17106. repeat_two_standalone_emit_copy:
  17107. SHLL $0x02, DX
  17108. ORL $0x01, DX
  17109. MOVW DX, (AX)
  17110. ADDQ $0x02, BX
  17111. ADDQ $0x02, AX
  17112. JMP gen_emit_copy_end
  17113. repeat_two_offset_standalone_emit_copy:
  17114. XORQ SI, SI
  17115. LEAL 1(SI)(DX*4), DX
  17116. MOVB CL, 1(AX)
  17117. SARL $0x08, CX
  17118. SHLL $0x05, CX
  17119. ORL CX, DX
  17120. MOVB DL, (AX)
  17121. ADDQ $0x02, BX
  17122. ADDQ $0x02, AX
  17123. JMP gen_emit_copy_end
  17124. four_bytes_remain_standalone:
  17125. TESTL DX, DX
  17126. JZ gen_emit_copy_end // nothing left to emit
  17127. XORL SI, SI
  17128. LEAL -1(SI)(DX*4), DX // tag byte = length*4 - 1
  17129. MOVB DL, (AX)
  17130. MOVL CX, 1(AX) // 32-bit offset
  17131. ADDQ $0x05, BX
  17132. ADDQ $0x05, AX
  17133. JMP gen_emit_copy_end
  17134. two_byte_offset_standalone:
  17135. CMPL DX, $0x40
  17136. JBE two_byte_offset_short_standalone // length <= 64: a single copy suffices
  17137. CMPL CX, $0x00000800
  17138. JAE long_offset_short_standalone
  // length > 64 and offset < 2048: emit a 2-byte copy of length 8
  // (17 = 8*4 - 15, the same formula as two_byte_offset_short), then a
  // repeat code for the rest.
  17139. MOVL $0x00000001, SI
  17140. LEAL 16(SI), SI // SI = 17
  17141. MOVB CL, 1(AX) // byte 1: low 8 bits of offset
  17142. MOVL CX, DI
  17143. SHRL $0x08, DI
  17144. SHLL $0x05, DI // high offset bits moved to bit 5 and up
  17145. ORL DI, SI
  17146. MOVB SI, (AX)
  17147. ADDQ $0x02, BX
  17148. ADDQ $0x02, AX
  17149. SUBL $0x08, DX // 8 bytes accounted for
  17150. // emitRepeat
  // Enters the repeat emitter past its two-byte forms: the remaining length
  // is above 56 here, so those forms cannot apply.
  17151. LEAL -4(DX), DX
  17152. JMP cant_repeat_two_offset_standalone_emit_copy_short_2b
  17153. emit_repeat_again_standalone_emit_copy_short_2b:
  17154. MOVL DX, SI
  17155. LEAL -4(DX), DX
  17156. CMPL SI, $0x08
  17157. JBE repeat_two_standalone_emit_copy_short_2b
  17158. CMPL SI, $0x0c
  17159. JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
  17160. CMPL CX, $0x00000800
  17161. JB repeat_two_offset_standalone_emit_copy_short_2b
  17162. cant_repeat_two_offset_standalone_emit_copy_short_2b:
  17163. CMPL DX, $0x00000104
  17164. JB repeat_three_standalone_emit_copy_short_2b
  17165. CMPL DX, $0x00010100
  17166. JB repeat_four_standalone_emit_copy_short_2b
  17167. CMPL DX, $0x0100ffff
  17168. JB repeat_five_standalone_emit_copy_short_2b
  17169. LEAL -16842747(DX), DX
  17170. MOVL $0xfffb001d, (AX)
  17171. MOVB $0xff, 4(AX)
  17172. ADDQ $0x05, AX
  17173. ADDQ $0x05, BX
  17174. JMP emit_repeat_again_standalone_emit_copy_short_2b
  17175. repeat_five_standalone_emit_copy_short_2b:
  17176. LEAL -65536(DX), DX
  17177. MOVL DX, CX
  17178. MOVW $0x001d, (AX)
  17179. MOVW DX, 2(AX)
  17180. SARL $0x10, CX
  17181. MOVB CL, 4(AX)
  17182. ADDQ $0x05, BX
  17183. ADDQ $0x05, AX
  17184. JMP gen_emit_copy_end
  17185. repeat_four_standalone_emit_copy_short_2b:
  17186. LEAL -256(DX), DX
  17187. MOVW $0x0019, (AX)
  17188. MOVW DX, 2(AX)
  17189. ADDQ $0x04, BX
  17190. ADDQ $0x04, AX
  17191. JMP gen_emit_copy_end
  17192. repeat_three_standalone_emit_copy_short_2b:
  17193. LEAL -4(DX), DX
  17194. MOVW $0x0015, (AX)
  17195. MOVB DL, 2(AX)
  17196. ADDQ $0x03, BX
  17197. ADDQ $0x03, AX
  17198. JMP gen_emit_copy_end
  17199. repeat_two_standalone_emit_copy_short_2b:
  17200. SHLL $0x02, DX
  17201. ORL $0x01, DX
  17202. MOVW DX, (AX)
  17203. ADDQ $0x02, BX
  17204. ADDQ $0x02, AX
  17205. JMP gen_emit_copy_end
  17206. repeat_two_offset_standalone_emit_copy_short_2b:
  17207. XORQ SI, SI
  17208. LEAL 1(SI)(DX*4), DX
  17209. MOVB CL, 1(AX)
  17210. SARL $0x08, CX
  17211. SHLL $0x05, CX
  17212. ORL CX, DX
  17213. MOVB DL, (AX)
  17214. ADDQ $0x02, BX
  17215. ADDQ $0x02, AX
  17216. JMP gen_emit_copy_end
  17217. long_offset_short_standalone:
  // length > 64 and 2048 <= offset < 65536: emit a 3-byte copy of length 60
  // (0xee = 60*4 - 2, the same formula as emit_copy_three), then a repeat
  // code for the rest.
  17218. MOVB $0xee, (AX)
  17219. MOVW CX, 1(AX) // 16-bit offset
  17220. LEAL -60(DX), DX // 60 bytes accounted for
  17221. ADDQ $0x03, AX
  17222. ADDQ $0x03, BX
  17223. // emitRepeat
  17224. emit_repeat_again_standalone_emit_copy_short:
  17225. MOVL DX, SI
  17226. LEAL -4(DX), DX
  17227. CMPL SI, $0x08
  17228. JBE repeat_two_standalone_emit_copy_short
  17229. CMPL SI, $0x0c
  17230. JAE cant_repeat_two_offset_standalone_emit_copy_short
  17231. CMPL CX, $0x00000800
  17232. JB repeat_two_offset_standalone_emit_copy_short
  17233. cant_repeat_two_offset_standalone_emit_copy_short:
  17234. CMPL DX, $0x00000104
  17235. JB repeat_three_standalone_emit_copy_short
  17236. CMPL DX, $0x00010100
  17237. JB repeat_four_standalone_emit_copy_short
  17238. CMPL DX, $0x0100ffff
  17239. JB repeat_five_standalone_emit_copy_short
  17240. LEAL -16842747(DX), DX
  17241. MOVL $0xfffb001d, (AX)
  17242. MOVB $0xff, 4(AX)
  17243. ADDQ $0x05, AX
  17244. ADDQ $0x05, BX
  17245. JMP emit_repeat_again_standalone_emit_copy_short
  17246. repeat_five_standalone_emit_copy_short:
  17247. LEAL -65536(DX), DX
  17248. MOVL DX, CX
  17249. MOVW $0x001d, (AX)
  17250. MOVW DX, 2(AX)
  17251. SARL $0x10, CX
  17252. MOVB CL, 4(AX)
  17253. ADDQ $0x05, BX
  17254. ADDQ $0x05, AX
  17255. JMP gen_emit_copy_end
  17256. repeat_four_standalone_emit_copy_short:
  17257. LEAL -256(DX), DX
  17258. MOVW $0x0019, (AX)
  17259. MOVW DX, 2(AX)
  17260. ADDQ $0x04, BX
  17261. ADDQ $0x04, AX
  17262. JMP gen_emit_copy_end
  17263. repeat_three_standalone_emit_copy_short:
  17264. LEAL -4(DX), DX
  17265. MOVW $0x0015, (AX)
  17266. MOVB DL, 2(AX)
  17267. ADDQ $0x03, BX
  17268. ADDQ $0x03, AX
  17269. JMP gen_emit_copy_end
  17270. repeat_two_standalone_emit_copy_short:
  17271. SHLL $0x02, DX
  17272. ORL $0x01, DX
  17273. MOVW DX, (AX)
  17274. ADDQ $0x02, BX
  17275. ADDQ $0x02, AX
  17276. JMP gen_emit_copy_end
  17277. repeat_two_offset_standalone_emit_copy_short:
  17278. XORQ SI, SI
  17279. LEAL 1(SI)(DX*4), DX
  17280. MOVB CL, 1(AX)
  17281. SARL $0x08, CX
  17282. SHLL $0x05, CX
  17283. ORL CX, DX
  17284. MOVB DL, (AX)
  17285. ADDQ $0x02, BX
  17286. ADDQ $0x02, AX
  17287. JMP gen_emit_copy_end
  17288. two_byte_offset_short_standalone:
  17289. MOVL DX, SI
  17290. SHLL $0x02, SI // SI = length*4
  17291. CMPL DX, $0x0c
  17292. JAE emit_copy_three_standalone // length >= 12 needs the 3-byte form
  17293. CMPL CX, $0x00000800
  17294. JAE emit_copy_three_standalone // offset >= 2048 needs the 3-byte form
  // 2-byte form: byte 0 = (offset>>8)<<5 | (length*4 - 15), byte 1 = offset low 8 bits.
  17295. LEAL -15(SI), SI
  17296. MOVB CL, 1(AX)
  17297. SHRL $0x08, CX
  17298. SHLL $0x05, CX
  17299. ORL CX, SI
  17300. MOVB SI, (AX)
  17301. ADDQ $0x02, BX
  17302. ADDQ $0x02, AX
  17303. JMP gen_emit_copy_end
  17304. emit_copy_three_standalone:
  // 3-byte form: byte 0 = length*4 - 2, bytes 1-2 = 16-bit offset.
  17305. LEAL -2(SI), SI
  17306. MOVB SI, (AX)
  17307. MOVW CX, 1(AX)
  17308. ADDQ $0x03, BX
  17309. ADDQ $0x03, AX
  17310. gen_emit_copy_end:
  17311. MOVQ BX, ret+40(FP) // return the byte count
  17312. RET
  17313. // func emitCopyNoRepeat(dst []byte, offset int, length int) int
  //
  // Writes one or more copy operations (offset, length) at the start of dst
  // and returns the number of bytes written. Unlike emitCopy, long lengths
  // are split into several plain copies; no repeat codes are produced.
  //
  // Registers: AX = write cursor into dst, BX = bytes written so far,
  // CX = offset, DX = remaining length, SI = scratch.
  // dst_len is never read; presumably the caller guarantees enough room —
  // TODO confirm.
  17314. TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
  17315. XORQ BX, BX // BX = 0 bytes written
  17316. MOVQ dst_base+0(FP), AX
  17317. MOVQ offset+24(FP), CX
  17318. MOVQ length+32(FP), DX
  17319. // emitCopy
  17320. CMPL CX, $0x00010000
  17321. JB two_byte_offset_standalone_snappy // offset fits in 16 bits
  // offset >= 65536: emit 5-byte copies of 64 bytes while more than 64 remain.
  17322. four_bytes_loop_back_standalone_snappy:
  17323. CMPL DX, $0x40
  17324. JBE four_bytes_remain_standalone_snappy
  17325. MOVB $0xff, (AX) // 0xff = 64*4 - 1: tag byte for a 64-byte copy
  17326. MOVL CX, 1(AX) // 32-bit offset
  17327. LEAL -64(DX), DX
  17328. ADDQ $0x05, BX
  17329. ADDQ $0x05, AX
  17330. CMPL DX, $0x04
  17331. JB four_bytes_remain_standalone_snappy
  17332. JMP four_bytes_loop_back_standalone_snappy
  17333. four_bytes_remain_standalone_snappy:
  17334. TESTL DX, DX
  17335. JZ gen_emit_copy_end_snappy // nothing left to emit
  17336. XORL SI, SI
  17337. LEAL -1(SI)(DX*4), DX // tag byte = length*4 - 1
  17338. MOVB DL, (AX)
  17339. MOVL CX, 1(AX) // 32-bit offset
  17340. ADDQ $0x05, BX
  17341. ADDQ $0x05, AX
  17342. JMP gen_emit_copy_end_snappy
  17343. two_byte_offset_standalone_snappy:
  // offset < 65536: emit 3-byte copies of 60 bytes while more than 64 remain.
  17344. CMPL DX, $0x40
  17345. JBE two_byte_offset_short_standalone_snappy
  17346. MOVB $0xee, (AX) // 0xee = 60*4 - 2: tag byte for a 60-byte copy
  17347. MOVW CX, 1(AX) // 16-bit offset
  17348. LEAL -60(DX), DX
  17349. ADDQ $0x03, AX
  17350. ADDQ $0x03, BX
  17351. JMP two_byte_offset_standalone_snappy
  17352. two_byte_offset_short_standalone_snappy:
  17353. MOVL DX, SI
  17354. SHLL $0x02, SI // SI = length*4
  17355. CMPL DX, $0x0c
  17356. JAE emit_copy_three_standalone_snappy // length >= 12 needs the 3-byte form
  17357. CMPL CX, $0x00000800
  17358. JAE emit_copy_three_standalone_snappy // offset >= 2048 needs the 3-byte form
  // 2-byte form: byte 0 = (offset>>8)<<5 | (length*4 - 15), byte 1 = offset low 8 bits.
  17359. LEAL -15(SI), SI
  17360. MOVB CL, 1(AX)
  17361. SHRL $0x08, CX
  17362. SHLL $0x05, CX
  17363. ORL CX, SI
  17364. MOVB SI, (AX)
  17365. ADDQ $0x02, BX
  17366. ADDQ $0x02, AX
  17367. JMP gen_emit_copy_end_snappy
  17368. emit_copy_three_standalone_snappy:
  // 3-byte form: byte 0 = length*4 - 2, bytes 1-2 = 16-bit offset.
  17369. LEAL -2(SI), SI
  17370. MOVB SI, (AX)
  17371. MOVW CX, 1(AX)
  17372. ADDQ $0x03, BX
  17373. ADDQ $0x03, AX
  17374. gen_emit_copy_end_snappy:
  17375. MOVQ BX, ret+40(FP) // return the byte count
  17376. RET
  17377. // func matchLen(a []byte, b []byte) int
  //
  // Returns the number of leading bytes that are equal in a and b, comparing
  // at most len(a) bytes. Only a_len is read; b_len is never consulted, so b
  // is presumably at least as long as a — TODO confirm against callers.
  //
  // Registers: AX = a base, CX = b base, DX = bytes left to compare,
  // SI = bytes matched so far (the result), BX/DI = scratch.
  17378. // Requires: BMI
  17379. TEXT ·matchLen(SB), NOSPLIT, $0-56
  17380. MOVQ a_base+0(FP), AX
  17381. MOVQ b_base+24(FP), CX
  17382. MOVQ a_len+8(FP), DX
  17383. // matchLen
  17384. XORL SI, SI // matched = 0
  17385. matchlen_loopback_16_standalone:
  // Main loop: compare 16 bytes per iteration as two 8-byte words.
  17386. CMPL DX, $0x10
  17387. JB matchlen_match8_standalone
  17388. MOVQ (AX)(SI*1), BX
  17389. MOVQ 8(AX)(SI*1), DI
  17390. XORQ (CX)(SI*1), BX // non-zero bits mark differing bytes in the first word
  17391. JNZ matchlen_bsf_8_standalone
  17392. XORQ 8(CX)(SI*1), DI // same for the second word
  17393. JNZ matchlen_bsf_16standalone
  17394. LEAL -16(DX), DX
  17395. LEAL 16(SI), SI
  17396. JMP matchlen_loopback_16_standalone
  17397. matchlen_bsf_16standalone:
  // Mismatch in the second word: the lowest set bit locates the first
  // differing byte.
  17398. #ifdef GOAMD64_v3
  17399. TZCNTQ DI, DI
  17400. #else
  17401. BSFQ DI, DI
  17402. #endif
  17403. SARQ $0x03, DI // bit index / 8 = byte index within the word
  17404. LEAL 8(SI)(DI*1), SI // the first word (8 bytes) matched completely
  17405. JMP gen_match_len_end
  17406. matchlen_match8_standalone:
  // Fewer than 16 bytes left: try one 8-byte word.
  17407. CMPL DX, $0x08
  17408. JB matchlen_match4_standalone
  17409. MOVQ (AX)(SI*1), BX
  17410. XORQ (CX)(SI*1), BX
  17411. JNZ matchlen_bsf_8_standalone
  17412. LEAL -8(DX), DX
  17413. LEAL 8(SI), SI
  17414. JMP matchlen_match4_standalone
  17415. matchlen_bsf_8_standalone:
  17416. #ifdef GOAMD64_v3
  17417. TZCNTQ BX, BX
  17418. #else
  17419. BSFQ BX, BX
  17420. #endif
  17421. SARQ $0x03, BX // bit index / 8 = byte index within the word
  17422. LEAL (SI)(BX*1), SI
  17423. JMP gen_match_len_end
  17424. matchlen_match4_standalone:
  // Fewer than 8 bytes left: 4-byte, then 2-byte, then 1-byte compares.
  17425. CMPL DX, $0x04
  17426. JB matchlen_match2_standalone
  17427. MOVL (AX)(SI*1), BX
  17428. CMPL (CX)(SI*1), BX
  17429. JNE matchlen_match2_standalone // mismatch: narrow it down with the smaller compares
  17430. LEAL -4(DX), DX
  17431. LEAL 4(SI), SI
  17432. matchlen_match2_standalone:
  17433. CMPL DX, $0x01
  17434. JE matchlen_match1_standalone // exactly one byte left
  17435. JB gen_match_len_end // nothing left
  17436. MOVW (AX)(SI*1), BX
  17437. CMPW (CX)(SI*1), BX
  17438. JNE matchlen_match1_standalone
  17439. LEAL 2(SI), SI
  17440. SUBL $0x02, DX
  17441. JZ gen_match_len_end // SUBL set ZF: no bytes remain
  17442. matchlen_match1_standalone:
  17443. MOVB (AX)(SI*1), BL
  17444. CMPB (CX)(SI*1), BL
  17445. JNE gen_match_len_end
  17446. LEAL 1(SI), SI
  17447. gen_match_len_end:
  17448. MOVQ SI, ret+48(FP) // return the match length
  17449. RET
  17450. // func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
  //
  // Reads src as a sequence of (token, literals, 16-bit offset, match length)
  // records — presumably an LZ4 block, going by the name — and re-encodes it
  // into dst using the literal / copy / repeat encodings emitted below.
  //
  // Results:
  //   success:          uncompressed = total literal + match bytes described,
  //                     dstUsed = bytes written to dst.
  //   malformed input:  uncompressed = -1 (dstUsed is not stored on this path).
  //   dst too small:    uncompressed = -2 (dstUsed is not stored on this path).
  //
  // Registers:
  //   AX  = write cursor into dst          CX = dst_base + dst_len - 10 (write limit)
  //   DX  = read cursor into src           BX = src_base + src_len (end of input)
  //   SI  = uncompressed byte count        DI = offset of the previous match (0 initially)
  //   R9  = literal length, later the match offset
  //   R10 = match length                   R8, R11-R15 = scratch
  17451. // Requires: SSE2
  17452. TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
  17453. XORQ SI, SI
  17454. MOVQ dst_base+0(FP), AX
  17455. MOVQ dst_len+8(FP), CX
  17456. MOVQ src_base+24(FP), DX
  17457. MOVQ src_len+32(FP), BX
  17458. LEAQ (DX)(BX*1), BX // BX = end of src
  17459. LEAQ -10(AX)(CX*1), CX // CX = dst limit, 10 bytes short of the real end
  17460. XORQ DI, DI // no previous offset yet
  17461. lz4_s2_loop:
  17462. CMPQ DX, BX
  17463. JAE lz4_s2_corrupt // ran out of input where a token was expected
  17464. CMPQ AX, CX
  17465. JAE lz4_s2_dstfull
  17466. MOVBQZX (DX), R8 // token byte
  17467. MOVQ R8, R9
  17468. MOVQ R8, R10
  17469. SHRQ $0x04, R9 // high nibble: literal length
  17470. ANDQ $0x0f, R10 // low nibble: match length field
  17471. CMPQ R8, $0xf0
  17472. JB lz4_s2_ll_end // literal nibble < 15: no extension bytes
  17473. lz4_s2_ll_loop:
  // Literal length extension: add following bytes until one is not 0xff.
  17474. INCQ DX
  17475. CMPQ DX, BX
  17476. JAE lz4_s2_corrupt
  17477. MOVBQZX (DX), R8
  17478. ADDQ R8, R9
  17479. CMPQ R8, $0xff
  17480. JEQ lz4_s2_ll_loop
  17481. lz4_s2_ll_end:
  17482. LEAQ (DX)(R9*1), R8 // DX still points at the last length byte; R8 = DX + literal length
  17483. ADDQ $0x04, R10 // match length = nibble + 4
  17484. CMPQ R8, BX
  17485. JAE lz4_s2_corrupt // literals would run past the end of src
  17486. INCQ DX // DX = first literal byte
  17487. INCQ R8 // R8 = first byte after the literals
  17488. TESTQ R9, R9
  17489. JZ lz4_s2_lits_done // no literals in this record
  17490. LEAQ (AX)(R9*1), R11
  17491. CMPQ R11, CX
  17492. JAE lz4_s2_dstfull
  17493. ADDQ R9, SI // count the literals as uncompressed bytes
  // Literal header: R11 = length - 1 selects a 1- to 5-byte header.
  17494. LEAL -1(R9), R11
  17495. CMPL R11, $0x3c
  17496. JB one_byte_lz4_s2
  17497. CMPL R11, $0x00000100
  17498. JB two_bytes_lz4_s2
  17499. CMPL R11, $0x00010000
  17500. JB three_bytes_lz4_s2
  17501. CMPL R11, $0x01000000
  17502. JB four_bytes_lz4_s2
  17503. MOVB $0xfc, (AX) // 5-byte header: 0xfc + 32-bit (length-1)
  17504. MOVL R11, 1(AX)
  17505. ADDQ $0x05, AX
  17506. JMP memmove_long_lz4_s2
  17507. four_bytes_lz4_s2:
  17508. MOVL R11, R12
  17509. SHRL $0x10, R12
  17510. MOVB $0xf8, (AX) // 4-byte header: 0xf8 + 24-bit (length-1)
  17511. MOVW R11, 1(AX)
  17512. MOVB R12, 3(AX)
  17513. ADDQ $0x04, AX
  17514. JMP memmove_long_lz4_s2
  17515. three_bytes_lz4_s2:
  17516. MOVB $0xf4, (AX) // 3-byte header: 0xf4 + 16-bit (length-1)
  17517. MOVW R11, 1(AX)
  17518. ADDQ $0x03, AX
  17519. JMP memmove_long_lz4_s2
  17520. two_bytes_lz4_s2:
  17521. MOVB $0xf0, (AX) // 2-byte header: 0xf0 + 8-bit (length-1)
  17522. MOVB R11, 1(AX)
  17523. ADDQ $0x02, AX
  17524. CMPL R11, $0x40
  17525. JB memmove_lz4_s2 // length <= 64: short copy
  17526. JMP memmove_long_lz4_s2
  17527. one_byte_lz4_s2:
  17528. SHLB $0x02, R11 // 1-byte header: (length-1)<<2
  17529. MOVB R11, (AX)
  17530. ADDQ $0x01, AX
  17531. memmove_lz4_s2:
  17532. LEAQ (AX)(R9*1), R11 // R11 = dst cursor after the literals
  17533. // genMemMoveShort
  17534. CMPQ R9, $0x08
  17535. JBE emit_lit_memmove_lz4_s2_memmove_move_8
  17536. CMPQ R9, $0x10
  17537. JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
  17538. CMPQ R9, $0x20
  17539. JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
  17540. JMP emit_lit_memmove_lz4_s2_memmove_move_33through64
  17541. emit_lit_memmove_lz4_s2_memmove_move_8:
  // Lengths 1..8 always move a full 8 bytes: this reads and writes up to
  // 7 bytes beyond the literal run. Presumably the 10-byte dst margin and
  // the caller's src layout make that safe — TODO confirm.
  17542. MOVQ (DX), R12
  17543. MOVQ R12, (AX)
  17544. JMP memmove_end_copy_lz4_s2
  17545. emit_lit_memmove_lz4_s2_memmove_move_8through16:
  17546. MOVQ (DX), R12
  17547. MOVQ -8(DX)(R9*1), DX // DX reused as a temporary; restored from R8 at lz4_s2_lits_emit_done
  17548. MOVQ R12, (AX)
  17549. MOVQ DX, -8(AX)(R9*1)
  17550. JMP memmove_end_copy_lz4_s2
  17551. emit_lit_memmove_lz4_s2_memmove_move_17through32:
  17552. MOVOU (DX), X0
  17553. MOVOU -16(DX)(R9*1), X1
  17554. MOVOU X0, (AX)
  17555. MOVOU X1, -16(AX)(R9*1)
  17556. JMP memmove_end_copy_lz4_s2
  17557. emit_lit_memmove_lz4_s2_memmove_move_33through64:
  17558. MOVOU (DX), X0
  17559. MOVOU 16(DX), X1
  17560. MOVOU -32(DX)(R9*1), X2
  17561. MOVOU -16(DX)(R9*1), X3
  17562. MOVOU X0, (AX)
  17563. MOVOU X1, 16(AX)
  17564. MOVOU X2, -32(AX)(R9*1)
  17565. MOVOU X3, -16(AX)(R9*1)
  17566. memmove_end_copy_lz4_s2:
  17567. MOVQ R11, AX // advance dst past the literals
  17568. JMP lz4_s2_lits_emit_done
  17569. memmove_long_lz4_s2:
  17570. LEAQ (AX)(R9*1), R11 // R11 = dst cursor after the literals
  17571. // genMemMoveLong
  // The first and last 32 bytes are loaded up front (X0-X3) and stored last
  // with unaligned moves; the middle is copied 32 bytes per iteration using
  // aligned stores (MOVOA).
  17572. MOVOU (DX), X0
  17573. MOVOU 16(DX), X1
  17574. MOVOU -32(DX)(R9*1), X2
  17575. MOVOU -16(DX)(R9*1), X3
  17576. MOVQ R9, R13
  17577. SHRQ $0x05, R13 // R13 = length / 32
  17578. MOVQ AX, R12
  17579. ANDL $0x0000001f, R12 // R12 = dst misalignment within 32 bytes
  17580. MOVQ $0x00000040, R14
  17581. SUBQ R12, R14 // R14 = 64 - misalignment, so AX + R14 - 32 is 32-byte aligned
  17582. DECQ R13
  17583. JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
  17584. LEAQ -32(DX)(R14*1), R12
  17585. LEAQ -32(AX)(R14*1), R15
  17586. emit_lit_memmove_long_lz4_s2large_big_loop_back:
  17587. MOVOU (R12), X4
  17588. MOVOU 16(R12), X5
  17589. MOVOA X4, (R15)
  17590. MOVOA X5, 16(R15)
  17591. ADDQ $0x20, R15
  17592. ADDQ $0x20, R12
  17593. ADDQ $0x20, R14
  17594. DECQ R13
  17595. JNA emit_lit_memmove_long_lz4_s2large_big_loop_back
  17596. emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
  17597. MOVOU -32(DX)(R14*1), X4
  17598. MOVOU -16(DX)(R14*1), X5
  17599. MOVOA X4, -32(AX)(R14*1)
  17600. MOVOA X5, -16(AX)(R14*1)
  17601. ADDQ $0x20, R14
  17602. CMPQ R9, R14
  17603. JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
  17604. MOVOU X0, (AX) // head and tail stored last
  17605. MOVOU X1, 16(AX)
  17606. MOVOU X2, -32(AX)(R9*1)
  17607. MOVOU X3, -16(AX)(R9*1)
  17608. MOVQ R11, AX // advance dst past the literals
  17609. lz4_s2_lits_emit_done:
  17610. MOVQ R8, DX // src cursor = first byte after the literals
  17611. lz4_s2_lits_done:
  17612. CMPQ DX, BX
  17613. JNE lz4_s2_match
  // Input ends right after the literals: only valid when the token's match
  // nibble was 0 (R10 == 0 + 4).
  17614. CMPQ R10, $0x04
  17615. JEQ lz4_s2_done
  17616. JMP lz4_s2_corrupt
  17617. lz4_s2_match:
  17618. LEAQ 2(DX), R8
  17619. CMPQ R8, BX
  17620. JAE lz4_s2_corrupt // the offset must be followed by at least one more byte
  17621. MOVWQZX (DX), R9 // R9 = 16-bit match offset
  17622. MOVQ R8, DX
  17623. TESTQ R9, R9
  17624. JZ lz4_s2_corrupt // offset 0 is rejected
  17625. CMPQ R9, SI
  17626. JA lz4_s2_corrupt // offset reaches back before the start of the output
  17627. CMPQ R10, $0x13
  17628. JNE lz4_s2_ml_done // match nibble < 15: no extension bytes
  17629. lz4_s2_ml_loop:
  // Match length extension: add following bytes until one is not 0xff.
  17630. MOVBQZX (DX), R8
  17631. INCQ DX
  17632. ADDQ R8, R10
  17633. CMPQ DX, BX
  17634. JAE lz4_s2_corrupt
  17635. CMPQ R8, $0xff
  17636. JEQ lz4_s2_ml_loop
  17637. lz4_s2_ml_done:
  17638. ADDQ R10, SI // count the match as uncompressed bytes
  17639. CMPQ R9, DI
  17640. JNE lz4_s2_docopy // different offset than the previous match: emit a full copy
  17641. // emitRepeat
  // Same offset as the previous match: emit a repeat code for R10 bytes.
  17642. emit_repeat_again_lz4_s2:
  17643. MOVL R10, R8
  17644. LEAL -4(R10), R10 // R10 = length - 4
  17645. CMPL R8, $0x08
  17646. JBE repeat_two_lz4_s2
  17647. CMPL R8, $0x0c
  17648. JAE cant_repeat_two_offset_lz4_s2
  17649. CMPL R9, $0x00000800
  17650. JB repeat_two_offset_lz4_s2
  17651. cant_repeat_two_offset_lz4_s2:
  17652. CMPL R10, $0x00000104
  17653. JB repeat_three_lz4_s2
  17654. CMPL R10, $0x00010100
  17655. JB repeat_four_lz4_s2
  17656. CMPL R10, $0x0100ffff
  17657. JB repeat_five_lz4_s2
  17658. LEAL -16842747(R10), R10 // too long for one code: emit 1d 00 fb ff ff and loop
  17659. MOVL $0xfffb001d, (AX)
  17660. MOVB $0xff, 4(AX)
  17661. ADDQ $0x05, AX
  17662. JMP emit_repeat_again_lz4_s2
  17663. repeat_five_lz4_s2:
  17664. LEAL -65536(R10), R10
  17665. MOVL R10, R9
  17666. MOVW $0x001d, (AX)
  17667. MOVW R10, 2(AX)
  17668. SARL $0x10, R9
  17669. MOVB R9, 4(AX)
  17670. ADDQ $0x05, AX
  17671. JMP lz4_s2_loop
  17672. repeat_four_lz4_s2:
  17673. LEAL -256(R10), R10
  17674. MOVW $0x0019, (AX)
  17675. MOVW R10, 2(AX)
  17676. ADDQ $0x04, AX
  17677. JMP lz4_s2_loop
  17678. repeat_three_lz4_s2:
  17679. LEAL -4(R10), R10
  17680. MOVW $0x0015, (AX)
  17681. MOVB R10, 2(AX)
  17682. ADDQ $0x03, AX
  17683. JMP lz4_s2_loop
  17684. repeat_two_lz4_s2:
  17685. SHLL $0x02, R10
  17686. ORL $0x01, R10
  17687. MOVW R10, (AX)
  17688. ADDQ $0x02, AX
  17689. JMP lz4_s2_loop
  17690. repeat_two_offset_lz4_s2:
  17691. XORQ R8, R8
  17692. LEAL 1(R8)(R10*4), R10
  17693. MOVB R9, 1(AX)
  17694. SARL $0x08, R9
  17695. SHLL $0x05, R9
  17696. ORL R9, R10
  17697. MOVB R10, (AX)
  17698. ADDQ $0x02, AX
  17699. JMP lz4_s2_loop
  17700. lz4_s2_docopy:
  17701. MOVQ R9, DI // remember this offset for the repeat check on the next match
  17702. // emitCopy
  // The offset was read as 16 bits, so only the 2-byte and 3-byte copy forms
  // are needed here.
  17703. CMPL R10, $0x40
  17704. JBE two_byte_offset_short_lz4_s2 // length <= 64: a single copy suffices
  17705. CMPL R9, $0x00000800
  17706. JAE long_offset_short_lz4_s2
  // length > 64 and offset < 2048: 2-byte copy of length 8 (17 = 8*4 - 15),
  // then a repeat code for the rest.
  17707. MOVL $0x00000001, R8
  17708. LEAL 16(R8), R8
  17709. MOVB R9, 1(AX)
  17710. MOVL R9, R11
  17711. SHRL $0x08, R11
  17712. SHLL $0x05, R11
  17713. ORL R11, R8
  17714. MOVB R8, (AX)
  17715. ADDQ $0x02, AX
  17716. SUBL $0x08, R10 // 8 bytes accounted for
  17717. // emitRepeat
  // Enters the repeat emitter past its two-byte forms: the remaining length
  // is above 56 here, so those forms cannot apply.
  17718. LEAL -4(R10), R10
  17719. JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
  17720. emit_repeat_again_lz4_s2_emit_copy_short_2b:
  17721. MOVL R10, R8
  17722. LEAL -4(R10), R10
  17723. CMPL R8, $0x08
  17724. JBE repeat_two_lz4_s2_emit_copy_short_2b
  17725. CMPL R8, $0x0c
  17726. JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
  17727. CMPL R9, $0x00000800
  17728. JB repeat_two_offset_lz4_s2_emit_copy_short_2b
  17729. cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
  17730. CMPL R10, $0x00000104
  17731. JB repeat_three_lz4_s2_emit_copy_short_2b
  17732. CMPL R10, $0x00010100
  17733. JB repeat_four_lz4_s2_emit_copy_short_2b
  17734. CMPL R10, $0x0100ffff
  17735. JB repeat_five_lz4_s2_emit_copy_short_2b
  17736. LEAL -16842747(R10), R10
  17737. MOVL $0xfffb001d, (AX)
  17738. MOVB $0xff, 4(AX)
  17739. ADDQ $0x05, AX
  17740. JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
  17741. repeat_five_lz4_s2_emit_copy_short_2b:
  17742. LEAL -65536(R10), R10
  17743. MOVL R10, R9
  17744. MOVW $0x001d, (AX)
  17745. MOVW R10, 2(AX)
  17746. SARL $0x10, R9
  17747. MOVB R9, 4(AX)
  17748. ADDQ $0x05, AX
  17749. JMP lz4_s2_loop
  17750. repeat_four_lz4_s2_emit_copy_short_2b:
  17751. LEAL -256(R10), R10
  17752. MOVW $0x0019, (AX)
  17753. MOVW R10, 2(AX)
  17754. ADDQ $0x04, AX
  17755. JMP lz4_s2_loop
  17756. repeat_three_lz4_s2_emit_copy_short_2b:
  17757. LEAL -4(R10), R10
  17758. MOVW $0x0015, (AX)
  17759. MOVB R10, 2(AX)
  17760. ADDQ $0x03, AX
  17761. JMP lz4_s2_loop
  17762. repeat_two_lz4_s2_emit_copy_short_2b:
  17763. SHLL $0x02, R10
  17764. ORL $0x01, R10
  17765. MOVW R10, (AX)
  17766. ADDQ $0x02, AX
  17767. JMP lz4_s2_loop
  17768. repeat_two_offset_lz4_s2_emit_copy_short_2b:
  17769. XORQ R8, R8
  17770. LEAL 1(R8)(R10*4), R10
  17771. MOVB R9, 1(AX)
  17772. SARL $0x08, R9
  17773. SHLL $0x05, R9
  17774. ORL R9, R10
  17775. MOVB R10, (AX)
  17776. ADDQ $0x02, AX
  17777. JMP lz4_s2_loop
  17778. long_offset_short_lz4_s2:
  // length > 64 and offset >= 2048: 3-byte copy of length 60 (0xee = 60*4 - 2),
  // then a repeat code for the rest.
  17779. MOVB $0xee, (AX)
  17780. MOVW R9, 1(AX)
  17781. LEAL -60(R10), R10 // 60 bytes accounted for
  17782. ADDQ $0x03, AX
  17783. // emitRepeat
  17784. emit_repeat_again_lz4_s2_emit_copy_short:
  17785. MOVL R10, R8
  17786. LEAL -4(R10), R10
  17787. CMPL R8, $0x08
  17788. JBE repeat_two_lz4_s2_emit_copy_short
  17789. CMPL R8, $0x0c
  17790. JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
  17791. CMPL R9, $0x00000800
  17792. JB repeat_two_offset_lz4_s2_emit_copy_short
  17793. cant_repeat_two_offset_lz4_s2_emit_copy_short:
  17794. CMPL R10, $0x00000104
  17795. JB repeat_three_lz4_s2_emit_copy_short
  17796. CMPL R10, $0x00010100
  17797. JB repeat_four_lz4_s2_emit_copy_short
  17798. CMPL R10, $0x0100ffff
  17799. JB repeat_five_lz4_s2_emit_copy_short
  17800. LEAL -16842747(R10), R10
  17801. MOVL $0xfffb001d, (AX)
  17802. MOVB $0xff, 4(AX)
  17803. ADDQ $0x05, AX
  17804. JMP emit_repeat_again_lz4_s2_emit_copy_short
  17805. repeat_five_lz4_s2_emit_copy_short:
  17806. LEAL -65536(R10), R10
  17807. MOVL R10, R9
  17808. MOVW $0x001d, (AX)
  17809. MOVW R10, 2(AX)
  17810. SARL $0x10, R9
  17811. MOVB R9, 4(AX)
  17812. ADDQ $0x05, AX
  17813. JMP lz4_s2_loop
  17814. repeat_four_lz4_s2_emit_copy_short:
  17815. LEAL -256(R10), R10
  17816. MOVW $0x0019, (AX)
  17817. MOVW R10, 2(AX)
  17818. ADDQ $0x04, AX
  17819. JMP lz4_s2_loop
  17820. repeat_three_lz4_s2_emit_copy_short:
  17821. LEAL -4(R10), R10
  17822. MOVW $0x0015, (AX)
  17823. MOVB R10, 2(AX)
  17824. ADDQ $0x03, AX
  17825. JMP lz4_s2_loop
  17826. repeat_two_lz4_s2_emit_copy_short:
  17827. SHLL $0x02, R10
  17828. ORL $0x01, R10
  17829. MOVW R10, (AX)
  17830. ADDQ $0x02, AX
  17831. JMP lz4_s2_loop
  17832. repeat_two_offset_lz4_s2_emit_copy_short:
  17833. XORQ R8, R8
  17834. LEAL 1(R8)(R10*4), R10
  17835. MOVB R9, 1(AX)
  17836. SARL $0x08, R9
  17837. SHLL $0x05, R9
  17838. ORL R9, R10
  17839. MOVB R10, (AX)
  17840. ADDQ $0x02, AX
  17841. JMP lz4_s2_loop
  17842. two_byte_offset_short_lz4_s2:
  17843. MOVL R10, R8
  17844. SHLL $0x02, R8 // R8 = length*4
  17845. CMPL R10, $0x0c
  17846. JAE emit_copy_three_lz4_s2 // length >= 12 needs the 3-byte form
  17847. CMPL R9, $0x00000800
  17848. JAE emit_copy_three_lz4_s2 // offset >= 2048 needs the 3-byte form
  // 2-byte form: byte 0 = (offset>>8)<<5 | (length*4 - 15), byte 1 = offset low 8 bits.
  17849. LEAL -15(R8), R8
  17850. MOVB R9, 1(AX)
  17851. SHRL $0x08, R9
  17852. SHLL $0x05, R9
  17853. ORL R9, R8
  17854. MOVB R8, (AX)
  17855. ADDQ $0x02, AX
  17856. JMP lz4_s2_loop
  17857. emit_copy_three_lz4_s2:
  // 3-byte form: byte 0 = length*4 - 2, bytes 1-2 = 16-bit offset.
  17858. LEAL -2(R8), R8
  17859. MOVB R8, (AX)
  17860. MOVW R9, 1(AX)
  17861. ADDQ $0x03, AX
  17862. JMP lz4_s2_loop
  17863. lz4_s2_done:
  17864. MOVQ dst_base+0(FP), CX
  17865. SUBQ CX, AX // AX = bytes written to dst
  17866. MOVQ SI, uncompressed+48(FP)
  17867. MOVQ AX, dstUsed+56(FP)
  17868. RET
  17869. lz4_s2_corrupt:
  17870. XORQ AX, AX
  17871. LEAQ -1(AX), SI // SI = -1
  17872. MOVQ SI, uncompressed+48(FP) // only `uncompressed` is stored on this path
  17873. RET
  17874. lz4_s2_dstfull:
  17875. XORQ AX, AX
  17876. LEAQ -2(AX), SI // SI = -2
  17877. MOVQ SI, uncompressed+48(FP) // only `uncompressed` is stored on this path
  17878. RET
  17879. // func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
  17880. // Requires: SSE2
  17881. TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
  17882. XORQ SI, SI
  17883. MOVQ dst_base+0(FP), AX
  17884. MOVQ dst_len+8(FP), CX
  17885. MOVQ src_base+24(FP), DX
  17886. MOVQ src_len+32(FP), BX
  17887. LEAQ (DX)(BX*1), BX
  17888. LEAQ -10(AX)(CX*1), CX
  17889. XORQ DI, DI
  17890. lz4s_s2_loop:
  17891. CMPQ DX, BX
  17892. JAE lz4s_s2_corrupt
  17893. CMPQ AX, CX
  17894. JAE lz4s_s2_dstfull
  17895. MOVBQZX (DX), R8
  17896. MOVQ R8, R9
  17897. MOVQ R8, R10
  17898. SHRQ $0x04, R9
  17899. ANDQ $0x0f, R10
  17900. CMPQ R8, $0xf0
  17901. JB lz4s_s2_ll_end
  17902. lz4s_s2_ll_loop:
  17903. INCQ DX
  17904. CMPQ DX, BX
  17905. JAE lz4s_s2_corrupt
  17906. MOVBQZX (DX), R8
  17907. ADDQ R8, R9
  17908. CMPQ R8, $0xff
  17909. JEQ lz4s_s2_ll_loop
  17910. lz4s_s2_ll_end:
  17911. LEAQ (DX)(R9*1), R8
  17912. ADDQ $0x03, R10
  17913. CMPQ R8, BX
  17914. JAE lz4s_s2_corrupt
  17915. INCQ DX
  17916. INCQ R8
  17917. TESTQ R9, R9
  17918. JZ lz4s_s2_lits_done
  17919. LEAQ (AX)(R9*1), R11
  17920. CMPQ R11, CX
  17921. JAE lz4s_s2_dstfull
  17922. ADDQ R9, SI
  17923. LEAL -1(R9), R11
  17924. CMPL R11, $0x3c
  17925. JB one_byte_lz4s_s2
  17926. CMPL R11, $0x00000100
  17927. JB two_bytes_lz4s_s2
  17928. CMPL R11, $0x00010000
  17929. JB three_bytes_lz4s_s2
  17930. CMPL R11, $0x01000000
  17931. JB four_bytes_lz4s_s2
  17932. MOVB $0xfc, (AX)
  17933. MOVL R11, 1(AX)
  17934. ADDQ $0x05, AX
  17935. JMP memmove_long_lz4s_s2
  17936. four_bytes_lz4s_s2:
  17937. MOVL R11, R12
  17938. SHRL $0x10, R12
  17939. MOVB $0xf8, (AX)
  17940. MOVW R11, 1(AX)
  17941. MOVB R12, 3(AX)
  17942. ADDQ $0x04, AX
  17943. JMP memmove_long_lz4s_s2
  17944. three_bytes_lz4s_s2:
  17945. MOVB $0xf4, (AX)
  17946. MOVW R11, 1(AX)
  17947. ADDQ $0x03, AX
  17948. JMP memmove_long_lz4s_s2
  17949. two_bytes_lz4s_s2:
  17950. MOVB $0xf0, (AX)
  17951. MOVB R11, 1(AX)
  17952. ADDQ $0x02, AX
  17953. CMPL R11, $0x40
  17954. JB memmove_lz4s_s2
  17955. JMP memmove_long_lz4s_s2
  17956. one_byte_lz4s_s2:
  17957. SHLB $0x02, R11
  17958. MOVB R11, (AX)
  17959. ADDQ $0x01, AX
  17960. memmove_lz4s_s2:
  17961. LEAQ (AX)(R9*1), R11
  17962. // genMemMoveShort
  17963. CMPQ R9, $0x08
  17964. JBE emit_lit_memmove_lz4s_s2_memmove_move_8
  17965. CMPQ R9, $0x10
  17966. JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
  17967. CMPQ R9, $0x20
  17968. JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
  17969. JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64
  17970. emit_lit_memmove_lz4s_s2_memmove_move_8:
  17971. MOVQ (DX), R12
  17972. MOVQ R12, (AX)
  17973. JMP memmove_end_copy_lz4s_s2
  17974. emit_lit_memmove_lz4s_s2_memmove_move_8through16:
  17975. MOVQ (DX), R12
  17976. MOVQ -8(DX)(R9*1), DX
  17977. MOVQ R12, (AX)
  17978. MOVQ DX, -8(AX)(R9*1)
  17979. JMP memmove_end_copy_lz4s_s2
  17980. emit_lit_memmove_lz4s_s2_memmove_move_17through32:
  17981. MOVOU (DX), X0
  17982. MOVOU -16(DX)(R9*1), X1
  17983. MOVOU X0, (AX)
  17984. MOVOU X1, -16(AX)(R9*1)
  17985. JMP memmove_end_copy_lz4s_s2
  17986. emit_lit_memmove_lz4s_s2_memmove_move_33through64:
  17987. MOVOU (DX), X0
  17988. MOVOU 16(DX), X1
  17989. MOVOU -32(DX)(R9*1), X2
  17990. MOVOU -16(DX)(R9*1), X3
  17991. MOVOU X0, (AX)
  17992. MOVOU X1, 16(AX)
  17993. MOVOU X2, -32(AX)(R9*1)
  17994. MOVOU X3, -16(AX)(R9*1)
  17995. memmove_end_copy_lz4s_s2:
  17996. MOVQ R11, AX
  17997. JMP lz4s_s2_lits_emit_done
  17998. memmove_long_lz4s_s2:
  17999. LEAQ (AX)(R9*1), R11
  18000. // genMemMoveLong
  18001. MOVOU (DX), X0
  18002. MOVOU 16(DX), X1
  18003. MOVOU -32(DX)(R9*1), X2
  18004. MOVOU -16(DX)(R9*1), X3
  18005. MOVQ R9, R13
  18006. SHRQ $0x05, R13
  18007. MOVQ AX, R12
  18008. ANDL $0x0000001f, R12
  18009. MOVQ $0x00000040, R14
  18010. SUBQ R12, R14
  18011. DECQ R13
  18012. JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
  18013. LEAQ -32(DX)(R14*1), R12
  18014. LEAQ -32(AX)(R14*1), R15
  18015. emit_lit_memmove_long_lz4s_s2large_big_loop_back:
  18016. MOVOU (R12), X4
  18017. MOVOU 16(R12), X5
  18018. MOVOA X4, (R15)
  18019. MOVOA X5, 16(R15)
  18020. ADDQ $0x20, R15
  18021. ADDQ $0x20, R12
  18022. ADDQ $0x20, R14
  18023. DECQ R13
  18024. JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back
  18025. emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
  18026. MOVOU -32(DX)(R14*1), X4
  18027. MOVOU -16(DX)(R14*1), X5
  18028. MOVOA X4, -32(AX)(R14*1)
  18029. MOVOA X5, -16(AX)(R14*1)
  18030. ADDQ $0x20, R14
  18031. CMPQ R9, R14
  18032. JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
  18033. MOVOU X0, (AX)
  18034. MOVOU X1, 16(AX)
  18035. MOVOU X2, -32(AX)(R9*1)
  18036. MOVOU X3, -16(AX)(R9*1)
  18037. MOVQ R11, AX
  18038. lz4s_s2_lits_emit_done:
  18039. MOVQ R8, DX
  18040. lz4s_s2_lits_done:
  18041. CMPQ DX, BX
  18042. JNE lz4s_s2_match
  18043. CMPQ R10, $0x03
  18044. JEQ lz4s_s2_done
  18045. JMP lz4s_s2_corrupt
  18046. lz4s_s2_match:
  18047. CMPQ R10, $0x03
  18048. JEQ lz4s_s2_loop
  18049. LEAQ 2(DX), R8
  18050. CMPQ R8, BX
  18051. JAE lz4s_s2_corrupt
  18052. MOVWQZX (DX), R9
  18053. MOVQ R8, DX
  18054. TESTQ R9, R9
  18055. JZ lz4s_s2_corrupt
  18056. CMPQ R9, SI
  18057. JA lz4s_s2_corrupt
  18058. CMPQ R10, $0x12
  18059. JNE lz4s_s2_ml_done
  18060. lz4s_s2_ml_loop:
  18061. MOVBQZX (DX), R8
  18062. INCQ DX
  18063. ADDQ R8, R10
  18064. CMPQ DX, BX
  18065. JAE lz4s_s2_corrupt
  18066. CMPQ R8, $0xff
  18067. JEQ lz4s_s2_ml_loop
  18068. lz4s_s2_ml_done:
  18069. ADDQ R10, SI
  18070. CMPQ R9, DI
  18071. JNE lz4s_s2_docopy
  18072. // emitRepeat
  18073. emit_repeat_again_lz4_s2:
  18074. MOVL R10, R8
  18075. LEAL -4(R10), R10
  18076. CMPL R8, $0x08
  18077. JBE repeat_two_lz4_s2
  18078. CMPL R8, $0x0c
  18079. JAE cant_repeat_two_offset_lz4_s2
  18080. CMPL R9, $0x00000800
  18081. JB repeat_two_offset_lz4_s2
  18082. cant_repeat_two_offset_lz4_s2:
  18083. CMPL R10, $0x00000104
  18084. JB repeat_three_lz4_s2
  18085. CMPL R10, $0x00010100
  18086. JB repeat_four_lz4_s2
  18087. CMPL R10, $0x0100ffff
  18088. JB repeat_five_lz4_s2
  18089. LEAL -16842747(R10), R10
  18090. MOVL $0xfffb001d, (AX)
  18091. MOVB $0xff, 4(AX)
  18092. ADDQ $0x05, AX
  18093. JMP emit_repeat_again_lz4_s2
  18094. repeat_five_lz4_s2:
  18095. LEAL -65536(R10), R10
  18096. MOVL R10, R9
  18097. MOVW $0x001d, (AX)
  18098. MOVW R10, 2(AX)
  18099. SARL $0x10, R9
  18100. MOVB R9, 4(AX)
  18101. ADDQ $0x05, AX
  18102. JMP lz4s_s2_loop
  18103. repeat_four_lz4_s2:
  18104. LEAL -256(R10), R10
  18105. MOVW $0x0019, (AX)
  18106. MOVW R10, 2(AX)
  18107. ADDQ $0x04, AX
  18108. JMP lz4s_s2_loop
  18109. repeat_three_lz4_s2:
  18110. LEAL -4(R10), R10
  18111. MOVW $0x0015, (AX)
  18112. MOVB R10, 2(AX)
  18113. ADDQ $0x03, AX
  18114. JMP lz4s_s2_loop
  18115. repeat_two_lz4_s2:
  18116. SHLL $0x02, R10
  18117. ORL $0x01, R10
  18118. MOVW R10, (AX)
  18119. ADDQ $0x02, AX
  18120. JMP lz4s_s2_loop
  18121. repeat_two_offset_lz4_s2:
  18122. XORQ R8, R8
  18123. LEAL 1(R8)(R10*4), R10
  18124. MOVB R9, 1(AX)
  18125. SARL $0x08, R9
  18126. SHLL $0x05, R9
  18127. ORL R9, R10
  18128. MOVB R10, (AX)
  18129. ADDQ $0x02, AX
  18130. JMP lz4s_s2_loop
  18131. lz4s_s2_docopy:
  18132. MOVQ R9, DI
  18133. // emitCopy
  18134. CMPL R10, $0x40
  18135. JBE two_byte_offset_short_lz4_s2
  18136. CMPL R9, $0x00000800
  18137. JAE long_offset_short_lz4_s2
  18138. MOVL $0x00000001, R8
  18139. LEAL 16(R8), R8
  18140. MOVB R9, 1(AX)
  18141. MOVL R9, R11
  18142. SHRL $0x08, R11
  18143. SHLL $0x05, R11
  18144. ORL R11, R8
  18145. MOVB R8, (AX)
  18146. ADDQ $0x02, AX
  18147. SUBL $0x08, R10
  18148. // emitRepeat
  18149. LEAL -4(R10), R10
  18150. JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
  18151. emit_repeat_again_lz4_s2_emit_copy_short_2b:
  18152. MOVL R10, R8
  18153. LEAL -4(R10), R10
  18154. CMPL R8, $0x08
  18155. JBE repeat_two_lz4_s2_emit_copy_short_2b
  18156. CMPL R8, $0x0c
  18157. JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
  18158. CMPL R9, $0x00000800
  18159. JB repeat_two_offset_lz4_s2_emit_copy_short_2b
  18160. cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
  18161. CMPL R10, $0x00000104
  18162. JB repeat_three_lz4_s2_emit_copy_short_2b
  18163. CMPL R10, $0x00010100
  18164. JB repeat_four_lz4_s2_emit_copy_short_2b
  18165. CMPL R10, $0x0100ffff
  18166. JB repeat_five_lz4_s2_emit_copy_short_2b
  18167. LEAL -16842747(R10), R10
  18168. MOVL $0xfffb001d, (AX)
  18169. MOVB $0xff, 4(AX)
  18170. ADDQ $0x05, AX
  18171. JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
  18172. repeat_five_lz4_s2_emit_copy_short_2b:
  18173. LEAL -65536(R10), R10
  18174. MOVL R10, R9
  18175. MOVW $0x001d, (AX)
  18176. MOVW R10, 2(AX)
  18177. SARL $0x10, R9
  18178. MOVB R9, 4(AX)
  18179. ADDQ $0x05, AX
  18180. JMP lz4s_s2_loop
  18181. repeat_four_lz4_s2_emit_copy_short_2b:
  18182. LEAL -256(R10), R10
  18183. MOVW $0x0019, (AX)
  18184. MOVW R10, 2(AX)
  18185. ADDQ $0x04, AX
  18186. JMP lz4s_s2_loop
  18187. repeat_three_lz4_s2_emit_copy_short_2b:
  18188. LEAL -4(R10), R10
  18189. MOVW $0x0015, (AX)
  18190. MOVB R10, 2(AX)
  18191. ADDQ $0x03, AX
  18192. JMP lz4s_s2_loop
  18193. repeat_two_lz4_s2_emit_copy_short_2b:
  18194. SHLL $0x02, R10
  18195. ORL $0x01, R10
  18196. MOVW R10, (AX)
  18197. ADDQ $0x02, AX
  18198. JMP lz4s_s2_loop
  18199. repeat_two_offset_lz4_s2_emit_copy_short_2b:
  18200. XORQ R8, R8
  18201. LEAL 1(R8)(R10*4), R10
  18202. MOVB R9, 1(AX)
  18203. SARL $0x08, R9
  18204. SHLL $0x05, R9
  18205. ORL R9, R10
  18206. MOVB R10, (AX)
  18207. ADDQ $0x02, AX
  18208. JMP lz4s_s2_loop
  18209. long_offset_short_lz4_s2:
  18210. MOVB $0xee, (AX)
  18211. MOVW R9, 1(AX)
  18212. LEAL -60(R10), R10
  18213. ADDQ $0x03, AX
  18214. // emitRepeat
  18215. emit_repeat_again_lz4_s2_emit_copy_short:
  18216. MOVL R10, R8
  18217. LEAL -4(R10), R10
  18218. CMPL R8, $0x08
  18219. JBE repeat_two_lz4_s2_emit_copy_short
  18220. CMPL R8, $0x0c
  18221. JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
  18222. CMPL R9, $0x00000800
  18223. JB repeat_two_offset_lz4_s2_emit_copy_short
  18224. cant_repeat_two_offset_lz4_s2_emit_copy_short:
  18225. CMPL R10, $0x00000104
  18226. JB repeat_three_lz4_s2_emit_copy_short
  18227. CMPL R10, $0x00010100
  18228. JB repeat_four_lz4_s2_emit_copy_short
  18229. CMPL R10, $0x0100ffff
  18230. JB repeat_five_lz4_s2_emit_copy_short
  18231. LEAL -16842747(R10), R10
  18232. MOVL $0xfffb001d, (AX)
  18233. MOVB $0xff, 4(AX)
  18234. ADDQ $0x05, AX
  18235. JMP emit_repeat_again_lz4_s2_emit_copy_short
  18236. repeat_five_lz4_s2_emit_copy_short:
  18237. LEAL -65536(R10), R10
  18238. MOVL R10, R9
  18239. MOVW $0x001d, (AX)
  18240. MOVW R10, 2(AX)
  18241. SARL $0x10, R9
  18242. MOVB R9, 4(AX)
  18243. ADDQ $0x05, AX
  18244. JMP lz4s_s2_loop
  18245. repeat_four_lz4_s2_emit_copy_short:
  18246. LEAL -256(R10), R10
  18247. MOVW $0x0019, (AX)
  18248. MOVW R10, 2(AX)
  18249. ADDQ $0x04, AX
  18250. JMP lz4s_s2_loop
  18251. repeat_three_lz4_s2_emit_copy_short:
  18252. LEAL -4(R10), R10
  18253. MOVW $0x0015, (AX)
  18254. MOVB R10, 2(AX)
  18255. ADDQ $0x03, AX
  18256. JMP lz4s_s2_loop
  18257. repeat_two_lz4_s2_emit_copy_short:
  18258. SHLL $0x02, R10
  18259. ORL $0x01, R10
  18260. MOVW R10, (AX)
  18261. ADDQ $0x02, AX
  18262. JMP lz4s_s2_loop
  18263. repeat_two_offset_lz4_s2_emit_copy_short:
  18264. XORQ R8, R8
  18265. LEAL 1(R8)(R10*4), R10
  18266. MOVB R9, 1(AX)
  18267. SARL $0x08, R9
  18268. SHLL $0x05, R9
  18269. ORL R9, R10
  18270. MOVB R10, (AX)
  18271. ADDQ $0x02, AX
  18272. JMP lz4s_s2_loop
  18273. two_byte_offset_short_lz4_s2:
  18274. MOVL R10, R8
  18275. SHLL $0x02, R8
  18276. CMPL R10, $0x0c
  18277. JAE emit_copy_three_lz4_s2
  18278. CMPL R9, $0x00000800
  18279. JAE emit_copy_three_lz4_s2
  18280. LEAL -15(R8), R8
  18281. MOVB R9, 1(AX)
  18282. SHRL $0x08, R9
  18283. SHLL $0x05, R9
  18284. ORL R9, R8
  18285. MOVB R8, (AX)
  18286. ADDQ $0x02, AX
  18287. JMP lz4s_s2_loop
  18288. emit_copy_three_lz4_s2:
  18289. LEAL -2(R8), R8
  18290. MOVB R8, (AX)
  18291. MOVW R9, 1(AX)
  18292. ADDQ $0x03, AX
  18293. JMP lz4s_s2_loop
  18294. lz4s_s2_done:
  18295. MOVQ dst_base+0(FP), CX
  18296. SUBQ CX, AX
  18297. MOVQ SI, uncompressed+48(FP)
  18298. MOVQ AX, dstUsed+56(FP)
  18299. RET
  18300. lz4s_s2_corrupt:
  18301. XORQ AX, AX
  18302. LEAQ -1(AX), SI
  18303. MOVQ SI, uncompressed+48(FP)
  18304. RET
  18305. lz4s_s2_dstfull:
  18306. XORQ AX, AX
  18307. LEAQ -2(AX), SI
  18308. MOVQ SI, uncompressed+48(FP)
  18309. RET
  18310. // func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
  18311. // Requires: SSE2
  //
  // Walks src one sequence at a time (token byte, optional length extension
  // bytes, literal run, 16-bit match offset, optional match-length extension
  // bytes) and writes, for each sequence, a literal header plus the literal
  // bytes followed by one or more copy tags into dst.
  // NOTE(review): input/output formats (LZ4 block in, Snappy out) are taken
  // from the function name; confirm against the Go caller.
  //
  // Results:
  //   success    uncompressed = sum of literal and match lengths seen,
  //              dstUsed      = number of bytes written to dst
  //   bad input  uncompressed = -1 (dstUsed is not written on this path)
  //   dst full   uncompressed = -2 (dstUsed is not written on this path)
  //
  // Register roles:
  //   AX  dst write pointer           CX  dst limit = dst_base + dst_len - 10
  //   DX  src read pointer            BX  src end   = src_base + src_len
  //   SI  running uncompressed size   DI  token byte / scratch
  //   R8  literal length, later reused for the match offset
  //   R9  match length (token low nibble + 4, plus extension bytes)
  //   R10-R14, X0-X5  scratch for header encoding and the literal copy
  18312. TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
  18313. XORQ SI, SI // uncompressed size starts at 0
  18314. MOVQ dst_base+0(FP), AX
  18315. MOVQ dst_len+8(FP), CX
  18316. MOVQ src_base+24(FP), DX
  18317. MOVQ src_len+32(FP), BX
  18318. LEAQ (DX)(BX*1), BX // BX = one past the last src byte
  18319. LEAQ -10(AX)(CX*1), CX // CX = dst end minus 10 bytes of slack
  18320. lz4_snappy_loop: // top of the per-sequence loop
  18321. CMPQ DX, BX
  18322. JAE lz4_snappy_corrupt // src exhausted at a sequence start is an error
  18323. CMPQ AX, CX
  18324. JAE lz4_snappy_dstfull // output pointer reached the slack limit
  18325. MOVBQZX (DX), DI // DI = token byte
  18326. MOVQ DI, R8
  18327. MOVQ DI, R9
  18328. SHRQ $0x04, R8 // R8 = high nibble: literal length
  18329. ANDQ $0x0f, R9 // R9 = low nibble: match length before bias
  18330. CMPQ DI, $0xf0
  18331. JB lz4_snappy_ll_end // high nibble below 15: no length extension bytes
  18332. lz4_snappy_ll_loop: // add extension bytes to R8 while each byte is 0xff
  18333. INCQ DX
  18334. CMPQ DX, BX
  18335. JAE lz4_snappy_corrupt
  18336. MOVBQZX (DX), DI
  18337. ADDQ DI, R8
  18338. CMPQ DI, $0xff
  18339. JEQ lz4_snappy_ll_loop
  18340. lz4_snappy_ll_end:
  18341. LEAQ (DX)(R8*1), DI // DI = address of the last literal byte (DX is still on the token/last length byte)
  18342. ADDQ $0x04, R9 // bias match length by 4
  18343. CMPQ DI, BX
  18344. JAE lz4_snappy_corrupt // literal run would extend past the end of src
  18345. INCQ DX // DX = first literal byte
  18346. INCQ DI // DI = first byte after the literal run
  18347. TESTQ R8, R8
  18348. JZ lz4_snappy_lits_done // no literals in this sequence
  18349. LEAQ (AX)(R8*1), R10
  18350. CMPQ R10, CX
  18351. JAE lz4_snappy_dstfull // literals alone would reach the dst limit
  18352. ADDQ R8, SI // account for the literal bytes
  18353. LEAL -1(R8), R10 // R10 = length - 1, the value stored in the header
  // Pick the literal header size from R10: below 60 it is packed into a single
  // tag byte, otherwise a tag byte (0xf0/0xf4/0xf8/0xfc) is followed by
  // 1/2/3/4 bytes of R10.
  18354. CMPL R10, $0x3c
  18355. JB one_byte_lz4_snappy
  18356. CMPL R10, $0x00000100
  18357. JB two_bytes_lz4_snappy
  18358. CMPL R10, $0x00010000
  18359. JB three_bytes_lz4_snappy
  18360. CMPL R10, $0x01000000
  18361. JB four_bytes_lz4_snappy
  18362. MOVB $0xfc, (AX) // tag + 32-bit length-1
  18363. MOVL R10, 1(AX)
  18364. ADDQ $0x05, AX
  18365. JMP memmove_long_lz4_snappy
  18366. four_bytes_lz4_snappy: // tag + 24-bit length-1 (16-bit store, then bits 16..23)
  18367. MOVL R10, R11
  18368. SHRL $0x10, R11
  18369. MOVB $0xf8, (AX)
  18370. MOVW R10, 1(AX)
  18371. MOVB R11, 3(AX)
  18372. ADDQ $0x04, AX
  18373. JMP memmove_long_lz4_snappy
  18374. three_bytes_lz4_snappy: // tag + 16-bit length-1
  18375. MOVB $0xf4, (AX)
  18376. MOVW R10, 1(AX)
  18377. ADDQ $0x03, AX
  18378. JMP memmove_long_lz4_snappy
  18379. two_bytes_lz4_snappy: // tag + 8-bit length-1
  18380. MOVB $0xf0, (AX)
  18381. MOVB R10, 1(AX)
  18382. ADDQ $0x02, AX
  18383. CMPL R10, $0x40
  18384. JB memmove_lz4_snappy // at most 64 literal bytes: short copy
  18385. JMP memmove_long_lz4_snappy
  18386. one_byte_lz4_snappy: // header byte = (length-1) << 2
  18387. SHLB $0x02, R10
  18388. MOVB R10, (AX)
  18389. ADDQ $0x01, AX
  18390. memmove_lz4_snappy: // short literal copy; reached only with R8 <= 64
  18391. LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
  18392. // genMemMoveShort
  18393. CMPQ R8, $0x08
  18394. JBE emit_lit_memmove_lz4_snappy_memmove_move_8
  18395. CMPQ R8, $0x10
  18396. JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
  18397. CMPQ R8, $0x20
  18398. JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
  18399. JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
  18400. emit_lit_memmove_lz4_snappy_memmove_move_8: // R8 <= 8: always moves 8 bytes; AX only advances by R8
  18401. MOVQ (DX), R11
  18402. MOVQ R11, (AX)
  18403. JMP memmove_end_copy_lz4_snappy
  18404. emit_lit_memmove_lz4_snappy_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
  18405. MOVQ (DX), R11
  18406. MOVQ -8(DX)(R8*1), DX // DX reused as scratch; reloaded from DI at lz4_snappy_lits_emit_done
  18407. MOVQ R11, (AX)
  18408. MOVQ DX, -8(AX)(R8*1)
  18409. JMP memmove_end_copy_lz4_snappy
  18410. emit_lit_memmove_lz4_snappy_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
  18411. MOVOU (DX), X0
  18412. MOVOU -16(DX)(R8*1), X1
  18413. MOVOU X0, (AX)
  18414. MOVOU X1, -16(AX)(R8*1)
  18415. JMP memmove_end_copy_lz4_snappy
  18416. emit_lit_memmove_lz4_snappy_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
  18417. MOVOU (DX), X0
  18418. MOVOU 16(DX), X1
  18419. MOVOU -32(DX)(R8*1), X2
  18420. MOVOU -16(DX)(R8*1), X3
  18421. MOVOU X0, (AX)
  18422. MOVOU X1, 16(AX)
  18423. MOVOU X2, -32(AX)(R8*1)
  18424. MOVOU X3, -16(AX)(R8*1)
  18425. memmove_end_copy_lz4_snappy:
  18426. MOVQ R10, AX // advance dst by exactly R8 bytes
  18427. JMP lz4_snappy_lits_emit_done
  18428. memmove_long_lz4_snappy: // long literal copy
  18429. LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
  18430. // genMemMoveLong
  // The first and last 32 source bytes are loaded up front and stored last with
  // unaligned moves; the middle is copied in 32-byte steps using aligned stores.
  18431. MOVOU (DX), X0
  18432. MOVOU 16(DX), X1
  18433. MOVOU -32(DX)(R8*1), X2
  18434. MOVOU -16(DX)(R8*1), X3
  18435. MOVQ R8, R12
  18436. SHRQ $0x05, R12 // R12 = R8 / 32
  18437. MOVQ AX, R11
  18438. ANDL $0x0000001f, R11 // R11 = AX mod 32
  18439. MOVQ $0x00000040, R13
  18440. SUBQ R11, R13 // R13 = 64 - (AX mod 32), so AX + R13 - 32 is 32-byte aligned
  18441. DECQ R12
  // NOTE(review): JA tests ZF from the DECQ together with CF, which DECQ leaves
  // unchanged (CF here is whatever the SUBQ above produced).
  18442. JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
  18443. LEAQ -32(DX)(R13*1), R11
  18444. LEAQ -32(AX)(R13*1), R14
  18445. emit_lit_memmove_long_lz4_snappylarge_big_loop_back: // pointer-based 32-byte copy (R11 src, R14 dst)
  18446. MOVOU (R11), X4
  18447. MOVOU 16(R11), X5
  18448. MOVOA X4, (R14)
  18449. MOVOA X5, 16(R14)
  18450. ADDQ $0x20, R14
  18451. ADDQ $0x20, R11
  18452. ADDQ $0x20, R13
  18453. DECQ R12
  18454. JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
  18455. emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: // offset-based 32-byte copy at offset R13-32
  18456. MOVOU -32(DX)(R13*1), X4
  18457. MOVOU -16(DX)(R13*1), X5
  18458. MOVOA X4, -32(AX)(R13*1)
  18459. MOVOA X5, -16(AX)(R13*1)
  18460. ADDQ $0x20, R13
  18461. CMPQ R8, R13
  18462. JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 // continue while R8 >= R13
  18463. MOVOU X0, (AX) // write the saved head ...
  18464. MOVOU X1, 16(AX)
  18465. MOVOU X2, -32(AX)(R8*1) // ... and the saved tail
  18466. MOVOU X3, -16(AX)(R8*1)
  18467. MOVQ R10, AX // advance dst by exactly R8 bytes
  18468. lz4_snappy_lits_emit_done:
  18469. MOVQ DI, DX // src pointer now just past the literal run
  18470. lz4_snappy_lits_done:
  18471. CMPQ DX, BX
  18472. JNE lz4_snappy_match
  18473. CMPQ R9, $0x04 // at end of src the token's match nibble must have been 0
  18474. JEQ lz4_snappy_done
  18475. JMP lz4_snappy_corrupt
  18476. lz4_snappy_match:
  18477. LEAQ 2(DX), DI
  18478. CMPQ DI, BX
  18479. JAE lz4_snappy_corrupt // no room for the offset plus at least one following byte
  18480. MOVWQZX (DX), R8 // R8 = 16-bit match offset, zero-extended
  18481. MOVQ DI, DX
  18482. TESTQ R8, R8
  18483. JZ lz4_snappy_corrupt // offset 0 is rejected
  18484. CMPQ R8, SI
  18485. JA lz4_snappy_corrupt // offset reaches back further than the bytes produced so far
  18486. CMPQ R9, $0x13
  18487. JNE lz4_snappy_ml_done // nibble below 15 (15 + 4 = 0x13): no extension bytes
  18488. lz4_snappy_ml_loop: // add extension bytes to R9 while each byte is 0xff
  18489. MOVBQZX (DX), DI
  18490. INCQ DX
  18491. ADDQ DI, R9
  18492. CMPQ DX, BX
  18493. JAE lz4_snappy_corrupt
  18494. CMPQ DI, $0xff
  18495. JEQ lz4_snappy_ml_loop
  18496. lz4_snappy_ml_done:
  18497. ADDQ R9, SI // account for the match bytes
  18498. // emitCopy
  18499. two_byte_offset_lz4_s2: // while more than 64 bytes remain, emit 60-byte copies
  18500. CMPL R9, $0x40
  18501. JBE two_byte_offset_short_lz4_s2
  18502. MOVB $0xee, (AX) // tag 0xee followed by the 16-bit offset
  18503. MOVW R8, 1(AX)
  18504. LEAL -60(R9), R9
  18505. ADDQ $0x03, AX
  18506. CMPQ AX, CX
  // NOTE(review): when AX has reached CX this jumps to the loop head with the
  // rest of the match (R9) unemitted; the checks there then exit through
  // lz4_snappy_corrupt (if DX >= BX) or lz4_snappy_dstfull.
  18507. JAE lz4_snappy_loop
  18508. JMP two_byte_offset_lz4_s2
  18509. two_byte_offset_short_lz4_s2: // final copy, R9 <= 64
  18510. MOVL R9, DI
  18511. SHLL $0x02, DI // DI = length * 4
  18512. CMPL R9, $0x0c
  18513. JAE emit_copy_three_lz4_s2 // length >= 12 needs the 3-byte form
  18514. CMPL R8, $0x00000800
  18515. JAE emit_copy_three_lz4_s2 // offset >= 2048 needs the 3-byte form
  18516. LEAL -15(DI), DI // DI = (length - 4) * 4 + 1
  18517. MOVB R8, 1(AX) // low 8 bits of the offset
  18518. SHRL $0x08, R8
  18519. SHLL $0x05, R8 // offset bits 8 and up moved to bit 5 and up of the tag
  18520. ORL R8, DI
  18521. MOVB DI, (AX)
  18522. ADDQ $0x02, AX
  18523. JMP lz4_snappy_loop
  18524. emit_copy_three_lz4_s2: // 3-byte copy: tag + 16-bit offset
  18525. LEAL -2(DI), DI // DI = length * 4 - 2
  18526. MOVB DI, (AX)
  18527. MOVW R8, 1(AX)
  18528. ADDQ $0x03, AX
  18529. JMP lz4_snappy_loop
  18530. lz4_snappy_done: // success: return SI and the number of dst bytes written
  18531. MOVQ dst_base+0(FP), CX
  18532. SUBQ CX, AX
  18533. MOVQ SI, uncompressed+48(FP)
  18534. MOVQ AX, dstUsed+56(FP)
  18535. RET
  18536. lz4_snappy_corrupt: // uncompressed = -1; dstUsed is not stored
  18537. XORQ AX, AX
  18538. LEAQ -1(AX), SI
  18539. MOVQ SI, uncompressed+48(FP)
  18540. RET
  18541. lz4_snappy_dstfull: // uncompressed = -2; dstUsed is not stored
  18542. XORQ AX, AX
  18543. LEAQ -2(AX), SI
  18544. MOVQ SI, uncompressed+48(FP)
  18545. RET
  18546. // func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
  18547. // Requires: SSE2
  //
  // Walks src one sequence at a time (token byte, optional length extension
  // bytes, literal run, then optionally a 16-bit match offset and match-length
  // extension bytes) and writes, for each sequence, a literal header plus the
  // literal bytes followed by copy tags into dst.
  // NOTE(review): input/output formats (LZ4s block in, Snappy out) are taken
  // from the function name; confirm against the Go caller.
  //
  // In this routine the match length is the token's low nibble + 3, and a
  // sequence whose biased length is exactly 3 (nibble 0) carries no
  // offset/match: the loop continues with the next token.
  //
  // Results:
  //   success    uncompressed = sum of literal and match lengths seen,
  //              dstUsed      = number of bytes written to dst
  //   bad input  uncompressed = -1 (dstUsed is not written on this path)
  //   dst full   uncompressed = -2 (dstUsed is not written on this path)
  //
  // Register roles:
  //   AX  dst write pointer           CX  dst limit = dst_base + dst_len - 10
  //   DX  src read pointer            BX  src end   = src_base + src_len
  //   SI  running uncompressed size   DI  token byte / scratch
  //   R8  literal length, later reused for the match offset
  //   R9  match length (token low nibble + 3, plus extension bytes)
  //   R10-R14, X0-X5  scratch for header encoding and the literal copy
  18548. TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
  18549. XORQ SI, SI // uncompressed size starts at 0
  18550. MOVQ dst_base+0(FP), AX
  18551. MOVQ dst_len+8(FP), CX
  18552. MOVQ src_base+24(FP), DX
  18553. MOVQ src_len+32(FP), BX
  18554. LEAQ (DX)(BX*1), BX // BX = one past the last src byte
  18555. LEAQ -10(AX)(CX*1), CX // CX = dst end minus 10 bytes of slack
  18556. lz4s_snappy_loop: // top of the per-sequence loop
  18557. CMPQ DX, BX
  18558. JAE lz4s_snappy_corrupt // src exhausted at a sequence start is an error
  18559. CMPQ AX, CX
  18560. JAE lz4s_snappy_dstfull // output pointer reached the slack limit
  18561. MOVBQZX (DX), DI // DI = token byte
  18562. MOVQ DI, R8
  18563. MOVQ DI, R9
  18564. SHRQ $0x04, R8 // R8 = high nibble: literal length
  18565. ANDQ $0x0f, R9 // R9 = low nibble: match length before bias
  18566. CMPQ DI, $0xf0
  18567. JB lz4s_snappy_ll_end // high nibble below 15: no length extension bytes
  18568. lz4s_snappy_ll_loop: // add extension bytes to R8 while each byte is 0xff
  18569. INCQ DX
  18570. CMPQ DX, BX
  18571. JAE lz4s_snappy_corrupt
  18572. MOVBQZX (DX), DI
  18573. ADDQ DI, R8
  18574. CMPQ DI, $0xff
  18575. JEQ lz4s_snappy_ll_loop
  18576. lz4s_snappy_ll_end:
  18577. LEAQ (DX)(R8*1), DI // DI = address of the last literal byte (DX is still on the token/last length byte)
  18578. ADDQ $0x03, R9 // bias match length by 3
  18579. CMPQ DI, BX
  18580. JAE lz4s_snappy_corrupt // literal run would extend past the end of src
  18581. INCQ DX // DX = first literal byte
  18582. INCQ DI // DI = first byte after the literal run
  18583. TESTQ R8, R8
  18584. JZ lz4s_snappy_lits_done // no literals in this sequence
  18585. LEAQ (AX)(R8*1), R10
  18586. CMPQ R10, CX
  18587. JAE lz4s_snappy_dstfull // literals alone would reach the dst limit
  18588. ADDQ R8, SI // account for the literal bytes
  18589. LEAL -1(R8), R10 // R10 = length - 1, the value stored in the header
  // Pick the literal header size from R10: below 60 it is packed into a single
  // tag byte, otherwise a tag byte (0xf0/0xf4/0xf8/0xfc) is followed by
  // 1/2/3/4 bytes of R10.
  18590. CMPL R10, $0x3c
  18591. JB one_byte_lz4s_snappy
  18592. CMPL R10, $0x00000100
  18593. JB two_bytes_lz4s_snappy
  18594. CMPL R10, $0x00010000
  18595. JB three_bytes_lz4s_snappy
  18596. CMPL R10, $0x01000000
  18597. JB four_bytes_lz4s_snappy
  18598. MOVB $0xfc, (AX) // tag + 32-bit length-1
  18599. MOVL R10, 1(AX)
  18600. ADDQ $0x05, AX
  18601. JMP memmove_long_lz4s_snappy
  18602. four_bytes_lz4s_snappy: // tag + 24-bit length-1 (16-bit store, then bits 16..23)
  18603. MOVL R10, R11
  18604. SHRL $0x10, R11
  18605. MOVB $0xf8, (AX)
  18606. MOVW R10, 1(AX)
  18607. MOVB R11, 3(AX)
  18608. ADDQ $0x04, AX
  18609. JMP memmove_long_lz4s_snappy
  18610. three_bytes_lz4s_snappy: // tag + 16-bit length-1
  18611. MOVB $0xf4, (AX)
  18612. MOVW R10, 1(AX)
  18613. ADDQ $0x03, AX
  18614. JMP memmove_long_lz4s_snappy
  18615. two_bytes_lz4s_snappy: // tag + 8-bit length-1
  18616. MOVB $0xf0, (AX)
  18617. MOVB R10, 1(AX)
  18618. ADDQ $0x02, AX
  18619. CMPL R10, $0x40
  18620. JB memmove_lz4s_snappy // at most 64 literal bytes: short copy
  18621. JMP memmove_long_lz4s_snappy
  18622. one_byte_lz4s_snappy: // header byte = (length-1) << 2
  18623. SHLB $0x02, R10
  18624. MOVB R10, (AX)
  18625. ADDQ $0x01, AX
  18626. memmove_lz4s_snappy: // short literal copy; reached only with R8 <= 64
  18627. LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
  18628. // genMemMoveShort
  18629. CMPQ R8, $0x08
  18630. JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
  18631. CMPQ R8, $0x10
  18632. JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
  18633. CMPQ R8, $0x20
  18634. JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
  18635. JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
  18636. emit_lit_memmove_lz4s_snappy_memmove_move_8: // R8 <= 8: always moves 8 bytes; AX only advances by R8
  18637. MOVQ (DX), R11
  18638. MOVQ R11, (AX)
  18639. JMP memmove_end_copy_lz4s_snappy
  18640. emit_lit_memmove_lz4s_snappy_memmove_move_8through16: // first 8 + last 8 bytes (may overlap)
  18641. MOVQ (DX), R11
  18642. MOVQ -8(DX)(R8*1), DX // DX reused as scratch; reloaded from DI at lz4s_snappy_lits_emit_done
  18643. MOVQ R11, (AX)
  18644. MOVQ DX, -8(AX)(R8*1)
  18645. JMP memmove_end_copy_lz4s_snappy
  18646. emit_lit_memmove_lz4s_snappy_memmove_move_17through32: // first 16 + last 16 bytes (may overlap)
  18647. MOVOU (DX), X0
  18648. MOVOU -16(DX)(R8*1), X1
  18649. MOVOU X0, (AX)
  18650. MOVOU X1, -16(AX)(R8*1)
  18651. JMP memmove_end_copy_lz4s_snappy
  18652. emit_lit_memmove_lz4s_snappy_memmove_move_33through64: // first 32 + last 32 bytes (may overlap)
  18653. MOVOU (DX), X0
  18654. MOVOU 16(DX), X1
  18655. MOVOU -32(DX)(R8*1), X2
  18656. MOVOU -16(DX)(R8*1), X3
  18657. MOVOU X0, (AX)
  18658. MOVOU X1, 16(AX)
  18659. MOVOU X2, -32(AX)(R8*1)
  18660. MOVOU X3, -16(AX)(R8*1)
  18661. memmove_end_copy_lz4s_snappy:
  18662. MOVQ R10, AX // advance dst by exactly R8 bytes
  18663. JMP lz4s_snappy_lits_emit_done
  18664. memmove_long_lz4s_snappy: // long literal copy
  18665. LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
  18666. // genMemMoveLong
  // The first and last 32 source bytes are loaded up front and stored last with
  // unaligned moves; the middle is copied in 32-byte steps using aligned stores.
  18667. MOVOU (DX), X0
  18668. MOVOU 16(DX), X1
  18669. MOVOU -32(DX)(R8*1), X2
  18670. MOVOU -16(DX)(R8*1), X3
  18671. MOVQ R8, R12
  18672. SHRQ $0x05, R12 // R12 = R8 / 32
  18673. MOVQ AX, R11
  18674. ANDL $0x0000001f, R11 // R11 = AX mod 32
  18675. MOVQ $0x00000040, R13
  18676. SUBQ R11, R13 // R13 = 64 - (AX mod 32), so AX + R13 - 32 is 32-byte aligned
  18677. DECQ R12
  // NOTE(review): JA tests ZF from the DECQ together with CF, which DECQ leaves
  // unchanged (CF here is whatever the SUBQ above produced).
  18678. JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
  18679. LEAQ -32(DX)(R13*1), R11
  18680. LEAQ -32(AX)(R13*1), R14
  18681. emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: // pointer-based 32-byte copy (R11 src, R14 dst)
  18682. MOVOU (R11), X4
  18683. MOVOU 16(R11), X5
  18684. MOVOA X4, (R14)
  18685. MOVOA X5, 16(R14)
  18686. ADDQ $0x20, R14
  18687. ADDQ $0x20, R11
  18688. ADDQ $0x20, R13
  18689. DECQ R12
  18690. JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
  18691. emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: // offset-based 32-byte copy at offset R13-32
  18692. MOVOU -32(DX)(R13*1), X4
  18693. MOVOU -16(DX)(R13*1), X5
  18694. MOVOA X4, -32(AX)(R13*1)
  18695. MOVOA X5, -16(AX)(R13*1)
  18696. ADDQ $0x20, R13
  18697. CMPQ R8, R13
  18698. JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 // continue while R8 >= R13
  18699. MOVOU X0, (AX) // write the saved head ...
  18700. MOVOU X1, 16(AX)
  18701. MOVOU X2, -32(AX)(R8*1) // ... and the saved tail
  18702. MOVOU X3, -16(AX)(R8*1)
  18703. MOVQ R10, AX // advance dst by exactly R8 bytes
  18704. lz4s_snappy_lits_emit_done:
  18705. MOVQ DI, DX // src pointer now just past the literal run
  18706. lz4s_snappy_lits_done:
  18707. CMPQ DX, BX
  18708. JNE lz4s_snappy_match
  18709. CMPQ R9, $0x03 // at end of src the token's match nibble must have been 0
  18710. JEQ lz4s_snappy_done
  18711. JMP lz4s_snappy_corrupt
  18712. lz4s_snappy_match:
  18713. CMPQ R9, $0x03
  18714. JEQ lz4s_snappy_loop // match nibble 0: literal-only sequence, go to the next token
  18715. LEAQ 2(DX), DI
  18716. CMPQ DI, BX
  18717. JAE lz4s_snappy_corrupt // no room for the offset plus at least one following byte
  18718. MOVWQZX (DX), R8 // R8 = 16-bit match offset, zero-extended
  18719. MOVQ DI, DX
  18720. TESTQ R8, R8
  18721. JZ lz4s_snappy_corrupt // offset 0 is rejected
  18722. CMPQ R8, SI
  18723. JA lz4s_snappy_corrupt // offset reaches back further than the bytes produced so far
  18724. CMPQ R9, $0x12
  18725. JNE lz4s_snappy_ml_done // nibble below 15 (15 + 3 = 0x12): no extension bytes
  18726. lz4s_snappy_ml_loop: // add extension bytes to R9 while each byte is 0xff
  18727. MOVBQZX (DX), DI
  18728. INCQ DX
  18729. ADDQ DI, R9
  18730. CMPQ DX, BX
  18731. JAE lz4s_snappy_corrupt
  18732. CMPQ DI, $0xff
  18733. JEQ lz4s_snappy_ml_loop
  18734. lz4s_snappy_ml_done:
  18735. ADDQ R9, SI // account for the match bytes
  18736. // emitCopy
  18737. two_byte_offset_lz4_s2: // while more than 64 bytes remain, emit 60-byte copies
  18738. CMPL R9, $0x40
  18739. JBE two_byte_offset_short_lz4_s2
  18740. MOVB $0xee, (AX) // tag 0xee followed by the 16-bit offset
  18741. MOVW R8, 1(AX)
  18742. LEAL -60(R9), R9
  18743. ADDQ $0x03, AX
  18744. CMPQ AX, CX
  // NOTE(review): when AX has reached CX this jumps to the loop head with the
  // rest of the match (R9) unemitted; the checks there then exit through
  // lz4s_snappy_corrupt (if DX >= BX) or lz4s_snappy_dstfull.
  18745. JAE lz4s_snappy_loop
  18746. JMP two_byte_offset_lz4_s2
  18747. two_byte_offset_short_lz4_s2: // final copy, R9 <= 64
  18748. MOVL R9, DI
  18749. SHLL $0x02, DI // DI = length * 4
  18750. CMPL R9, $0x0c
  18751. JAE emit_copy_three_lz4_s2 // length >= 12 needs the 3-byte form
  18752. CMPL R8, $0x00000800
  18753. JAE emit_copy_three_lz4_s2 // offset >= 2048 needs the 3-byte form
  18754. LEAL -15(DI), DI // DI = (length - 4) * 4 + 1
  18755. MOVB R8, 1(AX) // low 8 bits of the offset
  18756. SHRL $0x08, R8
  18757. SHLL $0x05, R8 // offset bits 8 and up moved to bit 5 and up of the tag
  18758. ORL R8, DI
  18759. MOVB DI, (AX)
  18760. ADDQ $0x02, AX
  18761. JMP lz4s_snappy_loop
  18762. emit_copy_three_lz4_s2: // 3-byte copy: tag + 16-bit offset
  18763. LEAL -2(DI), DI // DI = length * 4 - 2
  18764. MOVB DI, (AX)
  18765. MOVW R8, 1(AX)
  18766. ADDQ $0x03, AX
  18767. JMP lz4s_snappy_loop
  18768. lz4s_snappy_done: // success: return SI and the number of dst bytes written
  18769. MOVQ dst_base+0(FP), CX
  18770. SUBQ CX, AX
  18771. MOVQ SI, uncompressed+48(FP)
  18772. MOVQ AX, dstUsed+56(FP)
  18773. RET
  18774. lz4s_snappy_corrupt: // uncompressed = -1; dstUsed is not stored
  18775. XORQ AX, AX
  18776. LEAQ -1(AX), SI
  18777. MOVQ SI, uncompressed+48(FP)
  18778. RET
  18779. lz4s_snappy_dstfull: // uncompressed = -2; dstUsed is not stored
  18780. XORQ AX, AX
  18781. LEAQ -2(AX), SI
  18782. MOVQ SI, uncompressed+48(FP)
  18783. RET