decode_arm64.s

// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 R2
#define R_TMP1 R3
#define R_LEN R4
#define R_OFF R5
#define R_SRC R6
#define R_DST R7
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// TEST_SRC checks that R_SRC <= R_SEND, branching to errCorrupt otherwise.
#define TEST_SRC() \
	CMP R_SEND, R_SRC \
	BGT errCorrupt

// An equivalent bounds check, kept here for reference:
// MOVD R_SRC, R_TMP1
// SUB R_SBASE, R_TMP1, R_TMP1
// CMP R_SLEN, R_TMP1
// BGT errCorrupt

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - R_TMP0  scratch
// - R_TMP1  scratch
// - R_LEN   length or x
// - R_OFF   offset
// - R_SRC   &src[s]
// - R_DST   &dst[d]
// + R_DBASE dst_base
// + R_DLEN  dst_len
// + R_DEND  dst_base + dst_len
// + R_SBASE src_base
// + R_SLEN  src_len
// + R_SEND  src_base + src_len
// - R_TMP2  used by doCopy
// - R_TMP3  used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
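
// As a rough, illustrative Go-level sketch (not the literal decode_other.go
// source), the loop implemented below looks like:
//
//	for s < len(src) {
//		switch src[s] & 0x03 {
//		case tagLiteral:
//			// Decode a literal length x, then copy length bytes
//			// from src[s:] to dst[d:].
//		default: // tagCopy1, tagCopy2, tagCopy4
//			// Decode (length, offset), then copy length bytes
//			// from dst[d-offset:] to dst[d:].
//		}
//	}
//	if d != len(dst) {
//		return decodeErrCodeCorrupt
//	}
//	return 0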

TEXT ·s2Decode(SB), NOSPLIT, $56-64
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVD dst_base+0(FP), R_DBASE
	MOVD dst_len+8(FP), R_DLEN
	MOVD R_DBASE, R_DST
	MOVD R_DBASE, R_DEND
	ADD R_DLEN, R_DEND, R_DEND
	MOVD src_base+24(FP), R_SBASE
	MOVD src_len+32(FP), R_SLEN
	MOVD R_SBASE, R_SRC
	MOVD R_SBASE, R_SEND
	ADD R_SLEN, R_SEND, R_SEND
	MOVD $0, R_OFF

loop:
	// for s < len(src)
	CMP R_SEND, R_SRC
	BEQ end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBU (R_SRC), R_LEN
	MOVW R_LEN, R_TMP1
	ANDW $3, R_TMP1
	MOVW $1, R1
	CMPW R1, R_TMP1
	BGE tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	MOVW $60, R1
	LSRW $2, R_LEN, R_LEN
	CMPW R_LEN, R1
	BLS tagLit60Plus

	// case x < 60:
	// s++
	ADD $1, R_SRC, R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	ADD $1, R_LEN, R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVD R_DEND, R_TMP0
	SUB R_DST, R_TMP0, R_TMP0
	MOVD R_SEND, R_TMP1
	SUB R_SRC, R_TMP1, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMP $16, R_LEN
	BGT callMemmove
	CMP $16, R_TMP0
	BLT callMemmove
	CMP $16, R_TMP1
	BLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
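	// In Go terms, the fast path below is roughly (illustrative only; the
	// deliberate 16-byte over-copy has no direct Go equivalent):
	//
	//	copy(dst[d:d+16], src[s:s+16])
	//	d += length
	//	s += length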
	LDP 0(R_SRC), (R_TMP2, R_TMP3)
	STP (R_TMP2, R_TMP3), 0(R_DST)

	// d += length
	// s += length
	ADD R_LEN, R_DST, R_DST
	ADD R_LEN, R_SRC, R_SRC
	B loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMP R_TMP0, R_LEN
	BGT errCorrupt
	CMP R_TMP1, R_LEN
	BGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill
	// those three registers to the stack, to save local variables across the CALL.
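	// Stack layout for the call below: slots 8, 16 and 24 from RSP hold
	// memmove's to, from and n arguments; slots 32, 40, 48 and 56 spill
	// R_DST, R_SRC, R_LEN and R_OFF so they survive the CALL.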
	MOVD R_DST, 8(RSP)
	MOVD R_SRC, 16(RSP)
	MOVD R_LEN, 24(RSP)
	MOVD R_DST, 32(RSP)
	MOVD R_SRC, 40(RSP)
	MOVD R_LEN, 48(RSP)
	MOVD R_OFF, 56(RSP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVD 32(RSP), R_DST
	MOVD 40(RSP), R_SRC
	MOVD 48(RSP), R_LEN
	MOVD 56(RSP), R_OFF
	MOVD dst_base+0(FP), R_DBASE
	MOVD dst_len+8(FP), R_DLEN
	MOVD R_DBASE, R_DEND
	ADD R_DLEN, R_DEND, R_DEND
	MOVD src_base+24(FP), R_SBASE
	MOVD src_len+32(FP), R_SLEN
	MOVD R_SBASE, R_SEND
	ADD R_SLEN, R_SEND, R_SEND

	// d += length
	// s += length
	ADD R_LEN, R_DST, R_DST
	ADD R_LEN, R_SRC, R_SRC
	B loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
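	// For example, x == 60 yields s += 2 (the tag byte plus one length byte),
	// and x == 63 yields s += 5 (the tag byte plus four length bytes), matching
	// the per-case increments in the pure Go code.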
	ADD R_LEN, R_SRC, R_SRC
	SUB $58, R_SRC, R_SRC
	TEST_SRC()

	// case x == 60:
	MOVW $61, R1
	CMPW R1, R_LEN
	BEQ tagLit61
	BGT tagLit62Plus

	// x = uint32(src[s-1])
	MOVBU -1(R_SRC), R_LEN
	B doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVHU -2(R_SRC), R_LEN
	B doLit

tagLit62Plus:
	CMPW $62, R_LEN
	BHI tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVHU -3(R_SRC), R_LEN
	MOVBU -1(R_SRC), R_TMP1
	ORR R_TMP1<<16, R_LEN
	B doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVWU -4(R_SRC), R_LEN
	B doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADD $5, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	MOVD R_SRC, R_TMP1
	SUB R_SBASE, R_TMP1, R_TMP1
	CMP R_SLEN, R_TMP1
	BGT errCorrupt

	// length = 1 + int(src[s-5])>>2
	MOVD $1, R1
	ADD R_LEN>>2, R1, R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVWU -4(R_SRC), R_OFF
	B doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADD $3, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = 1 + int(src[s-3])>>2
	MOVD $1, R1
	ADD R_LEN>>2, R1, R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVHU -2(R_SRC), R_OFF
	B doCopy

tagCopy:
	// We have a copy tag. We assume that:
	// - R_TMP1 == src[s] & 0x03
	// - R_LEN == src[s]
	CMP $2, R_TMP1
	BEQ tagCopy2
	BGT tagCopy4

	// case tagCopy1:
	// s += 2
	ADD $2, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// Calculate the offset in R_TMP0 in case it is a repeat.
	MOVD R_LEN, R_TMP0
	AND $0xe0, R_TMP0
	MOVBU -1(R_SRC), R_TMP1
	ORR R_TMP0<<3, R_TMP1, R_TMP0

	// length = 4 + int(src[s-2])>>2&0x7
	MOVD $7, R1
	AND R_LEN>>2, R1, R_LEN
	ADD $4, R_LEN, R_LEN

	// Check whether this is a repeat code, signalled by offset 0.
	CMP $0, R_TMP0
	BEQ repeatCode

	// This is a regular copy: transfer the temporary value to R_OFF (offset).
	MOVD R_TMP0, R_OFF
	B doCopy

	// This is a repeat code.
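	// As an illustrative Go-level sketch (the pure Go source may differ), the
	// repeat handling below always reuses the previous offset; a decoded base
	// length of 9, 10 or 11 signals one, two or three extra length bytes,
	// otherwise the base length is used directly:
	//
	//	switch {
	//	case length < 9: // use length as decoded above
	//	case length == 9:
	//		s += 1
	//		length = int(src[s-1]) + 8
	//	case length == 10:
	//		s += 2
	//		length = int(uint32(src[s-2])|uint32(src[s-1])<<8) + 260
	//	default: // length == 11
	//		s += 3
	//		length = int(uint32(src[s-3])|uint32(src[s-2])<<8|uint32(src[s-1])<<16) + 65540
	//	}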
repeatCode:
	// If length < 9, reuse the last offset, with the length already calculated.
	CMP $9, R_LEN
	BLT doCopyRepeat
	BEQ repeatLen1
	CMP $10, R_LEN
	BEQ repeatLen2

repeatLen3:
	// s += 3
	ADD $3, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
	MOVBU -1(R_SRC), R_TMP0
	MOVHU -3(R_SRC), R_LEN
	ORR R_TMP0<<16, R_LEN, R_LEN
	ADD $65540, R_LEN, R_LEN
	B doCopyRepeat

repeatLen2:
	// s += 2
	ADD $2, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
	MOVHU -2(R_SRC), R_LEN
	ADD $260, R_LEN, R_LEN
	B doCopyRepeat

repeatLen1:
	// s += 1
	ADD $1, R_SRC, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	TEST_SRC()

	// length = src[s-1] + 8
	MOVBU -1(R_SRC), R_LEN
	ADD $8, R_LEN, R_LEN
	B doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	// - R_LEN == length && R_LEN > 0
	// - R_OFF == offset

	// if d < offset { etc }
	MOVD R_DST, R_TMP1
	SUB R_DBASE, R_TMP1, R_TMP1
	CMP R_OFF, R_TMP1
	BLT errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMP $0, R_OFF
	BLE errCorrupt

	// if length > len(dst)-d { etc }
	MOVD R_DEND, R_TMP1
	SUB R_DST, R_TMP1, R_TMP1
	CMP R_TMP1, R_LEN
	BGT errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	// - R_TMP2 = len(dst)-d
	// - R_TMP3 = &dst[d-offset]
	MOVD R_DEND, R_TMP2
	SUB R_DST, R_TMP2, R_TMP2
	MOVD R_DST, R_TMP3
	SUB R_OFF, R_TMP3, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMP $16, R_LEN
	BGT slowForwardCopy
	CMP $8, R_OFF
	BLT slowForwardCopy
	CMP $16, R_TMP2
	BLT slowForwardCopy
	MOVD 0(R_TMP3), R_TMP0
	MOVD R_TMP0, 0(R_DST)
	MOVD 8(R_TMP3), R_TMP1
	MOVD R_TMP1, 8(R_DST)
	ADD R_LEN, R_DST, R_DST
	B loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//   abxxxxxxxxxxxx
	//   [------]           d-offset
	//     [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//   ababxxxxxxxxxx
	//   [------]           d-offset
	//       [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUB $10, R_TMP2, R_TMP2
	CMP R_TMP2, R_LEN
	BGT verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here on.
	MOVD R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
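	// An illustrative trace (not from the original source): with offset == 2
	// and length == 20, the first pass copies 8 bytes and leaves length == 18,
	// offset == 4; the second pass leaves length == 14, offset == 8. Offset is
	// now >= 8, so fixUpSlowForwardCopy runs and finishSlowForwardCopy finishes
	// the job with plain 8-byte copies.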
	CMP $8, R_TMP2
	BGE fixUpSlowForwardCopy
	MOVD (R_TMP3), R_TMP1
	MOVD R_TMP1, (R_DST)
	SUB R_TMP2, R_LEN, R_LEN
	ADD R_TMP2, R_DST, R_DST
	ADD R_TMP2, R_TMP2, R_TMP2
	B makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so
	// that, if length is positive, copying the remaining length bytes will
	// write to the right place.
	MOVD R_DST, R_TMP0
	ADD R_LEN, R_DST, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	MOVD $0, R1
	CMP R1, R_LEN
	BLE loop
	MOVD (R_TMP3), R_TMP1
	MOVD R_TMP1, (R_TMP0)
	ADD $8, R_TMP3, R_TMP3
	ADD $8, R_TMP0, R_TMP0
	SUB $8, R_LEN, R_LEN
	B finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	ADD $1, R_TMP3, R_TMP3
	ADD $1, R_DST, R_DST
	SUB $1, R_LEN, R_LEN
	CBNZ R_LEN, verySlowForwardCopy
	B loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMP R_DEND, R_DST
	BNE errCorrupt

	// return 0
	MOVD $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVD $1, R_TMP0
	MOVD R_TMP0, ret+48(FP)
	RET
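
// On the Go side, a declaration along these lines is assumed to accompany this
// file (illustrative; the exact companion file and pragmas are assumptions):
//
//	//go:noescape
//	func s2Decode(dst, src []byte) int
//
// A zero return value means success; a non-zero value (decodeErrCodeCorrupt,
// which is 1 here) reports corrupt input.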