- // +build !appengine
- // +build gc
- // +build !noasm
- #include "go_asm.h"
- #include "textflag.h"
- // AX scratch
- // BX scratch
- // CX literal and match lengths
- // DX token, match offset
- //
- // DI &dst
- // SI &src
- // R8 &dst + len(dst)
- // R9 &src + len(src)
- // R11 &dst
- // R12 short output end
- // R13 short input end
- // R14 &dict
- // R15 len(dict)
- // func decodeBlock(dst, src, dict []byte) int
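- // Returns the number of bytes written to dst, or a negative error code:
- // -1 corrupt input, -2 dst or src too short, -3 dict too short.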
- TEXT ·decodeBlock(SB), NOSPLIT, $48-80
- MOVQ dst_base+0(FP), DI
- MOVQ DI, R11
- MOVQ dst_len+8(FP), R8
- ADDQ DI, R8
- MOVQ src_base+24(FP), SI
- MOVQ src_len+32(FP), R9
- CMPQ R9, $0
- JE err_corrupt
- ADDQ SI, R9
- MOVQ dict_base+48(FP), R14
- MOVQ dict_len+56(FP), R15
- // shortcut ends
- // short output end
- MOVQ R8, R12
- SUBQ $32, R12
- // short input end
- MOVQ R9, R13
- SUBQ $16, R13
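- // The 32/16 margins cover the worst case of the copy shortcut below:
- // it may write up to di+14+18 = di+32 bytes (a 16-byte literal over-copy,
- // then an 18-byte match over-copy) and read up to si+14+2 = si+16 bytes.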
- XORL CX, CX
- loop:
- // token := uint32(src[si])
- MOVBLZX (SI), DX
- INCQ SI
- // lit_len = token >> 4
- // if lit_len > 0
- // CX = lit_len
- MOVL DX, CX
- SHRL $4, CX
- // if lit_len != 0xF
- CMPL CX, $0xF
- JEQ lit_len_loop
- CMPQ DI, R12
- JAE copy_literal
- CMPQ SI, R13
- JAE copy_literal
- // copy shortcut
- // A two-stage shortcut for the most common case:
- // 1) If the literal length is 0..14, and there is enough space,
- // enter the shortcut and copy 16 bytes on behalf of the literals
- // (in the fast mode, only 8 bytes can be safely copied this way).
- // 2) Further if the match length is 4..18, copy 18 bytes in a similar
- // manner; but we ensure that there's enough space in the output for
- // those 18 bytes earlier, upon entering the shortcut (in other words,
- // there is a combined check for both stages).
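- // Illustratively (Go-flavoured pseudocode, not code from this package):
- //  if litLen <= 14 && di < len(dst)-32 && si < len(src)-16 {
- //      stage 1: copy 16 bytes from src[si:] to dst[di:]   // only litLen of them count
- //      di, si = di+litLen, si+litLen; read the 2-byte offset
- //      if matchLen nibble != 15 && offset >= 8 && match lies inside dst {
- //          stage 2: copy 8+8+2 bytes from dst[di-offset:] // only matchLen+minMatch count
- //          di += matchLen + minMatch; continue with the next token
- //      }
- //  }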
- // copy literal
- MOVOU (SI), X0
- MOVOU X0, (DI)
- ADDQ CX, DI
- ADDQ CX, SI
- MOVL DX, CX
- ANDL $0xF, CX
- // The second stage: prepare for match copying, decode full info.
- // If it doesn't work out, the info won't be wasted.
- // offset := uint16(src[si]) | uint16(src[si+1])<<8
- MOVWLZX (SI), DX
- TESTL DX, DX
- JE err_corrupt
- ADDQ $2, SI
- JC err_short_buf
- MOVQ DI, AX
- SUBQ DX, AX
- JC err_corrupt
- CMPQ AX, DI
- JA err_short_buf
- // if we can't do the second stage then jump straight to read the
- // match length, we already have the offset.
- CMPL CX, $0xF
- JEQ match_len_loop_pre
- CMPL DX, $8
- JLT match_len_loop_pre
- CMPQ AX, R11
- JB match_len_loop_pre
- // memcpy(op + 0, match + 0, 8);
- MOVQ (AX), BX
- MOVQ BX, (DI)
- // memcpy(op + 8, match + 8, 8);
- MOVQ 8(AX), BX
- MOVQ BX, 8(DI)
- // memcpy(op +16, match +16, 2);
- MOVW 16(AX), BX
- MOVW BX, 16(DI)
- LEAQ const_minMatch(DI)(CX*1), DI
- // shortcut complete, load next token
- JMP loopcheck
- // Read the rest of the literal length:
- // do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
- lit_len_loop:
- CMPQ SI, R9
- JAE err_short_buf
- MOVBLZX (SI), BX
- INCQ SI
- ADDQ BX, CX
- CMPB BX, $0xFF
- JE lit_len_loop
- copy_literal:
- // bounds check src and dst
- MOVQ SI, AX
- ADDQ CX, AX
- JC err_short_buf
- CMPQ AX, R9
- JA err_short_buf
- MOVQ DI, BX
- ADDQ CX, BX
- JC err_short_buf
- CMPQ BX, R8
- JA err_short_buf
- // Copy literals of <=48 bytes through the XMM registers.
- CMPQ CX, $48
- JGT memmove_lit
- // if len(dst[di:]) < 48
- MOVQ R8, AX
- SUBQ DI, AX
- CMPQ AX, $48
- JLT memmove_lit
- // if len(src[si:]) < 48
- MOVQ R9, BX
- SUBQ SI, BX
- CMPQ BX, $48
- JLT memmove_lit
- MOVOU (SI), X0
- MOVOU 16(SI), X1
- MOVOU 32(SI), X2
- MOVOU X0, (DI)
- MOVOU X1, 16(DI)
- MOVOU X2, 32(DI)
- ADDQ CX, SI
- ADDQ CX, DI
- JMP finish_lit_copy
- memmove_lit:
- // memmove(to, from, len)
- MOVQ DI, 0(SP)
- MOVQ SI, 8(SP)
- MOVQ CX, 16(SP)
- // Spill registers. Increment SI, DI now so we don't need to save CX.
- ADDQ CX, DI
- ADDQ CX, SI
- MOVQ DI, 24(SP)
- MOVQ SI, 32(SP)
- MOVL DX, 40(SP)
- CALL runtime·memmove(SB)
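- // The call may clobber any general-purpose register, so everything that is
- // still live is spilled above and reloaded or recomputed below.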
- // restore registers
- MOVQ 24(SP), DI
- MOVQ 32(SP), SI
- MOVL 40(SP), DX
- // recalc initial values
- MOVQ dst_base+0(FP), R8
- MOVQ R8, R11
- ADDQ dst_len+8(FP), R8
- MOVQ src_base+24(FP), R9
- ADDQ src_len+32(FP), R9
- MOVQ dict_base+48(FP), R14
- MOVQ dict_len+56(FP), R15
- MOVQ R8, R12
- SUBQ $32, R12
- MOVQ R9, R13
- SUBQ $16, R13
- finish_lit_copy:
- // CX := mLen
- // free up DX to use for offset
- MOVL DX, CX
- ANDL $0xF, CX
- CMPQ SI, R9
- JAE end
- // offset
- // si += 2
- // DX := int(src[si-2]) | int(src[si-1])<<8
- ADDQ $2, SI
- JC err_short_buf
- CMPQ SI, R9
- JA err_short_buf
- MOVWQZX -2(SI), DX
- // 0 offset is invalid
- TESTL DX, DX
- JEQ err_corrupt
- match_len_loop_pre:
- // if mlen != 0xF
- CMPB CX, $0xF
- JNE copy_match
- // do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
- match_len_loop:
- CMPQ SI, R9
- JAE err_short_buf
- MOVBLZX (SI), BX
- INCQ SI
- ADDQ BX, CX
- CMPB BX, $0xFF
- JE match_len_loop
- copy_match:
- ADDQ $const_minMatch, CX
- // check we have match_len bytes left in dst
- // di+match_len < len(dst)
- MOVQ DI, AX
- ADDQ CX, AX
- JC err_short_buf
- CMPQ AX, R8
- JA err_short_buf
- // DX = offset
- // CX = match_len
- // BX = &dst + (di - offset)
- MOVQ DI, BX
- SUBQ DX, BX
- // check BX is within dst
- // if BX < &dst
- JC copy_match_from_dict
- CMPQ BX, R11
- JBE copy_match_from_dict
- // if (di - offset) + match_len < di, i.e. offset > match_len, the copy cannot overlap
- LEAQ (BX)(CX*1), AX
- CMPQ DI, AX
- JA copy_interior_match
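- // Otherwise offset <= match_len: the match overlaps the bytes being written,
- // and the byte-by-byte loop below reproduces LZ4's repeating-pattern
- // semantics for overlapping copies.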
- // Not implemented (sketch of a possible wider-copy scheme):
- //   AX := len(dst[:di])   (MOVQ DI, AX; SUBQ R11, AX)
- //   copy 16 bytes at a time;
- //   if di-offset < 16, copy 16-(di-offset) bytes to di, then do the remaining.
- copy_match_loop:
- // for match_len > 0
- // dst[di] = dst[i]
- // di++
- // i++
- MOVB (BX), AX
- MOVB AX, (DI)
- INCQ DI
- INCQ BX
- DECQ CX
- JNZ copy_match_loop
- JMP loopcheck
- copy_interior_match:
- CMPQ CX, $16
- JGT memmove_match
- // if len(dst[di:]) < 16
- MOVQ R8, AX
- SUBQ DI, AX
- CMPQ AX, $16
- JLT memmove_match
- MOVOU (BX), X0
- MOVOU X0, (DI)
- ADDQ CX, DI
- XORL CX, CX
- JMP loopcheck
- copy_match_from_dict:
- // CX = match_len
- // BX = &dst + (di - offset)
- // AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
- MOVQ R11, AX
- SUBQ BX, AX
- // BX = len(dict) - dict_bytes_available
- MOVQ R15, BX
- SUBQ AX, BX
- JS err_short_dict
- ADDQ R14, BX
- // if match_len < dict_bytes_available, the match fits entirely within the external dictionary: just copy
- CMPQ CX, AX
- JLT memmove_match
- // The match stretches over the dictionary and our block
- // 1) copy what comes from the dictionary
- // AX = dict_bytes_available = copy_size
- // BX = &dict_end - copy_size
- // CX = match_len
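- // Roughly (illustrative names, not code from this package):
- //  n := copy(dst[di:di+dictAvail], dict[len(dict)-dictAvail:])
- //  di += n; matchLen -= n
- //  // the rest comes from dst[0:], byte by byte if it overlaps dst[di:]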
- // memmove(to, from, len)
- MOVQ DI, 0(SP)
- MOVQ BX, 8(SP)
- MOVQ AX, 16(SP)
- // spill the registers we still need after the call
- MOVQ DI, 24(SP)
- MOVQ SI, 32(SP)
- MOVQ CX, 40(SP)
- CALL runtime·memmove(SB)
- // restore registers
- MOVQ 16(SP), AX // copy_size
- MOVQ 24(SP), DI
- MOVQ 32(SP), SI
- MOVQ 40(SP), CX // match_len
- // recalc initial values
- MOVQ dst_base+0(FP), R8
- MOVQ R8, R11 // TODO: make these sensible numbers
- ADDQ dst_len+8(FP), R8
- MOVQ src_base+24(FP), R9
- ADDQ src_len+32(FP), R9
- MOVQ dict_base+48(FP), R14
- MOVQ dict_len+56(FP), R15
- MOVQ R8, R12
- SUBQ $32, R12
- MOVQ R9, R13
- SUBQ $16, R13
- // di+=copy_size
- ADDQ AX, DI
- // 2) copy the rest from the current block
- // CX = match_len - copy_size = rest_size
- SUBQ AX, CX
- MOVQ R11, BX
- // check if we have a copy overlap
- // AX = &dst + rest_size
- MOVQ CX, AX
- ADDQ BX, AX
- // if &dst + rest_size > di, copy byte by byte
- CMPQ AX, DI
- JA copy_match_loop
- memmove_match:
- // memmove(to, from, len)
- MOVQ DI, 0(SP)
- MOVQ BX, 8(SP)
- MOVQ CX, 16(SP)
- // Spill registers. Increment DI now so we don't need to save CX.
- ADDQ CX, DI
- MOVQ DI, 24(SP)
- MOVQ SI, 32(SP)
- CALL runtime·memmove(SB)
- // restore registers
- MOVQ 24(SP), DI
- MOVQ 32(SP), SI
- // recalc initial values
- MOVQ dst_base+0(FP), R8
- MOVQ R8, R11 // TODO: make these sensible numbers
- ADDQ dst_len+8(FP), R8
- MOVQ src_base+24(FP), R9
- ADDQ src_len+32(FP), R9
- MOVQ R8, R12
- SUBQ $32, R12
- MOVQ R9, R13
- SUBQ $16, R13
- MOVQ dict_base+48(FP), R14
- MOVQ dict_len+56(FP), R15
- XORL CX, CX
- loopcheck:
- // for si < len(src)
- CMPQ SI, R9
- JB loop
- end:
- // Remaining length must be zero.
- TESTQ CX, CX
- JNE err_corrupt
- SUBQ R11, DI
- MOVQ DI, ret+72(FP)
- RET
- err_corrupt:
- MOVQ $-1, ret+72(FP)
- RET
- err_short_buf:
- MOVQ $-2, ret+72(FP)
- RET
- err_short_dict:
- MOVQ $-3, ret+72(FP)
- RET
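- // A rough Go-flavoured sketch of the whole loop above (illustrative only:
- // decodeBlockGo is a hypothetical name, the shortcut and SIMD fast paths are
- // omitted, and only the return codes -1/-2/-3 mirror the labels above):
- //
- //  func decodeBlockGo(dst, src, dict []byte) int {
- //      const minMatch = 4
- //      di, si := 0, 0
- //      for si < len(src) {
- //          token := int(src[si])
- //          si++
- //          // literal length, extended by 0xFF bytes
- //          litLen := token >> 4
- //          if litLen == 0xF {
- //              for {
- //                  if si >= len(src) {
- //                      return -2
- //                  }
- //                  b := int(src[si])
- //                  si++
- //                  litLen += b
- //                  if b != 0xFF {
- //                      break
- //                  }
- //              }
- //          }
- //          if si+litLen > len(src) || di+litLen > len(dst) {
- //              return -2
- //          }
- //          copy(dst[di:], src[si:si+litLen])
- //          di, si = di+litLen, si+litLen
- //          if si >= len(src) {
- //              if token&0xF != 0 {
- //                  return -1 // a block ends with a literal-only sequence
- //              }
- //              return di
- //          }
- //          // 2-byte little-endian offset; zero is invalid
- //          if si+2 > len(src) {
- //              return -2
- //          }
- //          offset := int(src[si]) | int(src[si+1])<<8
- //          si += 2
- //          if offset == 0 {
- //              return -1
- //          }
- //          // match length, extended by 0xFF bytes, plus minMatch
- //          mLen := token & 0xF
- //          if mLen == 0xF {
- //              for {
- //                  if si >= len(src) {
- //                      return -2
- //                  }
- //                  b := int(src[si])
- //                  si++
- //                  mLen += b
- //                  if b != 0xFF {
- //                      break
- //                  }
- //              }
- //          }
- //          mLen += minMatch
- //          if di+mLen > len(dst) {
- //              return -2
- //          }
- //          if offset > di {
- //              // the match starts inside the external dictionary
- //              fromDict := offset - di
- //              if fromDict > len(dict) {
- //                  return -3
- //              }
- //              n := fromDict
- //              if mLen < n {
- //                  n = mLen
- //              }
- //              copy(dst[di:di+n], dict[len(dict)-fromDict:])
- //              di, mLen = di+n, mLen-n
- //          }
- //          // byte-by-byte copy gives the right result for overlapping matches
- //          for ; mLen > 0; mLen-- {
- //              dst[di] = dst[di-offset]
- //              di++
- //          }
- //      }
- //      return di
- //  }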