// Copyright (c) 2022+ Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package s2

import (
	"bytes"
	"encoding/binary"
	"sync"
)

const (
	// MinDictSize is the minimum dictionary size when repeat has been read.
	MinDictSize = 16

	// MaxDictSize is the maximum dictionary size when repeat has been read.
	MaxDictSize = 65536

	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
	MaxDictSrcOffset = 65535
)

// Dict contains a dictionary that can be used for encoding and decoding s2 blocks.
type Dict struct {
	dict   []byte
	repeat int // Repeat as index of dict

	fast, better, best sync.Once
	fastTable          *[1 << 14]uint16

	betterTableShort *[1 << 14]uint16
	betterTableLong  *[1 << 17]uint16

	bestTableShort *[1 << 16]uint32
	bestTableLong  *[1 << 19]uint32
}

// NewDict will read a dictionary.
// It will return nil if the dictionary is invalid.
func NewDict(dict []byte) *Dict {
	if len(dict) == 0 {
		return nil
	}
	var d Dict
	// Repeat is the first value of the dict.
	r, n := binary.Uvarint(dict)
	if n <= 0 {
		return nil
	}
	dict = dict[n:]
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
		return nil
	}
	d.repeat = int(r)
	if d.repeat > len(dict) {
		return nil
	}
	return &d
}

// Bytes will return a serialized version of the dictionary.
// The output can be sent to NewDict.
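//
// The serialized form is the uvarint-encoded repeat index followed by the raw
// dictionary bytes. A minimal round-trip sketch (sampleData is a hypothetical
// byte slice of representative content):
//
//	d := MakeDict(sampleData, nil)
//	blob := d.Bytes()   // uvarint(repeat) + dictionary bytes
//	d2 := NewDict(blob) // nil if the serialized dictionary is rejected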
func (d *Dict) Bytes() []byte {
	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
}

// MakeDict will create a dictionary.
// 'data' must be at least MinDictSize.
// If data is longer than MaxDictSize, only the last MaxDictSize bytes will be used.
// If searchStart is set, the initial repeat value will be set to the last
// match of this content in data.
// If no match is found, it will attempt to find shorter matches.
// searchStart should match the typical start of a block.
// If at least 4 bytes cannot be matched, repeat is left at the start of the dictionary data.
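//
// A usage sketch; trainingData and the header prefix are hypothetical:
//
//	// Blocks in this hypothetical format usually start with the same header,
//	// so seed the first repeat at its last occurrence in the dictionary.
//	d := MakeDict(trainingData, []byte(`{"timestamp":`))
//	if d == nil {
//		// trainingData was empty or shorter than MinDictSize.
//	}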
func MakeDict(data []byte, searchStart []byte) *Dict {
	if len(data) == 0 {
		return nil
	}
	if len(data) > MaxDictSize {
		data = data[len(data)-MaxDictSize:]
	}
	var d Dict
	dict := data
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize {
		return nil
	}

	// Find the longest match possible, last entry if multiple.
	for s := len(searchStart); s > 4; s-- {
		if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
			d.repeat = idx
			break
		}
	}

	return &d
}

// MakeDictManual will create a dictionary.
// 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
// A manual first repeat index into data must be provided.
// It must be less than len(data)-8.
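//
// A sketch, assuming the caller already knows a good starting offset
// (data and the offset are hypothetical):
//
//	// Point the initial repeat at offset 100 of the dictionary data.
//	d := MakeDictManual(data, 100)
//	if d == nil {
//		// data was outside [MinDictSize, MaxDictSize] or firstIdx >= len(data)-8.
//	}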
func MakeDictManual(data []byte, firstIdx uint16) *Dict {
	if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize {
		return nil
	}
	var d Dict
	dict := data
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}

	d.repeat = int(firstIdx)
	return &d
}

// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks require the same amount of memory to decode as was used to encode them,
// and do not allow concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
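//
// A round-trip sketch; serializedDict and payload are hypothetical, and error
// handling is abbreviated:
//
//	d := NewDict(serializedDict)
//	block := d.Encode(nil, payload)
//	decoded, err := d.Decode(nil, block)
//	if err != nil {
//		// The block was malformed (ErrCorrupt).
//	}
//	_ = decoded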
func (d *Dict) Encode(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if cap(dst) < n {
		dst = make([]byte, n)
	} else {
		dst = dst[:n]
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockDictGo(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible.
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}

// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// EncodeBetter compresses better than Encode but typically with a
// 10-40% speed decrease on both compression and decompression.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks require the same amount of memory to decode as was used to encode them,
// and do not allow concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
func (d *Dict) EncodeBetter(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if len(dst) < n {
		dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockBetterDict(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible.
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}

// EncodeBest returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// EncodeBest compresses as well as reasonably possible, but with a
// large speed decrease.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
//
// Blocks require the same amount of memory to decode as was used to encode them,
// and do not allow concurrent decoding.
// Also note that blocks do not contain CRC information, so corruption may be undetected.
//
// If you need to encode larger amounts of data, consider using
// the streaming interface, which provides all of these features.
func (d *Dict) EncodeBest(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
		panic(ErrTooLarge)
	} else if len(dst) < n {
		dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
		return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
		dstP += emitLiteral(dst[dstP:], src)
		return dst[:dstP]
	}
	n := encodeBlockBest(dst[dstP:], src, d)
	if n > 0 {
		dstP += n
		return dst[:dstP]
	}
	// Not compressible.
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
}

// Decode returns the decoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire decoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
	dLen, s, err := decodedLen(src)
	if err != nil {
		return nil, err
	}
	if dLen <= cap(dst) {
		dst = dst[:dLen]
	} else {
		dst = make([]byte, dLen)
	}
	if s2DecodeDict(dst, src[s:], d) != 0 {
		return nil, ErrCorrupt
	}
	return dst, nil
}

func (d *Dict) initFast() {
	d.fast.Do(func() {
		const (
			tableBits    = 14
			maxTableSize = 1 << tableBits
		)

		var table [maxTableSize]uint16
		// We stop so any entry of length 8 can always be read.
		// Each iteration loads 8 bytes and indexes the three overlapping
		// sequences starting at i, i+1 and i+2, so every usable dictionary
		// offset ends up in the table with one load per three positions.
		for i := 0; i < len(d.dict)-8-2; i += 3 {
			x0 := load64(d.dict, i)
			h0 := hash6(x0, tableBits)
			h1 := hash6(x0>>8, tableBits)
			h2 := hash6(x0>>16, tableBits)
			table[h0] = uint16(i)
			table[h1] = uint16(i + 1)
			table[h2] = uint16(i + 2)
		}
		d.fastTable = &table
	})
}

func (d *Dict) initBetter() {
	d.better.Do(func() {
		const (
			// Long hash matches.
			lTableBits    = 17
			maxLTableSize = 1 << lTableBits

			// Short hash matches.
			sTableBits    = 14
			maxSTableSize = 1 << sTableBits
		)

		var lTable [maxLTableSize]uint16
		var sTable [maxSTableSize]uint16

		// We stop so any entry of length 8 can always be read.
		// Every dictionary offset is indexed in both tables; later entries
		// overwrite earlier ones, so each slot holds the most recent
		// position with that hash.
		for i := 0; i < len(d.dict)-8; i++ {
			cv := load64(d.dict, i)
			lTable[hash7(cv, lTableBits)] = uint16(i)
			sTable[hash4(cv, sTableBits)] = uint16(i)
		}
		d.betterTableShort = &sTable
		d.betterTableLong = &lTable
	})
}

func (d *Dict) initBest() {
	d.best.Do(func() {
		const (
			// Long hash matches.
			lTableBits    = 19
			maxLTableSize = 1 << lTableBits

			// Short hash matches.
			sTableBits    = 16
			maxSTableSize = 1 << sTableBits
		)

		var lTable [maxLTableSize]uint32
		var sTable [maxSTableSize]uint32

		// We stop so any entry of length 8 can always be read.
		// Each 32-bit slot packs two candidate offsets: the newest position
		// in the low 16 bits and the previous candidate shifted into the
		// high 16 bits.
		for i := 0; i < len(d.dict)-8; i++ {
			cv := load64(d.dict, i)
			hashL := hash8(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL := lTable[hashL]
			candidateS := sTable[hashS]
			lTable[hashL] = uint32(i) | candidateL<<16
			sTable[hashS] = uint32(i) | candidateS<<16
		}
		d.bestTableShort = &sTable
		d.bestTableLong = &lTable
	})
}