// Copyright (c) 2022+ Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package s2

import (
	"bytes"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io"
	"sort"
)

const (
	S2IndexHeader   = "s2idx\x00"
	S2IndexTrailer  = "\x00xdi2s"
	maxIndexEntries = 1 << 16
)
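
// The serialized index produced by appendTo (and consumed by Load) is a
// skippable frame. This summary is derived from the code below and is
// informational only:
//
//	[1 byte]  ChunkTypeIndex
//	[3 bytes] little-endian chunk length (filled in last)
//	[6 bytes] S2IndexHeader ("s2idx\x00")
//	[varint]  total uncompressed size
//	[varint]  total compressed size
//	[varint]  estimated uncompressed block size
//	[varint]  number of entries
//	[1 byte]  1 if explicit uncompressed offsets follow, else 0
//	[varints] delta-coded uncompressed offsets (only if the byte above is 1)
//	[varints] delta-coded compressed offsets
//	[4 bytes] little-endian total index size, so it can be read from the end
//	[6 bytes] S2IndexTrailer ("\x00xdi2s")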

// Index represents an S2/Snappy index.
type Index struct {
	TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
	TotalCompressed   int64 // Total Compressed size if known. Will be -1 if unknown.
	info              []struct {
		compressedOffset   int64
		uncompressedOffset int64
	}
	estBlockUncomp int64
}

func (i *Index) reset(maxBlock int) {
	i.estBlockUncomp = int64(maxBlock)
	i.TotalCompressed = -1
	i.TotalUncompressed = -1
	if len(i.info) > 0 {
		i.info = i.info[:0]
	}
}

// allocInfos will allocate an empty slice of infos.
func (i *Index) allocInfos(n int) {
	if n > maxIndexEntries {
		panic("n > maxIndexEntries")
	}
	i.info = make([]struct {
		compressedOffset   int64
		uncompressedOffset int64
	}, 0, n)
}

// add an uncompressed and compressed pair.
// Entries must be sent in order.
func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
	if i == nil {
		return nil
	}
	lastIdx := len(i.info) - 1
	if lastIdx >= 0 {
		latest := i.info[lastIdx]
		if latest.uncompressedOffset == uncompressedOffset {
			// Uncompressed didn't change, don't add entry,
			// but update start index.
			latest.compressedOffset = compressedOffset
			i.info[lastIdx] = latest
			return nil
		}
		if latest.uncompressedOffset > uncompressedOffset {
			return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
		}
		if latest.compressedOffset > compressedOffset {
			return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.compressedOffset, compressedOffset)
		}
	}
	i.info = append(i.info, struct {
		compressedOffset   int64
		uncompressedOffset int64
	}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
	return nil
}

// Find the offset at or before the wanted (uncompressed) offset.
// If offset is 0 or positive it is the offset from the beginning of the file.
// If the uncompressed size is known, the offset must be within the file.
// If an offset outside the file is requested io.ErrUnexpectedEOF is returned.
// If the offset is negative, it is interpreted as the distance from the end of the file,
// where -1 represents the last byte.
// If the total uncompressed size is unknown, ErrCorrupt is returned.
func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
	if i.TotalUncompressed < 0 {
		return 0, 0, ErrCorrupt
	}
	if offset < 0 {
		offset = i.TotalUncompressed + offset
		if offset < 0 {
			return 0, 0, io.ErrUnexpectedEOF
		}
	}
	if offset > i.TotalUncompressed {
		return 0, 0, io.ErrUnexpectedEOF
	}
	if len(i.info) > 200 {
		// Binary search for larger indexes.
		n := sort.Search(len(i.info), func(n int) bool {
			return i.info[n].uncompressedOffset > offset
		})
		if n == 0 {
			n = 1
		}
		return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
	}
	// Linear scan for small indexes.
	for _, info := range i.info {
		if info.uncompressedOffset > offset {
			break
		}
		compressedOff = info.compressedOffset
		uncompressedOff = info.uncompressedOffset
	}
	return compressedOff, uncompressedOff, nil
}
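
// The sketch below is illustrative only and not part of the original file:
// it shows how Find might be combined with a seekable compressed stream to
// position a reader near a wanted uncompressed offset. The function name
// exampleSeekTo is hypothetical.
func exampleSeekTo(rs io.ReadSeeker, index *Index, wantOffset int64) (skip int64, err error) {
	// Translate the uncompressed offset to the closest recorded pair
	// at or before it.
	compOff, uOff, err := index.Find(wantOffset)
	if err != nil {
		return 0, err
	}
	// Seek to the start of the containing block in the compressed stream.
	if _, err := rs.Seek(compOff, io.SeekStart); err != nil {
		return 0, err
	}
	// The caller must still decompress and discard this many bytes to land
	// exactly on wantOffset.
	return wantOffset - uOff, nil
}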

// reduce to stay below maxIndexEntries
func (i *Index) reduce() {
	if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
		return
	}

	// Algorithm, keep 1, remove removeN entries...
	removeN := (len(i.info) + 1) / maxIndexEntries
	src := i.info
	j := 0

	// Each block should be at least 1MB, but don't reduce below 1000 entries.
	for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
		removeN++
	}
	for idx := 0; idx < len(src); idx++ {
		i.info[j] = src[idx]
		j++
		idx += removeN
	}
	i.info = i.info[:j]
	// Update maxblock estimate.
	i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
}

func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
	i.reduce()
	var tmp [binary.MaxVarintLen64]byte

	initSize := len(b)
	// We make the start a skippable header+size.
	b = append(b, ChunkTypeIndex, 0, 0, 0)
	b = append(b, []byte(S2IndexHeader)...)
	// Total Uncompressed size
	n := binary.PutVarint(tmp[:], uncompTotal)
	b = append(b, tmp[:n]...)
	// Total Compressed size
	n = binary.PutVarint(tmp[:], compTotal)
	b = append(b, tmp[:n]...)
	// Put EstBlockUncomp size
	n = binary.PutVarint(tmp[:], i.estBlockUncomp)
	b = append(b, tmp[:n]...)
	// Put length
	n = binary.PutVarint(tmp[:], int64(len(i.info)))
	b = append(b, tmp[:n]...)

	// Check if we should add uncompressed offsets
	var hasUncompressed byte
	for idx, info := range i.info {
		if idx == 0 {
			if info.uncompressedOffset != 0 {
				hasUncompressed = 1
				break
			}
			continue
		}
		if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
			hasUncompressed = 1
			break
		}
	}
	b = append(b, hasUncompressed)

	// Add each entry
	if hasUncompressed == 1 {
		for idx, info := range i.info {
			uOff := info.uncompressedOffset
			if idx > 0 {
				prev := i.info[idx-1]
				uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
			}
			n = binary.PutVarint(tmp[:], uOff)
			b = append(b, tmp[:n]...)
		}
	}

	// Initial compressed size estimate.
	cPredict := i.estBlockUncomp / 2

	for idx, info := range i.info {
		cOff := info.compressedOffset
		if idx > 0 {
			prev := i.info[idx-1]
			cOff -= prev.compressedOffset + cPredict
			// Update compressed size prediction, with half the error.
			cPredict += cOff / 2
		}
		n = binary.PutVarint(tmp[:], cOff)
		b = append(b, tmp[:n]...)
	}

	// Add Total Size.
	// Stored as fixed size for easier reading.
	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
	b = append(b, tmp[:4]...)
	// Trailer
	b = append(b, []byte(S2IndexTrailer)...)

	// Update size
	chunkLen := len(b) - initSize - skippableFrameHeader
	b[initSize+1] = uint8(chunkLen >> 0)
	b[initSize+2] = uint8(chunkLen >> 8)
	b[initSize+3] = uint8(chunkLen >> 16)
	//fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
	return b
}

// Load a binary index.
// A zero value Index can be used or a previous one can be reused.
func (i *Index) Load(b []byte) ([]byte, error) {
	if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
		return b, io.ErrUnexpectedEOF
	}
	if b[0] != ChunkTypeIndex {
		return b, ErrCorrupt
	}
	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
	b = b[4:]

	// Validate we have enough...
	if len(b) < chunkLen {
		return b, io.ErrUnexpectedEOF
	}
	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
		return b, ErrUnsupported
	}
	b = b[len(S2IndexHeader):]

	// Total Uncompressed
	if v, n := binary.Varint(b); n <= 0 || v < 0 {
		return b, ErrCorrupt
	} else {
		i.TotalUncompressed = v
		b = b[n:]
	}

	// Total Compressed
	if v, n := binary.Varint(b); n <= 0 {
		return b, ErrCorrupt
	} else {
		i.TotalCompressed = v
		b = b[n:]
	}

	// Read EstBlockUncomp
	if v, n := binary.Varint(b); n <= 0 {
		return b, ErrCorrupt
	} else {
		if v < 0 {
			return b, ErrCorrupt
		}
		i.estBlockUncomp = v
		b = b[n:]
	}

	var entries int
	if v, n := binary.Varint(b); n <= 0 {
		return b, ErrCorrupt
	} else {
		if v < 0 || v > maxIndexEntries {
			return b, ErrCorrupt
		}
		entries = int(v)
		b = b[n:]
	}
	if cap(i.info) < entries {
		i.allocInfos(entries)
	}
	i.info = i.info[:entries]

	if len(b) < 1 {
		return b, io.ErrUnexpectedEOF
	}
	hasUncompressed := b[0]
	b = b[1:]
	if hasUncompressed&1 != hasUncompressed {
		return b, ErrCorrupt
	}

	// Add each uncompressed entry
	for idx := range i.info {
		var uOff int64
		if hasUncompressed != 0 {
			// Load delta
			if v, n := binary.Varint(b); n <= 0 {
				return b, ErrCorrupt
			} else {
				uOff = v
				b = b[n:]
			}
		}

		if idx > 0 {
			prev := i.info[idx-1].uncompressedOffset
			uOff += prev + (i.estBlockUncomp)
			if uOff <= prev {
				return b, ErrCorrupt
			}
		}
		if uOff < 0 {
			return b, ErrCorrupt
		}
		i.info[idx].uncompressedOffset = uOff
	}

	// Initial compressed size estimate.
	cPredict := i.estBlockUncomp / 2

	// Add each compressed entry
	for idx := range i.info {
		var cOff int64
		if v, n := binary.Varint(b); n <= 0 {
			return b, ErrCorrupt
		} else {
			cOff = v
			b = b[n:]
		}

		if idx > 0 {
			// Update compressed size prediction, with half the error.
			cPredictNew := cPredict + cOff/2

			prev := i.info[idx-1].compressedOffset
			cOff += prev + cPredict
			if cOff <= prev {
				return b, ErrCorrupt
			}
			cPredict = cPredictNew
		}
		if cOff < 0 {
			return b, ErrCorrupt
		}
		i.info[idx].compressedOffset = cOff
	}
	if len(b) < 4+len(S2IndexTrailer) {
		return b, io.ErrUnexpectedEOF
	}
	// Skip size...
	b = b[4:]

	// Check trailer...
	if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
		return b, ErrCorrupt
	}
	return b[len(S2IndexTrailer):], nil
}
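
// Illustrative sketch, not part of the original file: an in-package round
// trip through appendTo and Load. It assumes the totals on idx are known;
// Load rejects a negative total uncompressed size. exampleRoundTrip is a
// hypothetical name.
func exampleRoundTrip(idx *Index) (*Index, error) {
	// Serialize into a fresh buffer using the recorded totals.
	serialized := idx.appendTo(nil, idx.TotalUncompressed, idx.TotalCompressed)
	var loaded Index
	if _, err := loaded.Load(serialized); err != nil {
		return nil, err
	}
	return &loaded, nil
}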

// LoadStream will load an index from the end of the supplied stream.
// ErrUnsupported will be returned if the signature cannot be found.
// ErrCorrupt will be returned if unexpected values are found.
// io.ErrUnexpectedEOF is returned if there are too few bytes.
// IO errors are returned as-is.
func (i *Index) LoadStream(rs io.ReadSeeker) error {
	// Go to end.
	_, err := rs.Seek(-10, io.SeekEnd)
	if err != nil {
		return err
	}
	var tmp [10]byte
	_, err = io.ReadFull(rs, tmp[:])
	if err != nil {
		return err
	}
	// Check trailer...
	if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
		return ErrUnsupported
	}
	sz := binary.LittleEndian.Uint32(tmp[:4])
	if sz > maxChunkSize+skippableFrameHeader {
		return ErrCorrupt
	}
	_, err = rs.Seek(-int64(sz), io.SeekEnd)
	if err != nil {
		return err
	}

	// Read index.
	buf := make([]byte, sz)
	_, err = io.ReadFull(rs, buf)
	if err != nil {
		return err
	}
	_, err = i.Load(buf)
	return err
}

// IndexStream will return an index for a stream.
// The stream structure will be checked, but
// data within blocks is not verified.
// The returned index can either be appended to the end of the stream
// or stored separately.
func IndexStream(r io.Reader) ([]byte, error) {
	var i Index
	var buf [maxChunkSize]byte
	var readHeader bool
	for {
		_, err := io.ReadFull(r, buf[:4])
		if err != nil {
			if err == io.EOF {
				return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
			}
			return nil, err
		}
		// Start of this chunk.
		startChunk := i.TotalCompressed
		i.TotalCompressed += 4

		chunkType := buf[0]
		if !readHeader {
			if chunkType != chunkTypeStreamIdentifier {
				return nil, ErrCorrupt
			}
			readHeader = true
		}
		chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
		if chunkLen < checksumSize {
			return nil, ErrCorrupt
		}

		i.TotalCompressed += int64(chunkLen)
		_, err = io.ReadFull(r, buf[:chunkLen])
		if err != nil {
			return nil, io.ErrUnexpectedEOF
		}
		// The chunk types are specified at
		// https://github.com/google/snappy/blob/master/framing_format.txt
		switch chunkType {
		case chunkTypeCompressedData:
			// Section 4.2. Compressed data (chunk type 0x00).
			// Skip checksum.
			dLen, err := DecodedLen(buf[checksumSize:])
			if err != nil {
				return nil, err
			}
			if dLen > maxBlockSize {
				return nil, ErrCorrupt
			}
			if i.estBlockUncomp == 0 {
				// Use first block for estimate...
				i.estBlockUncomp = int64(dLen)
			}
			err = i.add(startChunk, i.TotalUncompressed)
			if err != nil {
				return nil, err
			}
			i.TotalUncompressed += int64(dLen)
			continue
		case chunkTypeUncompressedData:
			n2 := chunkLen - checksumSize
			if n2 > maxBlockSize {
				return nil, ErrCorrupt
			}
			if i.estBlockUncomp == 0 {
				// Use first block for estimate...
				i.estBlockUncomp = int64(n2)
			}
			err = i.add(startChunk, i.TotalUncompressed)
			if err != nil {
				return nil, err
			}
			i.TotalUncompressed += int64(n2)
			continue
		case chunkTypeStreamIdentifier:
			// Section 4.1. Stream identifier (chunk type 0xff).
			if chunkLen != len(magicBody) {
				return nil, ErrCorrupt
			}

			if string(buf[:len(magicBody)]) != magicBody {
				if string(buf[:len(magicBody)]) != magicBodySnappy {
					return nil, ErrCorrupt
				}
			}
			continue
		}

		if chunkType <= 0x7f {
			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
			return nil, ErrUnsupported
		}
		if chunkLen > maxChunkSize {
			return nil, ErrUnsupported
		}
		// Section 4.4 Padding (chunk type 0xfe).
		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
	}
}
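
// Illustrative sketch, not part of the original file: prefer an index that
// was appended to the stream, and fall back to scanning the stream with
// IndexStream when none is present. exampleLoadOrIndex is a hypothetical name.
func exampleLoadOrIndex(rs io.ReadSeeker) (*Index, error) {
	var idx Index
	err := idx.LoadStream(rs)
	if err == nil {
		return &idx, nil
	}
	if err != ErrUnsupported {
		// A trailer was found but the index is damaged, or an IO error occurred.
		return nil, err
	}
	// No index signature at the end; build one by scanning from the start.
	if _, err := rs.Seek(0, io.SeekStart); err != nil {
		return nil, err
	}
	serialized, err := IndexStream(rs)
	if err != nil {
		return nil, err
	}
	if _, err := idx.Load(serialized); err != nil {
		return nil, err
	}
	return &idx, nil
}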

// JSON returns the index as JSON text.
func (i *Index) JSON() []byte {
	type offset struct {
		CompressedOffset   int64 `json:"compressed"`
		UncompressedOffset int64 `json:"uncompressed"`
	}
	x := struct {
		TotalUncompressed int64    `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
		TotalCompressed   int64    `json:"total_compressed"`   // Total Compressed size if known. Will be -1 if unknown.
		Offsets           []offset `json:"offsets"`
		EstBlockUncomp    int64    `json:"est_block_uncompressed"`
	}{
		TotalUncompressed: i.TotalUncompressed,
		TotalCompressed:   i.TotalCompressed,
		EstBlockUncomp:    i.estBlockUncomp,
	}
	for _, v := range i.info {
		x.Offsets = append(x.Offsets, offset{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
	}
	b, _ := json.MarshalIndent(x, "", " ")
	return b
}

// RemoveIndexHeaders will trim all headers and trailers from a given index.
// This is expected to save 20 bytes.
// These can be restored using RestoreIndexHeaders.
// This removes a layer of security, but is the most compact representation.
// Returns nil if the headers contain errors.
// The returned slice references the provided slice.
func RemoveIndexHeaders(b []byte) []byte {
	const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
	if len(b) <= save {
		return nil
	}
	if b[0] != ChunkTypeIndex {
		return nil
	}
	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
	b = b[4:]

	// Validate we have enough...
	if len(b) < chunkLen {
		return nil
	}
	b = b[:chunkLen]

	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
		return nil
	}
	b = b[len(S2IndexHeader):]
	if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
		return nil
	}
	b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))

	if len(b) < 4 {
		return nil
	}
	return b[:len(b)-4]
}

// RestoreIndexHeaders will restore index headers removed by RemoveIndexHeaders.
// No error checking is performed on the input.
// If a 0 length slice is sent, it is returned without modification.
func RestoreIndexHeaders(in []byte) []byte {
	if len(in) == 0 {
		return in
	}
	b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
	b = append(b, ChunkTypeIndex, 0, 0, 0)
	b = append(b, []byte(S2IndexHeader)...)
	b = append(b, in...)

	var tmp [4]byte
	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
	b = append(b, tmp[:4]...)
	// Trailer
	b = append(b, []byte(S2IndexTrailer)...)

	chunkLen := len(b) - skippableFrameHeader
	b[1] = uint8(chunkLen >> 0)
	b[2] = uint8(chunkLen >> 8)
	b[3] = uint8(chunkLen >> 16)
	return b
}
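
// Illustrative sketch, not part of the original file: store an index in its
// most compact form and rebuild a loadable copy on demand. The name
// exampleCompactStore is hypothetical.
func exampleCompactStore(serialized []byte) (*Index, error) {
	// Strip the frame header, size field and trailer (about 20 bytes saved).
	compact := RemoveIndexHeaders(serialized)
	if compact == nil {
		return nil, ErrCorrupt
	}
	// ...store compact somewhere, then later rebuild a loadable index:
	restored := RestoreIndexHeaders(compact)
	var idx Index
	if _, err := idx.Load(restored); err != nil {
		return nil, err
	}
	return &idx, nil
}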