common/bitutil/compress.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

// Copyright 2017 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package bitutil

import "errors"

var (
    // errMissingData is returned from decompression if the byte referenced by
    // the bitset header overflows the input data.
    errMissingData = errors.New("missing bytes on input")

    // errUnreferencedData is returned from decompression if not all bytes were used
    // up from the input data after decompressing it.
    errUnreferencedData = errors.New("extra bytes on input")

    // errExceededTarget is returned from decompression if the bitset header has
    // more bits defined than the number of target buffer space available.
    errExceededTarget = errors.New("target data size exceeded")

    // errZeroContent is returned from decompression if a data byte referenced in
    // the bitset header is actually a zero byte.
    errZeroContent = errors.New("zero byte in input content")
)

// The compression algorithm implemented by CompressBytes and DecompressBytes is
// optimized for sparse input data which contains a lot of zero bytes. Decompression
// requires knowledge of the decompressed data length.
//
// Compression works as follows:
//
//   if data only contains zeroes,
//       CompressBytes(data) == nil
//   otherwise if len(data) <= 1,
//       CompressBytes(data) == data
//   otherwise:
//       CompressBytes(data) == append(CompressBytes(nonZeroBitset(data)), nonZeroBytes(data)...)
//       where
//         nonZeroBitset(data) is a bit vector with len(data) bits (MSB first):
//             nonZeroBitset(data)[i/8] && (1 << (7-i%8)) != 0  if data[i] != 0
//             len(nonZeroBitset(data)) == (len(data)+7)/8
//         nonZeroBytes(data) contains the non-zero bytes of data in the same order

// CompressBytes compresses the input byte slice according to the sparse bitset
// representation algorithm. If the result is bigger than the original input, no
// compression is done.
func CompressBytes(data []byte) []byte {
    if out := bitsetEncodeBytes(data); len(out) < len(data) {
        return out
    }
    cpy := make([]byte, len(data))
    copy(cpy, data)
    return cpy
}

// bitsetEncodeBytes compresses the input byte slice according to the sparse
// bitset representation algorithm.
func bitsetEncodeBytes(data []byte) []byte {
    // Empty slices get compressed to nil
    if len(data) == 0 {
        return nil
    }
    // One byte slices compress to nil or retain the single byte
    if len(data) == 1 {
        if data[0] == 0 {
            return nil
        }
        return data
    }
    // Calculate the bitset of set bytes, and gather the non-zero bytes
    nonZeroBitset := make([]byte, (len(data)+7)/8)
    nonZeroBytes := make([]byte, 0, len(data))

    for i, b := range data {
        if b != 0 {
            nonZeroBytes = append(nonZeroBytes, b)
            nonZeroBitset[i/8] |= 1 << byte(7-i%8)
        }
    }
    if len(nonZeroBytes) == 0 {
        return nil
    }
    return append(bitsetEncodeBytes(nonZeroBitset), nonZeroBytes...)
}

// DecompressBytes decompresses data with a known target size. If the input data
// matches the size of the target, it means no compression was done in the first
// place.
func DecompressBytes(data []byte, target int) ([]byte, error) {
    if len(data) > target {
        return nil, errExceededTarget
    }
    if len(data) == target {
        cpy := make([]byte, len(data))
        copy(cpy, data)
        return cpy, nil
    }
    return bitsetDecodeBytes(data, target)
}

// bitsetDecodeBytes decompresses data with a known target size.
func bitsetDecodeBytes(data []byte, target int) ([]byte, error) {
    out, size, err := bitsetDecodePartialBytes(data, target)
    if err != nil {
        return nil, err
    }
    if size != len(data) {
        return nil, errUnreferencedData
    }
    return out, nil
}

// bitsetDecodePartialBytes decompresses data with a known target size, but does
// not enforce consuming all the input bytes. In addition to the decompressed
// output, the function returns the length of compressed input data corresponding
// to the output as the input slice may be longer.
func bitsetDecodePartialBytes(data []byte, target int) ([]byte, int, error) {
    // Sanity check 0 targets to avoid infinite recursion
    if target == 0 {
        return nil, 0, nil
    }
    // Handle the zero and single byte corner cases
    decomp := make([]byte, target)
    if len(data) == 0 {
        return decomp, 0, nil
    }
    if target == 1 {
        decomp[0] = data[0] // copy to avoid referencing the input slice
        if data[0] != 0 {
            return decomp, 1, nil
        }
        return decomp, 0, nil
    }
    // Decompress the bitset of set bytes and distribute the non zero bytes
    nonZeroBitset, ptr, err := bitsetDecodePartialBytes(data, (target+7)/8)
    if err != nil {
        return nil, ptr, err
    }
    for i := 0; i < 8*len(nonZeroBitset); i++ {
        if nonZeroBitset[i/8]&(1<<byte(7-i%8)) != 0 {
            // Make sure we have enough data to push into the correct slot
            if ptr >= len(data) {
                return nil, 0, errMissingData
            }
            if i >= len(decomp) {
                return nil, 0, errExceededTarget
            }
            // Make sure the data is valid and push into the slot
            if data[ptr] == 0 {
                return nil, 0, errZeroContent
            }
            decomp[i] = data[ptr]
            ptr++
        }
    }
    return decomp, ptr, nil
}