1 package compress
2
3 import (
4 "crypto/rand"
5 "encoding/base32"
6 "testing"
7 )
8
9 func BenchmarkEstimate(b *testing.B) {
10 b.ReportAllocs()
11
12 b.Run("zeroes-5k", func(b *testing.B) {
13 var testData = make([]byte, 5000)
14 b.SetBytes(int64(len(testData)))
15 b.ResetTimer()
16 for i := 0; i < b.N; i++ {
17 Estimate(testData)
18 }
19 b.Log(Estimate(testData))
20 })
21
22
23 b.Run("predictable-5k", func(b *testing.B) {
24 var testData = make([]byte, 5000)
25 for i := range testData {
26 testData[i] = byte(float64(i) / float64(len(testData)) * 256)
27 }
28 b.SetBytes(int64(len(testData)))
29 b.ResetTimer()
30 for i := 0; i < b.N; i++ {
31 Estimate(testData)
32 }
33 b.Log(Estimate(testData))
34 })
35
36
37 b.Run("random-500b", func(b *testing.B) {
38 var testData = make([]byte, 500)
39 rand.Read(testData)
40 b.SetBytes(int64(len(testData)))
41 b.ResetTimer()
42 for i := 0; i < b.N; i++ {
43 Estimate(testData)
44 }
45 b.Log(Estimate(testData))
46 })
47
48
49 b.Run("random-5k", func(b *testing.B) {
50 var testData = make([]byte, 5000)
51 rand.Read(testData)
52 b.SetBytes(int64(len(testData)))
53 b.ResetTimer()
54 for i := 0; i < b.N; i++ {
55 Estimate(testData)
56 }
57 b.Log(Estimate(testData))
58 })
59
60
61 b.Run("random-50k", func(b *testing.B) {
62 var testData = make([]byte, 50000)
63 rand.Read(testData)
64 b.SetBytes(int64(len(testData)))
65 b.ResetTimer()
66 for i := 0; i < b.N; i++ {
67 Estimate(testData)
68 }
69 b.Log(Estimate(testData))
70 })
71
72
73 b.Run("random-500k", func(b *testing.B) {
74 var testData = make([]byte, 500000)
75 rand.Read(testData)
76 b.SetBytes(int64(len(testData)))
77 b.ResetTimer()
78 for i := 0; i < b.N; i++ {
79 Estimate(testData)
80 }
81 b.Log(Estimate(testData))
82 })
83
84
85 b.Run("base-32-5k", func(b *testing.B) {
86 var testData = make([]byte, 5000)
87 rand.Read(testData)
88 s := base32.StdEncoding.EncodeToString(testData)
89 testData = []byte(s)
90 testData = testData[:5000]
91 b.SetBytes(int64(len(testData)))
92 b.ResetTimer()
93 for i := 0; i < b.N; i++ {
94 Estimate(testData)
95 }
96 b.Log(Estimate(testData))
97 })
98
99 b.Run("text", func(b *testing.B) {
100 var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
101 This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
102 With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
103 As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
104 With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
105 Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
106 The attacker then compresses that chunk using the compression algorithm.
107 The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
108 IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
109 It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
110 AS always, a paranoid and highly unscientific stream of consciousness.
111 Thoughts?`)
112 testData = append(testData, testData...)
113 testData = append(testData, testData...)
114 b.SetBytes(int64(len(testData)))
115 b.ResetTimer()
116 for i := 0; i < b.N; i++ {
117 Estimate(testData)
118 }
119 b.Log(Estimate(testData))
120 })
121 }
122
123 func BenchmarkSnannonEntropyBits(b *testing.B) {
124 b.ReportAllocs()
125
126 b.Run("zeroes-5k", func(b *testing.B) {
127 var testData = make([]byte, 5000)
128 b.SetBytes(int64(len(testData)))
129 b.ResetTimer()
130 for i := 0; i < b.N; i++ {
131 ShannonEntropyBits(testData)
132 }
133 b.Log(ShannonEntropyBits(testData))
134 })
135
136
137 b.Run("predictable-5k", func(b *testing.B) {
138 var testData = make([]byte, 5000)
139 for i := range testData {
140 testData[i] = byte(float64(i) / float64(len(testData)) * 256)
141 }
142 b.SetBytes(int64(len(testData)))
143 b.ResetTimer()
144 for i := 0; i < b.N; i++ {
145 ShannonEntropyBits(testData)
146 }
147 b.Log(ShannonEntropyBits(testData))
148 })
149
150
151 b.Run("random-500b", func(b *testing.B) {
152 var testData = make([]byte, 500)
153 rand.Read(testData)
154 b.SetBytes(int64(len(testData)))
155 b.ResetTimer()
156 for i := 0; i < b.N; i++ {
157 ShannonEntropyBits(testData)
158 }
159 b.Log(ShannonEntropyBits(testData))
160 })
161
162
163 b.Run("random-5k", func(b *testing.B) {
164 var testData = make([]byte, 5000)
165 rand.Read(testData)
166 b.SetBytes(int64(len(testData)))
167 b.ResetTimer()
168 for i := 0; i < b.N; i++ {
169 ShannonEntropyBits(testData)
170 }
171 b.Log(ShannonEntropyBits(testData))
172 })
173
174
175 b.Run("random-50k", func(b *testing.B) {
176 var testData = make([]byte, 50000)
177 rand.Read(testData)
178 b.SetBytes(int64(len(testData)))
179 b.ResetTimer()
180 for i := 0; i < b.N; i++ {
181 ShannonEntropyBits(testData)
182 }
183 b.Log(ShannonEntropyBits(testData))
184 })
185
186
187 b.Run("random-500k", func(b *testing.B) {
188 var testData = make([]byte, 500000)
189 rand.Read(testData)
190 b.SetBytes(int64(len(testData)))
191 b.ResetTimer()
192 for i := 0; i < b.N; i++ {
193 ShannonEntropyBits(testData)
194 }
195 b.Log(ShannonEntropyBits(testData))
196 })
197
198
199 b.Run("base-32-5k", func(b *testing.B) {
200 var testData = make([]byte, 5000)
201 rand.Read(testData)
202 s := base32.StdEncoding.EncodeToString(testData)
203 testData = []byte(s)
204 testData = testData[:5000]
205 b.SetBytes(int64(len(testData)))
206 b.ResetTimer()
207 for i := 0; i < b.N; i++ {
208 ShannonEntropyBits(testData)
209 }
210 b.Log(ShannonEntropyBits(testData))
211 })
212
213 b.Run("text", func(b *testing.B) {
214 var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
215 This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
216 With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
217 As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
218 With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
219 Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
220 The attacker then compresses that chunk using the compression algorithm.
221 The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
222 IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
223 It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
224 AS always, a paranoid and highly unscientific stream of consciousness.
225 Thoughts?`)
226 testData = append(testData, testData...)
227 testData = append(testData, testData...)
228 b.SetBytes(int64(len(testData)))
229 b.ResetTimer()
230 for i := 0; i < b.N; i++ {
231 ShannonEntropyBits(testData)
232 }
233 b.Log(ShannonEntropyBits(testData))
234 })
235 }
236
View as plain text