
Source file src/github.com/klauspost/compress/compressible_test.go


package compress

import (
	"crypto/rand"
	"encoding/base32"
	"testing"
)

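// BenchmarkEstimate times Estimate on inputs ranging from all zeroes
// (predictable, low entropy) through base-32 and natural text to
// crypto/rand output (unpredictable, high entropy).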
func BenchmarkEstimate(b *testing.B) {
	b.ReportAllocs()
	// (predictable, low entropy distribution)
	b.Run("zeroes-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (predictable, high entropy distribution)
	b.Run("predictable-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		for i := range testData {
			testData[i] = byte(float64(i) / float64(len(testData)) * 256)
		}
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500b", func(b *testing.B) {
		var testData = make([]byte, 500)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-50k", func(b *testing.B) {
		var testData = make([]byte, 50000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500k", func(b *testing.B) {
		var testData = make([]byte, 500000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, medium entropy distribution)
	b.Run("base-32-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		s := base32.StdEncoding.EncodeToString(testData)
		testData = []byte(s)
		testData = testData[:5000]
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})
	// (medium predictable, medium entropy distribution)
	b.Run("text", func(b *testing.B) {
		var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
		testData = append(testData, testData...)
		testData = append(testData, testData...)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})
}

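// BenchmarkShannonEntropyBits runs ShannonEntropyBits over the same set of
// inputs as BenchmarkEstimate above.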
func BenchmarkShannonEntropyBits(b *testing.B) {
	b.ReportAllocs()
	// (predictable, low entropy distribution)
	b.Run("zeroes-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (predictable, high entropy distribution)
	b.Run("predictable-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		for i := range testData {
			testData[i] = byte(float64(i) / float64(len(testData)) * 256)
		}
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500b", func(b *testing.B) {
		var testData = make([]byte, 500)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-50k", func(b *testing.B) {
		var testData = make([]byte, 50000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500k", func(b *testing.B) {
		var testData = make([]byte, 500000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, medium entropy distribution)
	b.Run("base-32-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		s := base32.StdEncoding.EncodeToString(testData)
		testData = []byte(s)
		testData = testData[:5000]
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})
	// (medium predictable, medium entropy distribution)
	b.Run("text", func(b *testing.B) {
		var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
		testData = append(testData, testData...)
		testData = append(testData, testData...)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})
}

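For context, a minimal sketch of how the two benchmarked functions can be called from outside the package. It assumes only the signatures exercised above, Estimate(b []byte) float64 (higher suggests more compressible input) and ShannonEntropyBits(b []byte) int (an entropy-based lower bound on the encoded size in bits):

package main

import (
	"bytes"
	"crypto/rand"
	"fmt"

	"github.com/klauspost/compress"
)

func main() {
	// Highly repetitive input: Estimate should be high, entropy bits low.
	repetitive := bytes.Repeat([]byte("abcd1234"), 625) // 5000 bytes
	fmt.Println("repetitive:", compress.Estimate(repetitive), compress.ShannonEntropyBits(repetitive))

	// Random input: Estimate should be near zero, entropy bits close to
	// 8 per byte.
	random := make([]byte, 5000)
	rand.Read(random) // error ignored, as in the benchmarks above
	fmt.Println("random:", compress.Estimate(random), compress.ShannonEntropyBits(random))
}

The benchmarks themselves can be run with the standard Go tooling, for example "go test -bench=. -run='^$'" in the package directory (the -run pattern skips the package's regular tests).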
