Nie wiem czy to się nadaje na forum :) ale klepię sobie program do kompresji danych i mam problem bo źle dekompresuje :p
Oto kod:
/*
* chunker.cpp -- a fast experimental compression program
*
* Copyright (C) 2013 Piotr Tarsa ( http://github.com/tarsa )
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*/
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#define MAX_BLOCK_SIZE (1 << 20)
#if MAX_BLOCK_SIZE % 4 != 0
#error Max block size must contain integral number of chunks
#endif
#define FAST_MODE_MARKER 1
#define GOOD_MODE_MARKER 2
#define TAG_SIZE_FAST 2
#define TAG_SIZE_GOOD 3
#define CHUNKS_PER_SIGNATURE_FAST (32 / TAG_SIZE_FAST)
#define CHUNKS_PER_SIGNATURE_GOOD (32 / TAG_SIZE_GOOD)
#define restrict __restrict__
typedef uint32_t chunk_t;
typedef struct {
chunk_t dictionary[1 << 16][2];
chunk_t predictions[1 << 16];
chunk_t smallDictionary[1 << 10];
uint32_t lastHash;
#ifndef NDEBUG
bool initialized;
#endif
} workset_t;
int err(char const * const message) {
fputs(message, stderr);
fputs("\n", stderr);
return EXIT_FAILURE;
}
size_t fgetd(uint32_t * const value) {
return fread_unlocked(value, sizeof (uint32_t), 1, stdin);
}
size_t fputd(uint32_t const value) {
return fwrite_unlocked(&value, sizeof (uint32_t), 1, stdout);
}
size_t loadBlock(uint8_t * const buffer, size_t const toLoad) {
size_t bufferLoaded = 0;
while (bufferLoaded < toLoad && feof_unlocked(stdin) == 0
&& ferror_unlocked(stdin) == 0) {
size_t const loaded = fread_unlocked(buffer + bufferLoaded,
sizeof (uint8_t), toLoad - bufferLoaded, stdin);
bufferLoaded += loaded;
}
return bufferLoaded;
}
uint32_t hashF(uint32_t const value) {
return value * 123456791u >> 16;
}
uint32_t reduceHash(uint32_t const bigHash) {
return bigHash >> 6;
}
int32_t compressBlockFast(chunk_t const * const inputChunksBuffer,
int32_t const inputChunksNumber, chunk_t * const signaturesBuffer,
uint8_t * const codeStreamBuffer, workset_t * const workset) {
int32_t codeStreamIndex = 0;
int32_t chunkNumber = 0;
int32_t signatureNumber = 0;
while (chunkNumber < inputChunksNumber) {
chunk_t signature = 0;
int32_t lastChunkInSlice = std::min(inputChunksNumber,
chunkNumber + CHUNKS_PER_SIGNATURE_FAST);
while (chunkNumber < lastChunkInSlice) {
chunk_t const chunk = inputChunksBuffer[chunkNumber++];
uint32_t const hash = hashF(chunk);
chunk_t * const prediction = workset->predictions
+ workset->lastHash;
uint32_t selector = 3;
if (*prediction != chunk) {
chunk_t * const found = workset->dictionary[hash];
int8_t const equal_0 = found[0] == chunk;
int8_t const equal_1 = found[1] == chunk;
int8_t const equal_any = equal_0 || equal_1;
selector = equal_0 * 2 + equal_1;
found[1] = found[equal_0];
found[0] = chunk;
uint32_t const code = equal_any ? hash : chunk;
*(uint32_t *) (codeStreamBuffer + codeStreamIndex) = code;
codeStreamIndex += equal_any ? 2 : 4;
*prediction = chunk;
}
workset->lastHash = hash;
signature <<= TAG_SIZE_FAST;
signature |= selector;
}
if (chunkNumber == inputChunksNumber) {
int32_t const tagsInSignature = chunkNumber
% CHUNKS_PER_SIGNATURE_FAST;
if (tagsInSignature != 0) {
for (int32_t i = tagsInSignature; i < CHUNKS_PER_SIGNATURE_FAST;
i++) {
signature <<= TAG_SIZE_FAST;
}
}
}
signaturesBuffer[signatureNumber++] = signature;
}
assert(chunkNumber == inputChunksNumber);
assert(signatureNumber == (inputChunksNumber + CHUNKS_PER_SIGNATURE_FAST
- 1) / CHUNKS_PER_SIGNATURE_FAST);
assert(codeStreamIndex >= 0);
assert(codeStreamIndex <= inputChunksNumber * sizeof (chunk_t));
return codeStreamIndex;
}
int32_t compressBlockGood(chunk_t const * const inputChunksBuffer,
int32_t const inputChunksNumber, chunk_t * const signaturesBuffer,
uint8_t * const codeStreamBuffer, workset_t * const workset) {
int32_t codeStreamIndex = 0;
int32_t chunkNumber = 0;
int32_t signatureNumber = 0;
while (chunkNumber < inputChunksNumber) {
chunk_t signature = 0;
int32_t lastChunkInSlice = std::min(inputChunksNumber,
chunkNumber + CHUNKS_PER_SIGNATURE_GOOD);
while (chunkNumber < lastChunkInSlice) {
chunk_t const chunk = inputChunksBuffer[chunkNumber++];
uint32_t const hash = hashF(chunk);
chunk_t * const prediction = workset->predictions
+ workset->lastHash;
uint32_t selector = 7;
if (*prediction != chunk) {
uint32_t const hashSmall = reduceHash(hash);
chunk_t * const foundSmall = workset->smallDictionary
+ hashSmall;
int8_t const equalSmall = *foundSmall == chunk;
if (equalSmall) {
selector = hashSmall >> 8;
uint8_t const code = hashSmall;
codeStreamBuffer[codeStreamIndex++] = code;
} else {
chunk_t * const found = workset->dictionary[hash];
int8_t const equal_0 = found[0] == chunk;
int8_t const equal_1 = found[1] == chunk;
int8_t const equal_any = equal_0 || equal_1;
*foundSmall = equal_0 ? chunk : *foundSmall;
selector = 4 + equal_0 * 2 + equal_1;
found[1] = found[equal_0];
found[0] = chunk;
uint32_t const code = equal_any ? hash : chunk;
*(uint32_t *) (codeStreamBuffer + codeStreamIndex) = code;
codeStreamIndex += equal_any ? 2 : 4;
}
*prediction = chunk;
}
workset->lastHash = hash;
signature <<= TAG_SIZE_GOOD;
signature |= selector;
}
if (chunkNumber == inputChunksNumber) {
int32_t const tagsInSignature = chunkNumber
% CHUNKS_PER_SIGNATURE_GOOD;
if (tagsInSignature != 0) {
for (int32_t i = tagsInSignature; i < CHUNKS_PER_SIGNATURE_GOOD;
i++) {
signature <<= TAG_SIZE_GOOD;
}
}
}
signaturesBuffer[signatureNumber++] = signature;
}
assert(chunkNumber == inputChunksNumber);
assert(signatureNumber == (inputChunksNumber + CHUNKS_PER_SIGNATURE_GOOD
- 1) / CHUNKS_PER_SIGNATURE_GOOD);
assert(codeStreamIndex >= 0);
assert(codeStreamIndex <= inputChunksNumber * sizeof (chunk_t));
return codeStreamIndex;
}
int compress(bool const fastMode) {
fputc_unlocked(fastMode ? FAST_MODE_MARKER : GOOD_MODE_MARKER, stdout);
uint8_t * const inputBuffer = new uint8_t[MAX_BLOCK_SIZE
+ sizeof (chunk_t)];
chunk_t * inputChunksBuffers[3] = {nullptr, nullptr, nullptr};
chunk_t * signaturesBuffers[3] = {nullptr, nullptr, nullptr};
uint8_t * codeStreamBuffers[3] = {nullptr, nullptr, nullptr};
workset_t * worksets[3] = {nullptr, nullptr, nullptr};
uint32_t inputChunksQuantities[3];
uint32_t signaturesQuantities[3];
uint32_t codeStreamsLengths[3];
inputChunksBuffers[0] = (chunk_t *) inputBuffer;
bool firstBlock = true;
bool fullBlock;
do {
size_t const inputBufferLoaded = loadBlock(inputBuffer, MAX_BLOCK_SIZE);
fullBlock = inputBufferLoaded == MAX_BLOCK_SIZE;
bool const onlyBlock = firstBlock && !fullBlock;
int32_t const iterations = fastMode ? 2 : 3;
for (int32_t iteration = 1; iteration <= iterations; iteration++) {
if (iteration == 1) {
inputChunksQuantities[0] = (inputBufferLoaded + sizeof (chunk_t)
- 1) / sizeof (chunk_t);
} else {
uint32_t const chunksPerSignature = (iteration != 2 || fastMode)
? CHUNKS_PER_SIGNATURE_FAST : CHUNKS_PER_SIGNATURE_GOOD;
inputChunksQuantities[iteration - 1] =
(inputChunksQuantities[iteration - 2]
+ chunksPerSignature - 1) / chunksPerSignature;
}
{
uint32_t const chunksPerSignature = (iteration != 1 || fastMode)
? CHUNKS_PER_SIGNATURE_FAST : CHUNKS_PER_SIGNATURE_GOOD;
signaturesQuantities[iteration - 1] =
(inputChunksQuantities[iteration - 1]
+ chunksPerSignature - 1) / chunksPerSignature;
}
if (firstBlock) {
if (iteration > 1) {
inputChunksBuffers[iteration - 1] =
signaturesBuffers[iteration - 2];
}
signaturesBuffers[iteration - 1] =
new chunk_t[signaturesQuantities[iteration - 1]];
codeStreamBuffers[iteration - 1] =
new uint8_t[(inputChunksQuantities[iteration - 1] + 1)
* sizeof (chunk_t)];
if (onlyBlock && iteration > 1) {
worksets[iteration - 1] = worksets[iteration - 2];
} else {
worksets[iteration - 1] = new workset_t;
}
workset_t * const workset = worksets[iteration - 1];
memset(workset->dictionary, 0, sizeof (workset->dictionary));
memset(workset->predictions, 0, sizeof (workset->predictions));
memset(workset->smallDictionary, 0,
sizeof (workset->smallDictionary));
workset->lastHash = 0;
#ifndef NDEBUG
workset->initialized = true;
#endif
}
#ifndef NDEBUG
assert(worksets[iteration - 1]->initialized);
#endif
codeStreamsLengths[iteration - 1] = (iteration != 1 || fastMode)
? compressBlockFast(inputChunksBuffers[iteration - 1],
inputChunksQuantities[iteration - 1],
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1], worksets[iteration - 1])
: compressBlockGood(inputChunksBuffers[iteration - 1],
inputChunksQuantities[iteration - 1],
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1], worksets[iteration - 1]);
}
int32_t bestNumberOfIterations = 0;
{
uint32_t bestTotalSize = inputChunksQuantities[0]
* sizeof (chunk_t);
for (int32_t iteration = 1; iteration <= iterations; iteration++) {
uint32_t totalSize = signaturesQuantities[iteration - 1]
* sizeof (chunk_t);
for (int32_t iteration2 = 1; iteration2 <= iteration;
iteration2++) {
totalSize += codeStreamsLengths[iteration2 - 1];
}
if (totalSize < bestTotalSize) {
bestNumberOfIterations = iteration;
bestTotalSize = totalSize;
}
}
}
fputc_unlocked(bestNumberOfIterations, stdout);
fputd(inputChunksQuantities[0]);
if (bestNumberOfIterations == 0) {
fwrite_unlocked(inputChunksBuffers[0], sizeof (chunk_t),
inputChunksQuantities[0], stdout);
} else {
fwrite_unlocked(signaturesBuffers[bestNumberOfIterations - 1],
sizeof (chunk_t), signaturesQuantities[
bestNumberOfIterations - 1], stdout);
for (int32_t iteration = bestNumberOfIterations; iteration >= 1;
iteration--) {
fputd(codeStreamsLengths[iteration - 1]);
fwrite_unlocked(codeStreamBuffers[iteration - 1],
sizeof (uint8_t), codeStreamsLengths[iteration - 1],
stdout);
}
}
if (!fullBlock) {
size_t const remainingsLength = inputBufferLoaded
% sizeof (chunk_t);
fputc_unlocked(remainingsLength, stdout);
fwrite_unlocked(inputBuffer + inputBufferLoaded - remainingsLength,
sizeof (uint8_t), remainingsLength, stdout);
}
firstBlock = false;
} while (fullBlock);
return EXIT_SUCCESS;
}
int32_t decompressBlockFast(chunk_t const * const signaturesBuffer,
uint8_t const * const codeStreamBuffer, workset_t * const workset,
chunk_t * const outputChunksBuffer, int32_t const outputChunksNumber) {
int32_t codeStreamIndex = 0;
int32_t chunkNumber = 0;
int32_t signatureNumber = 0;
while (chunkNumber < outputChunksNumber) {
chunk_t signature = signaturesBuffer[signatureNumber++];
int32_t lastChunkInSlice = std::min(outputChunksNumber,
chunkNumber + CHUNKS_PER_SIGNATURE_FAST);
while (chunkNumber < lastChunkInSlice) {
uint32_t const selector = signature >> 30;
if (selector == 3) {
chunk_t const chunk = workset->predictions[workset->lastHash];
outputChunksBuffer[chunkNumber++] = chunk;
workset->lastHash = hashF(chunk);
} else if (selector) {
uint32_t const hash = *(uint16_t*) (codeStreamBuffer
+ codeStreamIndex);
codeStreamIndex += 2;
chunk_t * const found = workset->dictionary[hash];
chunk_t const chunk = found[selector & 1];
uint8_t const equal_0 = selector >> 1;
found[1] = found[equal_0];
found[0] = chunk;
outputChunksBuffer[chunkNumber++] = chunk;
workset->predictions[workset->lastHash] = chunk;
workset->lastHash = hash;
} else {
chunk_t const chunk = *(chunk_t*) (codeStreamBuffer
+ codeStreamIndex);
codeStreamIndex += 4;
outputChunksBuffer[chunkNumber++] = chunk;
uint32_t const hash = hashF(chunk);
chunk_t * const found = workset->dictionary[hash];
found[1] = found[0];
found[0] = chunk;
workset->predictions[workset->lastHash] = chunk;
workset->lastHash = hash;
}
signature <<= 2;
}
}
assert(chunkNumber == outputChunksNumber);
assert(signatureNumber == (outputChunksNumber + CHUNKS_PER_SIGNATURE_FAST
- 1) / CHUNKS_PER_SIGNATURE_FAST);
assert(codeStreamIndex >= 0);
assert(codeStreamIndex <= outputChunksNumber * sizeof (chunk_t));
return codeStreamIndex;
}
int32_t decompressBlockGood(chunk_t const * const signaturesBuffer,
uint8_t const * const codeStreamBuffer, workset_t * const workset,
chunk_t * const outputChunksBuffer, int32_t const outputChunksNumber) {
int32_t codeStreamIndex = 0;
int32_t chunkNumber = 0;
int32_t signatureNumber = 0;
while (chunkNumber < outputChunksNumber) {
chunk_t signature = signaturesBuffer[signatureNumber++] << 2;
int32_t lastChunkInSlice = std::min(outputChunksNumber,
chunkNumber + CHUNKS_PER_SIGNATURE_GOOD);
while (chunkNumber < lastChunkInSlice) {
uint32_t const selector = signature >> 29;
if (selector == 7) {
chunk_t const chunk = workset->predictions[workset->lastHash];
outputChunksBuffer[chunkNumber++] = chunk;
workset->lastHash = hashF(chunk);
} else if (selector < 4) {
uint32_t const hashSmall = (selector << 8)
| codeStreamBuffer[codeStreamIndex];
codeStreamIndex++;
chunk_t const chunk = workset->smallDictionary[hashSmall];
outputChunksBuffer[chunkNumber++] = chunk;
workset->predictions[workset->lastHash] = chunk;
workset->lastHash = hashF(chunk);
} else if (selector & 3) {
uint32_t const hash = *(uint16_t*) (codeStreamBuffer
+ codeStreamIndex);
codeStreamIndex += 2;
chunk_t * const found = workset->dictionary[hash];
chunk_t const chunk = found[selector & 1];
uint8_t const equal_0 = !(selector & 1);
found[1] = found[equal_0];
found[0] = chunk;
uint32_t const hashSmall = reduceHash(hash);
workset->smallDictionary[hashSmall] = equal_0 ? chunk
: workset->smallDictionary[hashSmall];
outputChunksBuffer[chunkNumber++] = chunk;
workset->predictions[workset->lastHash] = chunk;
workset->lastHash = hash;
} else {
chunk_t const chunk = *(chunk_t*) (codeStreamBuffer
+ codeStreamIndex);
codeStreamIndex += 4;
outputChunksBuffer[chunkNumber++] = chunk;
uint32_t const hash = hashF(chunk);
chunk_t * const found = workset->dictionary[hash];
found[1] = found[0];
found[0] = chunk;
workset->predictions[workset->lastHash] = chunk;
workset->lastHash = hash;
}
signature <<= 3;
}
}
assert(chunkNumber == outputChunksNumber);
assert(signatureNumber == (outputChunksNumber + CHUNKS_PER_SIGNATURE_GOOD
- 1) / CHUNKS_PER_SIGNATURE_GOOD);
assert(codeStreamIndex >= 0);
assert(codeStreamIndex <= outputChunksNumber * sizeof (chunk_t));
return codeStreamIndex;
}
int decompress() {
int32_t const modeMarker = fgetc_unlocked(stdin);
if (modeMarker != FAST_MODE_MARKER && modeMarker != GOOD_MODE_MARKER) {
return err("Invalid mode marker.");
}
bool const fastMode = modeMarker == FAST_MODE_MARKER;
uint8_t * const outputBuffer = new uint8_t[MAX_BLOCK_SIZE
+ sizeof (chunk_t)];
chunk_t * outputChunksBuffers[3] = {nullptr, nullptr, nullptr};
chunk_t * signaturesBuffers[3] = {nullptr, nullptr, nullptr};
uint8_t * codeStreamBuffers[3] = {nullptr, nullptr, nullptr};
workset_t * worksets[3] = {nullptr, nullptr, nullptr};
uint32_t outputChunksQuantities[3];
uint32_t signaturesQuantities[3];
uint32_t codeStreamsLengths[3];
outputChunksBuffers[0] = (chunk_t *) outputBuffer;
bool firstBlock = true;
bool fullBlock;
do {
int32_t const maxIterations = fastMode ? 2 : 3;
int32_t const iterations = fgetc_unlocked(stdin);
assert(iterations >= 0);
assert(iterations <= maxIterations);
//fprintf(stderr, "%d\n", iterations);
fgetd(outputChunksQuantities + 0);
fullBlock = outputChunksQuantities[0] * sizeof (chunk_t)
== MAX_BLOCK_SIZE;
bool const onlyBlock = firstBlock && !fullBlock;
if (!fullBlock && iterations == 0) {
loadBlock((uint8_t*) outputChunksBuffers[0],
outputChunksQuantities[0] * sizeof (chunk_t));
} else {
for (int32_t iteration = 1; iteration <= maxIterations;
iteration++) {
if (iteration > 1) {
uint32_t const chunksPerSignature = (iteration != 2
|| fastMode) ? CHUNKS_PER_SIGNATURE_FAST
: CHUNKS_PER_SIGNATURE_GOOD;
outputChunksQuantities[iteration - 1] =
(outputChunksQuantities[iteration - 2]
+ chunksPerSignature - 1) / chunksPerSignature;
}
{
uint32_t const chunksPerSignature = (iteration != 1
|| fastMode) ? CHUNKS_PER_SIGNATURE_FAST
: CHUNKS_PER_SIGNATURE_GOOD;
signaturesQuantities[iteration - 1] =
(outputChunksQuantities[iteration - 1]
+ chunksPerSignature - 1) / chunksPerSignature;
}
}
if (firstBlock) {
int32_t const allocatedIterations = fullBlock
? maxIterations : iterations;
for (int32_t iteration = 1; iteration <= allocatedIterations;
iteration++) {
if (iteration > 1) {
outputChunksBuffers[iteration - 1] =
signaturesBuffers[iteration - 2];
}
signaturesBuffers[iteration - 1] =
new chunk_t[signaturesQuantities[iteration - 1]];
codeStreamBuffers[iteration - 1] =
new uint8_t[(outputChunksQuantities[iteration - 1]
+ 1) * sizeof (chunk_t)];
if (onlyBlock && iteration > 1) {
worksets[iteration - 1] = worksets[iteration - 2];
} else {
worksets[iteration - 1] = new workset_t;
}
#ifndef NDEBUG
worksets[iteration - 1]->initialized = false;
#endif
}
}
if (iterations == 0) {
fread_unlocked(outputChunksBuffers[0], sizeof (chunk_t),
outputChunksQuantities[0], stdin);
} else {
for (int32_t iteration = iterations; iteration >= 1;
iteration--) {
if (iteration == iterations) {
fread_unlocked(signaturesBuffers[iteration - 1],
sizeof (chunk_t),
signaturesQuantities[iteration - 1], stdin);
}
fgetd(codeStreamsLengths + iteration - 1);
fread_unlocked(codeStreamBuffers[iteration - 1],
sizeof (uint8_t), codeStreamsLengths[iteration - 1],
stdin);
if (firstBlock) {
workset_t * const workset = worksets[iteration - 1];
memset(workset->dictionary, 0,
sizeof (workset->dictionary));
memset(workset->predictions, 0,
sizeof (workset->predictions));
memset(workset->smallDictionary, 0,
sizeof (workset->smallDictionary));
workset->lastHash = 0;
#ifndef NDEBUG
workset->initialized = true;
#endif
}
int32_t decodedCodeStreamLength;
#ifndef NDEBUG
assert(worksets[iteration - 1]->initialized);
#endif
if (iteration != 1 || fastMode) {
decodedCodeStreamLength = decompressBlockFast(
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1],
worksets[iteration - 1],
outputChunksBuffers[iteration - 1],
outputChunksQuantities[iteration - 1]);
} else {
decodedCodeStreamLength = decompressBlockGood(
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1],
worksets[iteration - 1],
outputChunksBuffers[iteration - 1],
outputChunksQuantities[iteration - 1]);
}
if (decodedCodeStreamLength
!= codeStreamsLengths[iteration - 1]) {
fprintf(stderr, "%d %d\n", decodedCodeStreamLength,
codeStreamsLengths[iteration - 1]);
}
assert(decodedCodeStreamLength
== codeStreamsLengths[iteration - 1]);
}
}
if (fullBlock) {
for (int32_t iteration = iterations + 1; iteration
<= maxIterations; iteration++) {
if (firstBlock) {
workset_t * const workset = worksets[iteration - 1];
memset(workset->dictionary, 0,
sizeof (workset->dictionary));
memset(workset->predictions, 0,
sizeof (workset->predictions));
memset(workset->smallDictionary, 0,
sizeof (workset->smallDictionary));
workset->lastHash = 0;
#ifndef NDEBUG
workset->initialized = true;
#endif
}
#ifndef NDEBUG
assert(worksets[iteration - 1]->initialized);
#endif
if (iteration != 1 || fastMode) {
compressBlockFast(outputChunksBuffers[iteration - 1],
outputChunksQuantities[iteration - 1],
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1],
worksets[iteration - 1]);
} else {
compressBlockGood(outputChunksBuffers[iteration - 1],
outputChunksQuantities[iteration - 1],
signaturesBuffers[iteration - 1],
codeStreamBuffers[iteration - 1],
worksets[iteration - 1]);
}
}
}
}
fwrite_unlocked(outputChunksBuffers[0], sizeof (chunk_t),
outputChunksQuantities[0], stdout);
if (!fullBlock) {
size_t const remainingsLength = fgetc(stdin);
uint8_t smallBuffer[4];
fread_unlocked(smallBuffer, sizeof (uint8_t), remainingsLength,
stdin);
fwrite_unlocked(smallBuffer, sizeof (uint8_t), remainingsLength,
stdout);
}
firstBlock = false;
} while (fullBlock);
return EXIT_SUCCESS;
}
int main(int const argc, char const * const * const argv) {
err("Chunker -- fast compression utility");
err("Author: Piotr Tarsa ( http://github.com/tarsa )");
err("Release date: January, 2013");
err("");
if (argc != 2 || (strcmp(argv[1], "cf") != 0 && strcmp(argv[1], "cx") != 0
&& strcmp(argv[1], "d") != 0)) {
err("Usage: <program name> [cf|cx|d] < input > output");
return EXIT_FAILURE;
}
int result;
if (strcmp(argv[1], "d") == 0) {
result = decompress();
} else {
result = compress(strcmp(argv[1], "cf") == 0);
}
err("Done.");
return result;
}
Szkopuł w tym, że jak wyłączę "rekurencyjną" kompresję sygnatur, podmieniając w metodzie compress linijkę:
int32_t const iterations = fastMode ? 2 : 3;
na:
int32_t const iterations = fastMode ? 1 : 1;
To wtedy hula.
Próbowałem działać Valgrindem i nie wykrył żadnych problemów. Dodałem trochę assertów i puściłem - one też generalnie nie pomogły wiele, bo dostaję tylko że (przy próbie na pliku enwik9 stąd: http://mattmahoney.net/dc/textdata.html ):
chunker: chunker.cpp:588: int decompress(): Assertion
decodedCodeStreamLength == codeStreamsLengths[iteration - 1]' failed.`
Kod mam zamiar opublikować (na GitHubie). Jak ktoś znajdzie krytycznego buga to w nagrodę dam mu creditsa, tzn napiszę w komentarzu u góry kodu, że znalazł buga :)
Kompresja działa w taki sposób, że strumień jest dzielony na bloki. Dla każdego bloku po kompresji powstają dwa bloki wyjściowe: blok sygnatur i blok innych danych (w kodzie nazwany codeStream). Jeśli iterations jest większe od 1 to blok sygnatur jest kompresowany rekurencyjnie. I właśnie ta kompresja rekurencyjna nie działa dobrze.
Jak algorytm działa na niskim poziomie? Dane są dzielone na 4-bajtowe kawałki, następnie hashowane. Jest jeden lub dwa słowniki wielopoziomowe (tzn dla danego hasha jest cały zbiór elementów w słowniku) oraz tablica predykcji kawałków (która podaje po prostu następny kawałek z poprzedniego wystąpienia aktualnie przerabianego kawałka). Sygnatura składa się z tagów, gdzie każdy tag odpowiada jednemu kawałkowi i opisuje czy kawałek został znaleziony w słowniku (a jeśli tak to w którym słowniku i którym elemencie w zbiorze), tabeli predykcji albo nigdzie (brak kompresji). Do bloku codeStream idą zarówno dane nieskompresowane jak i hashe (małe i duże).
Może jest tu jakiś trywialny błąd którego nie zauważyłem. Albo może ktoś ma pomysł jak by go szybko znaleźć?