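Use a more accurate bound for the compression overhead on random data: buffers must now hold `size + size / 50 + 16` bytes instead of `size + size / 4`. Also remove the srt and txt modules.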
diff --git a/Makefile.in b/Makefile.in
index 56594e9..f5f5832 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -4,8 +4,8 @@ CFLAGS=-O2 -march=native -mtune=native -flto -Iinclude -g3
.PHONY: all clean format install cloc
-LIBBZ3_OBJECTS=obj/libsais.o obj/crc32.o obj/srt.o obj/rle.o \
- obj/cm.o obj/libbz3.o obj/txt.o obj/lzp.o
+LIBBZ3_OBJECTS=obj/libsais.o obj/crc32.o obj/rle.o obj/cm.o \
+ obj/libbz3.o obj/lzp.o
all: bzip3 bzip3.so
diff --git a/include/libbz3.h b/include/libbz3.h
index 234a6b9..1b544c3 100644
--- a/include/libbz3.h
+++ b/include/libbz3.h
@@ -55,14 +55,14 @@ void bz3_free(struct bz3_state * state);
/**
* @brief Encode a single block. Returns the amount of bytes written to `buffer'.
- * `buffer' must be able to hold at least `size + size / 4' bytes. The size must not
+ * `buffer' must be able to hold at least `size + size / 50 + 16' bytes. The size must not
* exceed the block size associated with the state.
*/
int32_t bz3_encode_block(struct bz3_state * state, uint8_t * buffer, int32_t size);
/**
* @brief Decode a single block.
- * `buffer' must be able to hold at least `size + size / 4' bytes. The size must not exceed
+ * `buffer' must be able to hold at least `size + size / 50 + 16' bytes. The size must not exceed
* the block size associated with the state.
* @param size The size of the compressed data in `buffer'
* @param orig_size The original size of the data before compression.
diff --git a/include/srt.h b/include/srt.h
deleted file mode 100644
index 026c781..0000000
--- a/include/srt.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _SRT_H
-#define _SRT_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-#include "common.h"
-
-struct srt_state {
- u32 freqs[256];
- u8 symbols[256];
- u32 r2s[256];
- u32 s2r[256];
- u32 buckets[256];
- u32 bucket_ends[256];
-};
-
-u32 srt_encode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count);
-u32 srt_decode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count);
-
-#endif
diff --git a/include/txt.h b/include/txt.h
deleted file mode 100644
index eac6d12..0000000
--- a/include/txt.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _TXT_H
-#define _TXT_H
-
-#include "common.h"
-
-int is_text(const u8 * buffer, s32 length);
-
-#endif
diff --git a/src/libbz3.c b/src/libbz3.c
index 6a141a0..72b912d 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -28,8 +28,6 @@
#include "libsais.h"
#include "lzp.h"
#include "rle.h"
-#include "srt.h"
-#include "txt.h"
#define LZP_DICTIONARY 18
#define LZP_MIN_MATCH 40
@@ -38,7 +36,6 @@ struct bz3_state {
u8 * swap_buffer;
s32 block_size;
s32 *sais_array, *lzp_lut;
- struct srt_state * srt_state;
state * cm_state;
s8 last_error;
};
@@ -74,14 +71,13 @@ struct bz3_state * bz3_new(s32 block_size) {
}
bz3_state->cm_state = malloc(sizeof(state));
- bz3_state->srt_state = malloc(sizeof(struct srt_state));
- bz3_state->swap_buffer = malloc(block_size + block_size / 4);
+ bz3_state->swap_buffer = malloc(block_size + block_size / 50 + 16);
bz3_state->sais_array = malloc(block_size * sizeof(s32));
bz3_state->lzp_lut = calloc(1 << LZP_DICTIONARY, sizeof(s32));
- if (!bz3_state->cm_state || !bz3_state->srt_state || !bz3_state->swap_buffer || !bz3_state->sais_array ||
+ if (!bz3_state->cm_state || !bz3_state->swap_buffer || !bz3_state->sais_array ||
!bz3_state->lzp_lut) {
return NULL;
}
@@ -96,7 +92,6 @@ struct bz3_state * bz3_new(s32 block_size) {
void bz3_free(struct bz3_state * state) {
free(state->swap_buffer);
free(state->sais_array);
- free(state->srt_state);
free(state->cm_state);
free(state->lzp_lut);
free(state);
diff --git a/src/main.c b/src/main.c
index 10271d1..ececc58 100644
--- a/src/main.c
+++ b/src/main.c
@@ -158,7 +158,7 @@ int main(int argc, char * argv[]) {
return 1;
}
- u8 * buffer = malloc(block_size + block_size / 4);
+ u8 * buffer = malloc(block_size + block_size / 50 + 16);
if (mode == 1) {
s32 read_count;
diff --git a/src/srt.c b/src/srt.c
deleted file mode 100644
index c5c1e38..0000000
--- a/src/srt.c
+++ /dev/null
@@ -1,152 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "srt.h"
-
-static const s32 MAX_HDR_SIZE = 4 * 256;
-
-static s32 preprocess(const u32 * freqs, u8 * symbols) {
- s32 nb_symbols = 0;
- for (s32 i = 0; i < 256; i++)
- if (freqs[i] > 0) symbols[nb_symbols++] = i;
- u32 h = 4;
- while (h < nb_symbols) h = h * 3 + 1;
- while (1) {
- h /= 3;
- for (u32 i = h; i < nb_symbols; i++) {
- const s32 t = symbols[i];
- s32 b = i - h;
- while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
- symbols[b + h] = symbols[b];
- b -= h;
- }
- symbols[b + h] = t;
- }
- if (h == 1) break;
- }
- return nb_symbols;
-}
-
-static s32 encode_header(u32 * freqs, u8 * dst) {
- u32 idx = 0;
- for (s32 i = 0; i < 256; i++) {
- u32 f = freqs[i];
- while (f >= 128) {
- dst[idx++] = (u8)(f | 0x80);
- f >>= 7;
- }
- dst[idx++] = (u8)f;
- }
- return idx;
-}
-
-static s32 decode_header(u8 * src, u32 * freqs) {
- u32 idx = 0;
- for (s32 i = 0; i < 256; i++) {
- s32 val = src[idx++] & 0xFF;
- s32 res = val & 0x7F;
- s32 shift = 7;
- while (val >= 128) {
- val = src[idx++] & 0xFF;
- res |= (val & 0x7F) << shift;
- if (shift > 21) break;
- shift += 7;
- }
- freqs[i] = res;
- }
- return idx;
-}
-
-u32 srt_encode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count) {
- // Find first symbols and build a histogram.
- for (s32 i = 0; i < 256; i++) mtf->freqs[i] = 0;
- for (u32 i = 0, b = 0; i < count;) {
- if (mtf->freqs[src[i]] == 0) {
- mtf->r2s[b] = src[i];
- mtf->s2r[src[i]] = b;
- b++;
- }
- u32 j = i + 1;
- while (j < count && src[j] == src[i]) j++;
- mtf->freqs[src[i]] += j - i;
- i = j;
- }
-
- s32 n_symbols = preprocess(mtf->freqs, mtf->symbols);
- for (u32 i = 0, bucket_pos = 0; i < n_symbols; i++) {
- mtf->buckets[mtf->symbols[i]] = bucket_pos;
- bucket_pos += mtf->freqs[mtf->symbols[i]];
- }
-
- const u32 header_size = encode_header(mtf->freqs, dst);
- const s32 dst_idx = header_size;
- for (u32 i = 0; i < count;) {
- const s32 c = src[i] & 0xFF;
- s32 r = mtf->s2r[c] & 0xFF;
- u32 p = mtf->buckets[c];
- dst[dst_idx + p++] = r;
- if (r != 0) {
- do {
- mtf->r2s[r] = mtf->r2s[r - 1];
- mtf->s2r[mtf->r2s[r]] = r;
- r--;
- } while (r != 0);
- mtf->r2s[0] = c;
- mtf->s2r[c] = 0;
- }
- i++;
- while (i < count && src[i] == c) {
- dst[dst_idx + p++] = 0;
- i++;
- }
- mtf->buckets[c] = p;
- }
- return count + header_size;
-}
-
-u32 srt_decode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count) {
- const u32 header_size = decode_header(src, mtf->freqs);
- const u32 src_idx = header_size;
- s32 nb_symbols = preprocess(mtf->freqs, mtf->symbols);
- for (u32 i = 0, bucket_pos = 0; i < nb_symbols; i++) {
- const s32 c = mtf->symbols[i] & 0xFF;
- mtf->r2s[src[src_idx + bucket_pos] & 0xFF] = c;
- mtf->buckets[c] = bucket_pos + 1;
- bucket_pos += mtf->freqs[c];
- mtf->bucket_ends[c] = bucket_pos;
- }
- u32 c = mtf->r2s[0];
- for (u32 i = 0; i < count; i++) {
- dst[i] = c;
- if (mtf->buckets[c] < mtf->bucket_ends[c]) {
- const s32 r = src[src_idx + mtf->buckets[c]] & 0xFF;
- mtf->buckets[c]++;
- if (r == 0) continue;
- for (s32 s = 0; s < r; s++) mtf->r2s[s] = mtf->r2s[s + 1];
- mtf->r2s[r] = c;
- c = mtf->r2s[0];
- } else {
- if (nb_symbols == 1) continue;
- nb_symbols--;
- for (s32 s = 0; s < nb_symbols; s++) mtf->r2s[s] = mtf->r2s[s + 1];
- c = mtf->r2s[0];
- }
- }
- return count - header_size;
-}
diff --git a/src/txt.c b/src/txt.c
deleted file mode 100644
index 7c651aa..0000000
--- a/src/txt.c
+++ /dev/null
@@ -1,100 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "txt.h"
-
-#include <math.h>
-
-#ifdef STANDALONE
- #include <stdio.h>
-#endif
-
-int is_text(const u8 * data, s32 len) {
-#ifdef STANDALONE
- printf("Data of length %d.\n");
-#endif
-
- s32 histogram[256] = { 0 };
- for (s32 i = 0; i < len; i++) histogram[data[i]]++;
-
- // Text criterions:
- // 1. Shannon entropy is between 4.5 and 5.2.
- // 2. Majority of the document must be uppercase/lowercase numbers.
- // 3. The file has a proper amount of whitespace
- // -----
-
- // Step 1
- double entropy = 0;
- for (s32 i = 0; i < 256; i++) {
- double p = (double)histogram[i] / len;
- if (p == 0) continue;
- entropy += p * log2(p);
- }
- entropy = -entropy;
-
-#ifdef STANDALONE
- printf("Shannon entropy: %lf\n", entropy);
-#endif
-
- if (entropy > 5.4 || entropy < 4.5) return 0;
-
- // Step 2
- s32 letters = 0;
- s32 whitespace = 0;
- for (s32 i = 0; i < 256; i++) {
- if (i >= 'A' && i <= 'Z')
- letters += histogram[i];
- else if (i >= 'a' && i <= 'z')
- letters += histogram[i];
- else if (i >= '0' && i <= '9')
- letters += histogram[i];
- else if (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
- whitespace += histogram[i];
- }
-
-#ifdef STANDALONE
- printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace,
- (double)(letters + whitespace) / len);
-#endif
-
- if ((double)(letters + whitespace) / len < 0.6) return 0;
-
- // Step 3
- double letters_ratio = (double)letters / whitespace;
- if (letters_ratio < 2 || letters_ratio > 9) return 0;
-
-#ifdef STANDALONE
- printf("Letter to whitespace ratio: %lf\n", letters_ratio);
-#endif
-
- return 1;
-}
-
-#ifdef STANDALONE
-int main(int argc, char * argv[]) {
- FILE * f = fopen(argv[1], "rb");
- fseek(f, 0, SEEK_END);
- s32 len = ftell(f);
- fseek(f, 0, SEEK_SET);
- u8 * data = malloc(len);
- fread(data, 1, len, f);
- fclose(f);
- printf("%d\n", is_text(data, len));
-}
-#endif
