:: commit 35691387c6b102b65d118c40aee1e06e795cc0b3

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-06 17:39

parents: 9a90a883b3

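more accurate compression overhead on random data: shrink the worst-case buffer bound from `size + size / 4` to `size + size / 50 + 16` and remove the SRT and text-detection modules (srt.h/srt.c, txt.h/txt.c)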

diff --git a/Makefile.in b/Makefile.in
index 56594e9..f5f5832 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -4,8 +4,8 @@ CFLAGS=-O2 -march=native -mtune=native -flto -Iinclude -g3
 
 .PHONY: all clean format install cloc
 
-LIBBZ3_OBJECTS=obj/libsais.o obj/crc32.o obj/srt.o obj/rle.o \
-               obj/cm.o obj/libbz3.o obj/txt.o obj/lzp.o
+LIBBZ3_OBJECTS=obj/libsais.o obj/crc32.o obj/rle.o obj/cm.o \
+               obj/libbz3.o obj/lzp.o
 
 all: bzip3 bzip3.so
 
diff --git a/include/libbz3.h b/include/libbz3.h
index 234a6b9..1b544c3 100644
--- a/include/libbz3.h
+++ b/include/libbz3.h
@@ -55,14 +55,14 @@ void bz3_free(struct bz3_state * state);
 
 /**
  * @brief Encode a single block. Returns the amount of bytes written to `buffer'.
- * `buffer' must be able to hold at least `size + size / 4' bytes. The size must not
+ * `buffer' must be able to hold at least `size + size / 50 + 16' bytes. The size must not
  * exceed the block size associated with the state.
  */
 int32_t bz3_encode_block(struct bz3_state * state, uint8_t * buffer, int32_t size);
 
 /**
  * @brief Decode a single block.
- * `buffer' must be able to hold at least `size + size / 4' bytes. The size must not exceed
+ * `buffer' must be able to hold at least `size + size / 50 + 16' bytes. The size must not exceed
  * the block size associated with the state.
  * @param size The size of the compressed data in `buffer'
  * @param orig_size The original size of the data before compression.
diff --git a/include/srt.h b/include/srt.h
deleted file mode 100644
index 026c781..0000000
--- a/include/srt.h
+++ /dev/null
@@ -1,40 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _SRT_H
-#define _SRT_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-#include "common.h"
-
-struct srt_state {
-    u32 freqs[256];
-    u8 symbols[256];
-    u32 r2s[256];
-    u32 s2r[256];
-    u32 buckets[256];
-    u32 bucket_ends[256];
-};
-
-u32 srt_encode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count);
-u32 srt_decode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count);
-
-#endif
diff --git a/include/txt.h b/include/txt.h
deleted file mode 100644
index eac6d12..0000000
--- a/include/txt.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _TXT_H
-#define _TXT_H
-
-#include "common.h"
-
-int is_text(const u8 * buffer, s32 length);
-
-#endif
diff --git a/src/libbz3.c b/src/libbz3.c
index 6a141a0..72b912d 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -28,8 +28,6 @@
 #include "libsais.h"
 #include "lzp.h"
 #include "rle.h"
-#include "srt.h"
-#include "txt.h"
 
 #define LZP_DICTIONARY 18
 #define LZP_MIN_MATCH 40
@@ -38,7 +36,6 @@ struct bz3_state {
     u8 * swap_buffer;
     s32 block_size;
     s32 *sais_array, *lzp_lut;
-    struct srt_state * srt_state;
     state * cm_state;
     s8 last_error;
 };
@@ -74,14 +71,13 @@ struct bz3_state * bz3_new(s32 block_size) {
     }
 
     bz3_state->cm_state = malloc(sizeof(state));
-    bz3_state->srt_state = malloc(sizeof(struct srt_state));
 
-    bz3_state->swap_buffer = malloc(block_size + block_size / 4);
+    bz3_state->swap_buffer = malloc(block_size + block_size / 50 + 16);
     bz3_state->sais_array = malloc(block_size * sizeof(s32));
 
     bz3_state->lzp_lut = calloc(1 << LZP_DICTIONARY, sizeof(s32));
 
-    if (!bz3_state->cm_state || !bz3_state->srt_state || !bz3_state->swap_buffer || !bz3_state->sais_array ||
+    if (!bz3_state->cm_state || !bz3_state->swap_buffer || !bz3_state->sais_array ||
         !bz3_state->lzp_lut) {
         return NULL;
     }
@@ -96,7 +92,6 @@ struct bz3_state * bz3_new(s32 block_size) {
 void bz3_free(struct bz3_state * state) {
     free(state->swap_buffer);
     free(state->sais_array);
-    free(state->srt_state);
     free(state->cm_state);
     free(state->lzp_lut);
     free(state);
diff --git a/src/main.c b/src/main.c
index 10271d1..ececc58 100644
--- a/src/main.c
+++ b/src/main.c
@@ -158,7 +158,7 @@ int main(int argc, char * argv[]) {
         return 1;
     }
 
-    u8 * buffer = malloc(block_size + block_size / 4);
+    u8 * buffer = malloc(block_size + block_size / 50 + 16);
 
     if (mode == 1) {
         s32 read_count;
diff --git a/src/srt.c b/src/srt.c
deleted file mode 100644
index c5c1e38..0000000
--- a/src/srt.c
+++ /dev/null
@@ -1,152 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "srt.h"
-
-static const s32 MAX_HDR_SIZE = 4 * 256;
-
-static s32 preprocess(const u32 * freqs, u8 * symbols) {
-    s32 nb_symbols = 0;
-    for (s32 i = 0; i < 256; i++)
-        if (freqs[i] > 0) symbols[nb_symbols++] = i;
-    u32 h = 4;
-    while (h < nb_symbols) h = h * 3 + 1;
-    while (1) {
-        h /= 3;
-        for (u32 i = h; i < nb_symbols; i++) {
-            const s32 t = symbols[i];
-            s32 b = i - h;
-            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
-                symbols[b + h] = symbols[b];
-                b -= h;
-            }
-            symbols[b + h] = t;
-        }
-        if (h == 1) break;
-    }
-    return nb_symbols;
-}
-
-static s32 encode_header(u32 * freqs, u8 * dst) {
-    u32 idx = 0;
-    for (s32 i = 0; i < 256; i++) {
-        u32 f = freqs[i];
-        while (f >= 128) {
-            dst[idx++] = (u8)(f | 0x80);
-            f >>= 7;
-        }
-        dst[idx++] = (u8)f;
-    }
-    return idx;
-}
-
-static s32 decode_header(u8 * src, u32 * freqs) {
-    u32 idx = 0;
-    for (s32 i = 0; i < 256; i++) {
-        s32 val = src[idx++] & 0xFF;
-        s32 res = val & 0x7F;
-        s32 shift = 7;
-        while (val >= 128) {
-            val = src[idx++] & 0xFF;
-            res |= (val & 0x7F) << shift;
-            if (shift > 21) break;
-            shift += 7;
-        }
-        freqs[i] = res;
-    }
-    return idx;
-}
-
-u32 srt_encode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count) {
-    // Find first symbols and build a histogram.
-    for (s32 i = 0; i < 256; i++) mtf->freqs[i] = 0;
-    for (u32 i = 0, b = 0; i < count;) {
-        if (mtf->freqs[src[i]] == 0) {
-            mtf->r2s[b] = src[i];
-            mtf->s2r[src[i]] = b;
-            b++;
-        }
-        u32 j = i + 1;
-        while (j < count && src[j] == src[i]) j++;
-        mtf->freqs[src[i]] += j - i;
-        i = j;
-    }
-
-    s32 n_symbols = preprocess(mtf->freqs, mtf->symbols);
-    for (u32 i = 0, bucket_pos = 0; i < n_symbols; i++) {
-        mtf->buckets[mtf->symbols[i]] = bucket_pos;
-        bucket_pos += mtf->freqs[mtf->symbols[i]];
-    }
-
-    const u32 header_size = encode_header(mtf->freqs, dst);
-    const s32 dst_idx = header_size;
-    for (u32 i = 0; i < count;) {
-        const s32 c = src[i] & 0xFF;
-        s32 r = mtf->s2r[c] & 0xFF;
-        u32 p = mtf->buckets[c];
-        dst[dst_idx + p++] = r;
-        if (r != 0) {
-            do {
-                mtf->r2s[r] = mtf->r2s[r - 1];
-                mtf->s2r[mtf->r2s[r]] = r;
-                r--;
-            } while (r != 0);
-            mtf->r2s[0] = c;
-            mtf->s2r[c] = 0;
-        }
-        i++;
-        while (i < count && src[i] == c) {
-            dst[dst_idx + p++] = 0;
-            i++;
-        }
-        mtf->buckets[c] = p;
-    }
-    return count + header_size;
-}
-
-u32 srt_decode(struct srt_state * mtf, u8 * src, u8 * dst, u32 count) {
-    const u32 header_size = decode_header(src, mtf->freqs);
-    const u32 src_idx = header_size;
-    s32 nb_symbols = preprocess(mtf->freqs, mtf->symbols);
-    for (u32 i = 0, bucket_pos = 0; i < nb_symbols; i++) {
-        const s32 c = mtf->symbols[i] & 0xFF;
-        mtf->r2s[src[src_idx + bucket_pos] & 0xFF] = c;
-        mtf->buckets[c] = bucket_pos + 1;
-        bucket_pos += mtf->freqs[c];
-        mtf->bucket_ends[c] = bucket_pos;
-    }
-    u32 c = mtf->r2s[0];
-    for (u32 i = 0; i < count; i++) {
-        dst[i] = c;
-        if (mtf->buckets[c] < mtf->bucket_ends[c]) {
-            const s32 r = src[src_idx + mtf->buckets[c]] & 0xFF;
-            mtf->buckets[c]++;
-            if (r == 0) continue;
-            for (s32 s = 0; s < r; s++) mtf->r2s[s] = mtf->r2s[s + 1];
-            mtf->r2s[r] = c;
-            c = mtf->r2s[0];
-        } else {
-            if (nb_symbols == 1) continue;
-            nb_symbols--;
-            for (s32 s = 0; s < nb_symbols; s++) mtf->r2s[s] = mtf->r2s[s + 1];
-            c = mtf->r2s[0];
-        }
-    }
-    return count - header_size;
-}
diff --git a/src/txt.c b/src/txt.c
deleted file mode 100644
index 7c651aa..0000000
--- a/src/txt.c
+++ /dev/null
@@ -1,100 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "txt.h"
-
-#include <math.h>
-
-#ifdef STANDALONE
-    #include <stdio.h>
-#endif
-
-int is_text(const u8 * data, s32 len) {
-#ifdef STANDALONE
-    printf("Data of length %d.\n");
-#endif
-
-    s32 histogram[256] = { 0 };
-    for (s32 i = 0; i < len; i++) histogram[data[i]]++;
-
-    // Text criterions:
-    // 1. Shannon entropy is between 4.5 and 5.2.
-    // 2. Majority of the document must be uppercase/lowercase numbers.
-    // 3. The file has a proper amount of whitespace
-    // -----
-
-    // Step 1
-    double entropy = 0;
-    for (s32 i = 0; i < 256; i++) {
-        double p = (double)histogram[i] / len;
-        if (p == 0) continue;
-        entropy += p * log2(p);
-    }
-    entropy = -entropy;
-
-#ifdef STANDALONE
-    printf("Shannon entropy: %lf\n", entropy);
-#endif
-
-    if (entropy > 5.4 || entropy < 4.5) return 0;
-
-    // Step 2
-    s32 letters = 0;
-    s32 whitespace = 0;
-    for (s32 i = 0; i < 256; i++) {
-        if (i >= 'A' && i <= 'Z')
-            letters += histogram[i];
-        else if (i >= 'a' && i <= 'z')
-            letters += histogram[i];
-        else if (i >= '0' && i <= '9')
-            letters += histogram[i];
-        else if (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
-            whitespace += histogram[i];
-    }
-
-#ifdef STANDALONE
-    printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace,
-           (double)(letters + whitespace) / len);
-#endif
-
-    if ((double)(letters + whitespace) / len < 0.6) return 0;
-
-    // Step 3
-    double letters_ratio = (double)letters / whitespace;
-    if (letters_ratio < 2 || letters_ratio > 9) return 0;
-
-#ifdef STANDALONE
-    printf("Letter to whitespace ratio: %lf\n", letters_ratio);
-#endif
-
-    return 1;
-}
-
-#ifdef STANDALONE
-int main(int argc, char * argv[]) {
-    FILE * f = fopen(argv[1], "rb");
-    fseek(f, 0, SEEK_END);
-    s32 len = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    u8 * data = malloc(len);
-    fread(data, 1, len, f);
-    fclose(f);
-    printf("%d\n", is_text(data, len));
-}
-#endif
tab: 248 wrap: offon