:: commit eff120cd7d6795645de23b3a0a0b64e5f6337dd5

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-02 14:36

parents: 9708809c42

libbz3 barebones

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 7adac3d..edb7021 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,6 +3,8 @@
         "srt.h": "c",
         "rle.h": "c",
         "cm.h": "c",
-        "stdint.h": "c"
+        "stdint.h": "c",
+        "common.h": "c",
+        "libsais.h": "c"
     }
 }
\ No newline at end of file
diff --git a/Makefile b/Makefile
index cf0f0d2..ab7b001 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
 .PHONY: all clean format install
 
 OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
-        obj/cm.o
+        obj/cm.o obj/libbz3.o
 
 all: bzip3
 
diff --git a/include/libbz3.h b/include/libbz3.h
new file mode 100644
index 0000000..10cdde3
--- /dev/null
+++ b/include/libbz3.h
@@ -0,0 +1,79 @@
+
+#ifndef _LIBBZ3_H
+#define _LIBBZ3_H
+
+#include "cm.h"
+#include "common.h"
+#include "crc32.h"
+#include "libsais.h"
+#include "mtf.h"
+#include "rle.h"
+#include "srt.h"
+/* Status codes kept per state and returned by get_last_error(). */
+#define BZ3_OK 0
+#define BZ3_ERR_OUT_OF_BOUNDS -1
+#define BZ3_ERR_BWT -2
+#define BZ3_ERR_CRC -3
+#define BZ3_ERR_MALFORMED_HEADER -4
+#define BZ3_ERR_TRUNCATED_DATA -5
+/* Opaque to callers; the definition lives in libbz3.c. */
+struct block_encoder_state;
+
+struct encoding_result {
+    u8 * buffer; /* points into the state's own buffers; NULL on failure */
+    s32 size;    /* number of valid bytes in buffer; -1 on failure */
+};
+
+/**
+ * @brief Get the last status code recorded in a state (BZ3_OK or an error).
+ */
+s8 get_last_error(struct block_encoder_state * state);
+
+/**
+ * @brief Return a static, user-readable message for the state's last error.
+ */
+const char * str_last_error(struct block_encoder_state * state);
+
+/**
+ * @brief Get the input buffer associated with a given state. Fill it with
+ * data of length not exceeding the block size, then call commit_read() with
+ * the number of bytes stored to commit the read operation.
+ */
+u8 * get_buffer(struct block_encoder_state * state);
+
+/**
+ * @brief Commit the number of bytes placed in the buffer; returns -1 on error.
+ */
+s32 commit_read(struct block_encoder_state * state, s32 bytes_read);
+
+/**
+ * @brief Construct a new block encoder state; NULL if it cannot be allocated.
+ */
+struct block_encoder_state * new_block_encoder_state(s32 block_size);
+
+/**
+ * @brief Free a block encoder state together with the buffers it owns.
+ */
+void delete_block_encoder_state(struct block_encoder_state * state);
+
+/**
+ * @brief Read a block of data from provided file descriptor, put it in
+ * the input buffer and commit the read.
+ *
+ * @param filedes file descriptor to read the 24-byte header and payload from
+ * @param state state whose input buffer receives the block
+ * @return 0 at end of input, -1 on error, otherwise the number of bytes read
+ */
+s32 read_block(int filedes, struct block_encoder_state * state);
+
+/**
+ * @brief Encode the committed block; the result is { NULL, -1 } on failure.
+ */
+struct encoding_result encode_block(struct block_encoder_state * state);
+
+/**
+ * @brief Decode the committed block; the result is { NULL, -1 } on failure.
+ */
+struct encoding_result decode_block(struct block_encoder_state * state);
+
+#endif
diff --git a/src/libbz3.c b/src/libbz3.c
new file mode 100644
index 0000000..4eb772c
--- /dev/null
+++ b/src/libbz3.c
@@ -0,0 +1,197 @@
+
+#include "libbz3.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "cm.h"
+#include "common.h"
+#include "crc32.h"
+#include "libsais.h"
+#include "mtf.h"
+#include "rle.h"
+#include "srt.h"
+
+struct block_encoder_state {
+    u8 *buf1, *buf2; /* work buffers; buf1 is what get_buffer() returns */
+    s32 bytes_read, block_size; /* committed data length; block size limit */
+    s32 * sais_array;             /* work array passed to libsais */
+    struct srt_state * srt_state; /* state for srt_encode()/srt_decode() */
+    struct mtf_state * mtf_state; /* state for mtf_encode()/mtf_decode() */
+    state * cm_state;             /* state for begin()/encode_byte()/... */
+    s8 last_error;                /* BZ3_OK or the latest BZ3_ERR_* code */
+};
+/* Report the status code most recently recorded in the state. */
+s8 get_last_error(struct block_encoder_state * state) {
+    return state->last_error;
+}
+
+const char * str_last_error(struct block_encoder_state * state) {
+    switch (state->last_error) {
+        case BZ3_OK:
+            return "No error";
+        case BZ3_ERR_OUT_OF_BOUNDS:
+            return "Data index out of bounds";
+        case BZ3_ERR_BWT:
+            return "Burrows-Wheeler transform failed";
+        case BZ3_ERR_CRC:
+            return "CRC32 check failed";
+        case BZ3_ERR_MALFORMED_HEADER:
+            return "Malformed header";
+        case BZ3_ERR_TRUNCATED_DATA:
+            return "Truncated data";
+        default:
+            return "Unknown error";
+    }
+}
+/* Expose buf1, the buffer encode_block() and decode_block() read from. */
+u8 * get_buffer(struct block_encoder_state * state) { return state->buf1; }
+/* Commit the input length; values outside [0, block_size] are rejected. */
+s32 commit_read(struct block_encoder_state * state, s32 bytes_read) {
+    if (bytes_read < 0 || bytes_read > state->block_size) {
+        state->last_error = BZ3_ERR_OUT_OF_BOUNDS;
+        return -1;
+    }
+    state->last_error = BZ3_OK;
+    return state->bytes_read = bytes_read;
+}
+/* Create a state for blocks of up to block_size bytes; NULL on failure. */
+struct block_encoder_state * new_block_encoder_state(s32 block_size) {
+    /* Widen before adding: block_size + block_size / 3 may overflow s32. */
+    const size_t len = (size_t)block_size;
+    /* calloc() zeroes every member, so a half-built state can simply be
+     * passed to delete_block_encoder_state() when an allocation fails. */
+    struct block_encoder_state * st = calloc(1, sizeof *st);
+    if (!st) return NULL;
+    /* sizeof *member keeps each size tied to the member's own type. */
+    st->cm_state = malloc(sizeof *st->cm_state);
+    st->srt_state = malloc(sizeof *st->srt_state);
+    st->mtf_state = malloc(sizeof *st->mtf_state);
+    st->buf1 = malloc(len + len / 3);
+    st->buf2 = malloc(len + len / 3);
+    st->sais_array = malloc(len * sizeof(s32) + 16);
+    st->block_size = block_size;
+    st->last_error = BZ3_OK;
+    if (st->cm_state && st->srt_state && st->mtf_state && st->buf1 &&
+        st->buf2 && st->sais_array)
+        return st;
+    delete_block_encoder_state(st);
+    return NULL;
+}
+/* Release every buffer owned by the state, then the state itself. */
+void delete_block_encoder_state(struct block_encoder_state * state) {
+    free(state->cm_state);
+    free(state->mtf_state);
+    free(state->srt_state);
+    free(state->sais_array);
+    free(state->buf2);
+    free(state->buf1);
+    free(state);
+}
+/* Compress the bytes committed to buf1 into a header + payload in buf2. */
+struct encoding_result encode_block(struct block_encoder_state * state) {
+    /* Checksum the raw input now: buf1 is overwritten further down. */
+    const u32 checksum = crc32sum(1, state->buf1, state->bytes_read);
+    /* buf1 -> buf2 via mrlec(), then the BWT rewrites buf2 in place. */
+    const s32 rle_len = mrlec(state->buf1, state->bytes_read, state->buf2);
+    const s32 primary = libsais_bwt(state->buf2, state->buf2,
+                                    state->sais_array, rle_len, 16, NULL);
+    if (primary < 0) {
+        state->last_error = BZ3_ERR_BWT;
+        return (struct encoding_result){ NULL, -1 };
+    }
+
+    /* buf2 -> buf1; srt_len stays -1 when mtf_encode() handles the block. */
+    s32 srt_len = -1;
+    if (rle_len > MiB(3))
+        srt_len =
+            srt_encode(state->srt_state, state->buf2, state->buf1, rle_len);
+    else
+        mtf_encode(state->mtf_state, state->buf2, state->buf1, rle_len);
+
+    /* buf1 -> buf2 again, leaving room for the 24-byte header in front. */
+    const s32 coder_input = srt_len != -1 ? srt_len : rle_len;
+    begin(state->cm_state);
+    state->cm_state->out_queue = state->buf2 + 24;
+    state->cm_state->output_ptr = 0;
+    for (s32 pos = 0; pos < coder_input; pos++)
+        encode_byte(state->cm_state, state->buf1[pos]);
+    flush(state->cm_state);
+    const s32 packed_len = state->cm_state->output_ptr;
+
+    /* Six 32-bit fields in network byte order precede the payload. */
+    uint32_t * const header = (uint32_t *)state->buf2;
+    header[0] = htonl(checksum);
+    header[1] = htonl(state->bytes_read);
+    header[2] = htonl(primary);
+    header[3] = htonl(rle_len);
+    header[4] = htonl(srt_len);
+    header[5] = htonl(packed_len);
+    state->last_error = BZ3_OK;
+    return (struct encoding_result){ .buffer = state->buf2,
+                                     .size = 24 + packed_len };
+}
+
+struct encoding_result decode_block(struct block_encoder_state * state) {
+    u32 crc32;
+    s32 bwt_index, new_size, new_size2, new_size3;
+
+    crc32 = ntohl(((uint32_t *)state->buf1)[0]);
+    state->bytes_read = ntohl(((uint32_t *)state->buf1)[1]);
+    bwt_index = ntohl(((uint32_t *)state->buf1)[2]);
+    new_size = ntohl(((uint32_t *)state->buf1)[3]);
+    new_size2 = ntohl(((uint32_t *)state->buf1)[4]);
+    new_size3 = ntohl(((uint32_t *)state->buf1)[5]);
+
+    begin(state->cm_state);
+    state->cm_state->in_queue = state->buf1 + 24;
+    state->cm_state->input_ptr = 0;
+    state->cm_state->input_max = new_size3;
+    init(state->cm_state);
+    if (new_size2 != -1) {
+        for (s32 i = 0; i < new_size2; i++)
+            state->buf2[i] = decode_byte(state->cm_state);
+        srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
+    } else {
+        for (s32 i = 0; i < new_size; i++)
+            state->buf2[i] = decode_byte(state->cm_state);
+        mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
+    }
+    if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size,
+                      NULL, bwt_index) < 0) {
+        state->last_error = BZ3_ERR_BWT;
+        return (struct encoding_result){ NULL, -1 };
+    }
+    mrled(state->buf2, state->buf1, state->bytes_read);
+    if (crc32sum(1, state->buf1, state->bytes_read) != crc32) {
+        state->last_error = BZ3_ERR_CRC;
+        return (struct encoding_result){ .buffer = NULL, .size = -1 };
+    }
+    state->last_error = BZ3_OK;
+    return (struct encoding_result){ .buffer = state->buf1,
+                                     .size = state->bytes_read };
+}
+
+s32 read_block(int filedes, struct block_encoder_state * state) {
+    s32 metadata[6];
+    s32 bytes_read = read(filedes, state->buf1, 24);
+    if (bytes_read == 0) return 0;
+    if (bytes_read != 24) {
+        state->last_error = BZ3_ERR_MALFORMED_HEADER;
+        return -1;
+    }
+    s32 data_size = ntohl(((uint32_t *)state->buf1)[5]);
+    if (data_size > state->block_size) {
+        state->last_error = BZ3_ERR_MALFORMED_HEADER;
+        return -1;
+    }
+    bytes_read = read(filedes, state->buf1 + 24, data_size);
+    if (bytes_read != data_size) {
+        state->last_error = BZ3_ERR_TRUNCATED_DATA;
+        return -1;
+    }
+    state->last_error = BZ3_OK;
+    return state->bytes_read = 24 + data_size;
+}
diff --git a/src/main.c b/src/main.c
index a554d68..96767f6 100644
--- a/src/main.c
+++ b/src/main.c
@@ -26,102 +26,21 @@
 #include "cm.h"
 #include "common.h"
 #include "crc32.h"
+#include "libbz3.h"
 #include "libsais.h"
 #include "mtf.h"
 #include "rle.h"
 #include "srt.h"
 
-struct block_encoder_state {
-    s32 input_des, output_des;
-    u8 *buf1, *buf2;
-    s32 bytes_read;
-    s32 * sais_array;
-    struct srt_state * srt_state;
-    struct mtf_state * mtf_state;
-    state * cm_state;
-};
-
-void encode_block(struct block_encoder_state * state) {
-    u32 crc32 = crc32sum(1, state->buf1, state->bytes_read);
-
-    s32 new_size = mrlec(state->buf1, state->bytes_read, state->buf2);
-    s32 bwt_index = libsais_bwt(state->buf2, state->buf2, state->sais_array,
-                                new_size, 16, NULL);
-    s32 new_size2;
-
-    if (new_size > MiB(3)) {
-        new_size2 =
-            srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
-    } else {
-        new_size2 = -1;
-        mtf_encode(state->mtf_state, state->buf2, state->buf1, new_size);
-    }
-
-    begin(state->cm_state);
-    state->cm_state->out_queue = state->buf2;
-    state->cm_state->output_ptr = 0;
-    if (new_size2 != -1)
-        for (s32 i = 0; i < new_size2; i++)
-            encode_byte(state->cm_state, state->buf1[i]);
-    else
-        for (s32 i = 0; i < new_size; i++)
-            encode_byte(state->cm_state, state->buf1[i]);
-    flush(state->cm_state);
-    s32 new_size3 = state->cm_state->output_ptr;
-
-    write(state->output_des, &crc32, sizeof(u32));
-    write(state->output_des, &state->bytes_read, sizeof(s32));
-    write(state->output_des, &bwt_index, sizeof(s32));
-    write(state->output_des, &new_size, sizeof(s32));
-    write(state->output_des, &new_size2, sizeof(s32));
-    write(state->output_des, &new_size3, sizeof(s32));
-    write(state->output_des, state->buf2, new_size3);
-}
+int main(int argc, char * argv[]) {
+    // -1: decode, 0: unspecified, 1: encode, -2: test
+    int mode = 0;
 
-int decode_block(struct block_encoder_state * state, s8 test) {
-#define safe_read(fd, buf, size) \
-    if (read(fd, buf, size) != size) return 1;
-
-    u32 crc32;
-    s32 bwt_index, new_size, new_size2, new_size3;
-
-    safe_read(state->input_des, &crc32, sizeof(u32));
-    safe_read(state->input_des, &state->bytes_read, sizeof(s32));
-    safe_read(state->input_des, &bwt_index, sizeof(s32));
-    safe_read(state->input_des, &new_size, sizeof(s32));
-    safe_read(state->input_des, &new_size2, sizeof(s32));
-    safe_read(state->input_des, &new_size3, sizeof(s32));
-    safe_read(state->input_des, state->buf1, new_size3);
-
-    begin(state->cm_state);
-    state->cm_state->in_queue = state->buf1;
-    state->cm_state->input_ptr = 0;
-    state->cm_state->input_max = new_size3;
-    init(state->cm_state);
-    if (new_size2 != -1) {
-        for (s32 i = 0; i < new_size2; i++)
-            state->buf2[i] = decode_byte(state->cm_state);
-        srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
-    } else {
-        for (s32 i = 0; i < new_size; i++)
-            state->buf2[i] = decode_byte(state->cm_state);
-        mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
-    }
-    libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size, NULL,
-                  bwt_index);
-    mrled(state->buf2, state->buf1, state->bytes_read);
-    if (crc32sum(1, state->buf1, state->bytes_read) != crc32) {
-        fprintf(stderr, "CRC32 checksum mismatch.\n");
-        return 1;
-    }
-    if (!test) write(state->output_des, state->buf1, state->bytes_read);
-    return 0;
-}
+    // input and output file names
+    char *input = NULL, *output = NULL;
 
-int main(int argc, char * argv[]) {
-    int mode = 0;  // -1: encode, 0: unspecified, 1: encode, 2: test
-    char *input = NULL, *output = NULL;  // input and output file names
-    u32 block_size = MiB(8);    // the block size
+    // the block size
+    u32 block_size = MiB(8);
 
     for (int i = 1; i < argc; i++) {
         if (argv[i][0] == '-') {
@@ -180,17 +99,14 @@ int main(int argc, char * argv[]) {
         return 1;
     }
 
-    struct block_encoder_state block_encoder_state;
-    struct srt_state srt_state;
-    struct mtf_state mtf_state;
-    state cm_state;
-
-    block_encoder_state.cm_state = &cm_state;
-    block_encoder_state.srt_state = &srt_state;
-    block_encoder_state.mtf_state = &mtf_state;
+    struct block_encoder_state * block_encoder_state =
+        new_block_encoder_state(block_size);
 
-    block_encoder_state.input_des = input_des;
-    block_encoder_state.output_des = output_des;
+    if (get_last_error(block_encoder_state) != BZ3_OK) {
+        fprintf(stderr, "Failed to create block encoder state: %s\n",
+                str_last_error(block_encoder_state));
+        return 1;
+    }
 
     switch (mode) {
         case 1:
@@ -210,7 +126,9 @@ int main(int argc, char * argv[]) {
             read(input_des, &block_size, sizeof(u32));
 
             if (block_size < KiB(65) || block_size > MiB(2047)) {
-                fprintf(stderr, "The input file is corrupted. Reason: Invalid block size in the header.\n");
+                fprintf(stderr,
+                        "The input file is corrupted. Reason: Invalid block "
+                        "size in the header.\n");
                 return 1;
             }
 
@@ -218,24 +136,63 @@ int main(int argc, char * argv[]) {
         }
     }
 
-    block_encoder_state.buf1 = malloc(block_size + block_size / 3);
-    block_encoder_state.buf2 = malloc(block_size + block_size / 3);
-    block_encoder_state.sais_array = malloc(block_size * sizeof(s32) + 16);
-
     if (mode == 1)
-        while ((block_encoder_state.bytes_read =
-                    read(input_des, block_encoder_state.buf1, block_size)) > 0)
-            encode_block(&block_encoder_state);
-    else if (mode == -1)
-        while (decode_block(&block_encoder_state, 0) == 0)
-            ;
-    else if (mode == -2)
-        while (decode_block(&block_encoder_state, 1) == 0)
-            ;
-
-    free(block_encoder_state.buf1);
-    free(block_encoder_state.buf2);
-    free(block_encoder_state.sais_array);
+        while (commit_read(block_encoder_state,
+                           read(input_des, get_buffer(block_encoder_state),
+                                block_size)) > 0) {
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to read data: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+            struct encoding_result r = encode_block(block_encoder_state);
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to encode the block: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+            write(output_des, r.buffer, r.size);
+        }
+    else if (mode == -1) {
+        s32 read_size;
+        while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to read data: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+            struct encoding_result r = decode_block(block_encoder_state);
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to encode the block: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+            write(output_des, r.buffer, r.size);
+        }
+    } else if (mode == -2) {
+        s32 read_size;
+        while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to read data: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+            decode_block(block_encoder_state);
+            if (get_last_error(block_encoder_state) != BZ3_OK) {
+                fprintf(stderr, "Failed to decode data: %s\n",
+                        str_last_error(block_encoder_state));
+                return 1;
+            }
+        }
+    }
+
+    if (get_last_error(block_encoder_state) != BZ3_OK) {
+        fprintf(stderr, "Failed to read data: %s\n",
+                str_last_error(block_encoder_state));
+        return 1;
+    }
+
+    delete_block_encoder_state(block_encoder_state);
 
     close(input_des);
     close(output_des);
tab: 248 wrap: offon