libbz3 barebones
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 7adac3d..edb7021 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,6 +3,8 @@
"srt.h": "c",
"rle.h": "c",
"cm.h": "c",
- "stdint.h": "c"
+ "stdint.h": "c",
+ "common.h": "c",
+ "libsais.h": "c"
}
}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index cf0f0d2..ab7b001 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
.PHONY: all clean format install
OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
- obj/cm.o
+ obj/cm.o obj/libbz3.o
all: bzip3
diff --git a/include/libbz3.h b/include/libbz3.h
new file mode 100644
index 0000000..10cdde3
--- /dev/null
+++ b/include/libbz3.h
@@ -0,0 +1,79 @@
+
+#ifndef _LIBBZ3_H
+#define _LIBBZ3_H
+
+#include "cm.h"
+#include "common.h"
+#include "crc32.h"
+#include "libsais.h"
+#include "mtf.h"
+#include "rle.h"
+#include "srt.h"
+
+/*
+ * Status codes recorded in a block_encoder_state and reported by
+ * get_last_error(). Negative values are parenthesized so the macros expand
+ * safely inside any surrounding expression.
+ */
+#define BZ3_OK 0
+#define BZ3_ERR_OUT_OF_BOUNDS (-1)
+#define BZ3_ERR_BWT (-2)
+#define BZ3_ERR_CRC (-3)
+#define BZ3_ERR_MALFORMED_HEADER (-4)
+#define BZ3_ERR_TRUNCATED_DATA (-5)
+
+/* Opaque codec state; only a forward declaration is exposed by this header. */
+struct block_encoder_state;
+
+/*
+ * Value returned by encode_block() and decode_block().
+ * On success `buffer` points at a buffer that belongs to the state and `size`
+ * is the number of valid bytes in it; on failure the functions return
+ * { NULL, -1 }.
+ */
+struct encoding_result {
+ u8 * buffer;
+ s32 size;
+};
+
+/**
+ * @brief Get the last error number associated with a given state.
+ *
+ * @param state Codec state to query; it is dereferenced unconditionally.
+ * @return The BZ3_* status code most recently recorded in @p state.
+ */
+s8 get_last_error(struct block_encoder_state * state);
+
+/**
+ * @brief Return a user-readable message explaining the cause of the error.
+ *
+ * @param state Codec state whose recorded status code is described.
+ * @return A constant string literal (do not free or modify it); codes that
+ *         are not one of the BZ3_* values yield "Unknown error".
+ */
+const char * str_last_error(struct block_encoder_state * state);
+
+/**
+ * @brief Get the input buffer associated with given state. Fill it with data
+ * of length not exceeding the block size and call commit_read() to commit
+ * the read operation with the number of bytes read.
+ *
+ * @param state Codec state that owns the buffer.
+ * @return Pointer to the state's input buffer; the buffer remains owned by
+ *         the state and is released by delete_block_encoder_state().
+ */
+u8 * get_buffer(struct block_encoder_state * state);
+
+/**
+ * @brief Commit the amount of bytes inserted into the buffer.
+ *
+ * @param state      Codec state whose input buffer was filled.
+ * @param bytes_read Number of bytes placed in the buffer from get_buffer().
+ * @return The committed byte count, or -1 with BZ3_ERR_OUT_OF_BOUNDS recorded
+ *         in the state when @p bytes_read exceeds the block size.
+ */
+s32 commit_read(struct block_encoder_state * state, s32 bytes_read);
+
+/**
+ * @brief Construct a new block encoder state.
+ *
+ * @param block_size Maximum number of bytes per block; commit_read() rejects
+ *                   larger counts and the internal buffers are sized from it.
+ * @return A heap-allocated state to be released with
+ *         delete_block_encoder_state(), or NULL if the state could not be
+ *         allocated.
+ */
+struct block_encoder_state * new_block_encoder_state(s32 block_size);
+
+/**
+ * @brief Free the memory occupied by a block encoder state.
+ *
+ * Releases the state's buffers, its helper states and the state structure
+ * itself. Buffers previously obtained through get_buffer() or an
+ * encoding_result must not be used afterwards.
+ *
+ * @param state State created by new_block_encoder_state().
+ */
+void delete_block_encoder_state(struct block_encoder_state * state);
+
+/**
+ * @brief Read a block of data from provided file descriptor, put it in
+ * the input buffer and commit the read.
+ *
+ * A block consists of a 24-byte header followed by the payload whose length
+ * is stored in the header.
+ *
+ * @param filedes File descriptor to read the encoded block from.
+ * @param state   Codec state whose input buffer receives the block.
+ * @return Total number of bytes stored (24 + payload length) on success,
+ *         0 when no more data is available, or -1 on failure with
+ *         BZ3_ERR_MALFORMED_HEADER or BZ3_ERR_TRUNCATED_DATA recorded in
+ *         the state.
+ */
+s32 read_block(int filedes, struct block_encoder_state * state);
+
+/**
+ * @brief Encode a single block.
+ *
+ * Encodes the bytes committed to the state's input buffer (see get_buffer()
+ * and commit_read()).
+ *
+ * @param state Codec state holding the committed input.
+ * @return On success, a result whose buffer belongs to the state and holds a
+ *         24-byte header followed by the compressed payload, with size set
+ *         to the total length. On failure { NULL, -1 } is returned and the
+ *         cause is available through get_last_error().
+ */
+struct encoding_result encode_block(struct block_encoder_state * state);
+
+/**
+ * @brief Decode a single block.
+ *
+ * Expects the state's input buffer to hold one encoded block (24-byte header
+ * plus payload), e.g. as loaded by read_block().
+ *
+ * @param state Codec state holding the encoded block.
+ * @return On success, a result whose buffer belongs to the state and holds
+ *         the decoded bytes, with size set to their count. On failure
+ *         { NULL, -1 } is returned and the cause (for instance BZ3_ERR_BWT
+ *         or BZ3_ERR_CRC) is available through get_last_error().
+ */
+struct encoding_result decode_block(struct block_encoder_state * state);
+
+#endif
diff --git a/src/libbz3.c b/src/libbz3.c
new file mode 100644
index 0000000..4eb772c
--- /dev/null
+++ b/src/libbz3.c
@@ -0,0 +1,197 @@
+
+#include "libbz3.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "cm.h"
+#include "common.h"
+#include "crc32.h"
+#include "libsais.h"
+#include "mtf.h"
+#include "rle.h"
+#include "srt.h"
+
+/* Working state shared by the block encoder and decoder entry points. */
+struct block_encoder_state {
+ /* buf1 is the buffer handed out by get_buffer(); buf2 is the second working
+ * buffer. encode_block() returns its result in buf2, decode_block() in
+ * buf1. */
+ u8 *buf1, *buf2;
+ /* bytes_read: byte count set by commit_read()/read_block() (decode_block()
+ * overwrites it with the size taken from the block header);
+ * block_size: upper bound enforced by commit_read(). */
+ s32 bytes_read, block_size;
+ /* Work array passed to libsais_bwt() / libsais_unbwt(). */
+ s32 * sais_array;
+ /* Helper states passed to the srt_*, mtf_* and context-model routines. */
+ struct srt_state * srt_state;
+ struct mtf_state * mtf_state;
+ state * cm_state;
+ /* Most recent BZ3_* status code, reported by get_last_error(). */
+ s8 last_error;
+};
+
+/* Report the BZ3_* status code most recently recorded in `state`. */
+s8 get_last_error(struct block_encoder_state * state) {
+    const s8 code = state->last_error;
+    return code;
+}
+
+/*
+ * Translate the status code recorded in `state` into a constant,
+ * human-readable message. Codes outside the BZ3_* set map to
+ * "Unknown error".
+ */
+const char * str_last_error(struct block_encoder_state * state) {
+    const s8 code = state->last_error;
+
+    if (code == BZ3_OK) return "No error";
+    if (code == BZ3_ERR_OUT_OF_BOUNDS) return "Data index out of bounds";
+    if (code == BZ3_ERR_BWT) return "Burrows-Wheeler transform failed";
+    if (code == BZ3_ERR_CRC) return "CRC32 check failed";
+    if (code == BZ3_ERR_MALFORMED_HEADER) return "Malformed header";
+    if (code == BZ3_ERR_TRUNCATED_DATA) return "Truncated data";
+
+    return "Unknown error";
+}
+
+/* Expose the state's input buffer (buf1) so the caller can fill it. */
+u8 * get_buffer(struct block_encoder_state * state) {
+    u8 * input = state->buf1;
+    return input;
+}
+
+/*
+ * Record how many bytes the caller placed in the input buffer.
+ *
+ * Returns the committed count on success. A count that is negative (for
+ * example a failed read() forwarded by the caller) or larger than the block
+ * size is rejected: BZ3_ERR_OUT_OF_BOUNDS is recorded, the previously
+ * committed count is left untouched and -1 is returned.
+ */
+s32 commit_read(struct block_encoder_state * state, s32 bytes_read) {
+    if (bytes_read < 0 || bytes_read > state->block_size) {
+        state->last_error = BZ3_ERR_OUT_OF_BOUNDS;
+        return -1;
+    }
+
+    state->bytes_read = bytes_read;
+    state->last_error = BZ3_OK;
+    return bytes_read;
+}
+
+/*
+ * Allocate and initialise a codec state for blocks of up to `block_size`
+ * bytes.
+ *
+ * Returns NULL when `block_size` is not positive or when any allocation
+ * fails; in that case nothing is leaked. A successfully created state must
+ * be released with delete_block_encoder_state().
+ */
+struct block_encoder_state * new_block_encoder_state(s32 block_size) {
+    if (block_size <= 0) {
+        return NULL;
+    }
+
+    /* Sizes are computed in size_t: block_size + block_size / 3 does not fit
+     * in s32 for block sizes close to 2 GiB. */
+    const size_t buffer_size = (size_t)block_size + (size_t)block_size / 3;
+
+    /* libsais_bwt() is called with fs = 16, i.e. it may use n + 16 array
+     * *elements*, where n is bounded by what fits in the data buffers. */
+    const size_t sais_len = buffer_size + 16;
+    if (sais_len > (size_t)-1 / sizeof(s32)) {
+        return NULL;
+    }
+
+    struct block_encoder_state * bz3 = malloc(sizeof *bz3);
+    if (!bz3) {
+        return NULL;
+    }
+
+    /* sizeof *member keeps each allocation tied to the member's own type. */
+    bz3->cm_state = malloc(sizeof *bz3->cm_state);
+    bz3->srt_state = malloc(sizeof *bz3->srt_state);
+    bz3->mtf_state = malloc(sizeof *bz3->mtf_state);
+    bz3->buf1 = malloc(buffer_size);
+    bz3->buf2 = malloc(buffer_size);
+    bz3->sais_array = malloc(sais_len * sizeof *bz3->sais_array);
+
+    if (!bz3->cm_state || !bz3->srt_state || !bz3->mtf_state || !bz3->buf1 ||
+        !bz3->buf2 || !bz3->sais_array) {
+        /* free(NULL) is a no-op, so a partial failure unwinds cleanly. */
+        free(bz3->sais_array);
+        free(bz3->buf2);
+        free(bz3->buf1);
+        free(bz3->mtf_state);
+        free(bz3->srt_state);
+        free(bz3->cm_state);
+        free(bz3);
+        return NULL;
+    }
+
+    bz3->block_size = block_size;
+    bz3->bytes_read = 0;
+    bz3->last_error = BZ3_OK;
+
+    return bz3;
+}
+
+/*
+ * Release a state created by new_block_encoder_state(), including all of
+ * its buffers and helper states. Like free(), passing NULL is a no-op, so
+ * callers can clean up unconditionally after a failed construction.
+ */
+void delete_block_encoder_state(struct block_encoder_state * state) {
+    if (!state) {
+        return;
+    }
+
+    free(state->buf1);
+    free(state->buf2);
+    free(state->sais_array);
+    free(state->srt_state);
+    free(state->mtf_state);
+    free(state->cm_state);
+    free(state);
+}
+
+/*
+ * Encode the bytes committed to buf1.
+ *
+ * Pipeline: CRC32 of the input, mrlec() from buf1 into buf2, in-place
+ * libsais_bwt() on buf2, then srt_encode() (inputs larger than 3 MiB) or
+ * mtf_encode() back into buf1, and finally the context model, which writes
+ * its output after a 24-byte header slot in buf2.
+ *
+ * Returns { buf2, 24 + compressed size } on success, or { NULL, -1 } with
+ * BZ3_ERR_BWT recorded when the Burrows-Wheeler transform fails.
+ */
+struct encoding_result encode_block(struct block_encoder_state * state) {
+    const u32 checksum = crc32sum(1, state->buf1, state->bytes_read);
+
+    const s32 rle_size = mrlec(state->buf1, state->bytes_read, state->buf2);
+    const s32 bwt_index = libsais_bwt(state->buf2, state->buf2,
+                                      state->sais_array, rle_size, 16, NULL);
+    if (bwt_index < 0) {
+        state->last_error = BZ3_ERR_BWT;
+        return (struct encoding_result){ NULL, -1 };
+    }
+
+    /* -1 marks "MTF was used"; otherwise this holds the SRT output size. */
+    s32 srt_size = -1;
+    if (rle_size > MiB(3)) {
+        srt_size =
+            srt_encode(state->srt_state, state->buf2, state->buf1, rle_size);
+    } else {
+        mtf_encode(state->mtf_state, state->buf2, state->buf1, rle_size);
+    }
+
+    /* The context model consumes whichever transform output is in buf1. */
+    const s32 cm_input_size = (srt_size != -1) ? srt_size : rle_size;
+
+    begin(state->cm_state);
+    state->cm_state->out_queue = state->buf2 + 24;
+    state->cm_state->output_ptr = 0;
+    for (s32 pos = 0; pos < cm_input_size; pos++) {
+        encode_byte(state->cm_state, state->buf1[pos]);
+    }
+    flush(state->cm_state);
+    const s32 cm_size = state->cm_state->output_ptr;
+
+    /* Six big-endian 32-bit fields form the 24-byte block header. */
+    uint32_t * header = (uint32_t *)state->buf2;
+    header[0] = htonl(checksum);
+    header[1] = htonl(state->bytes_read);
+    header[2] = htonl(bwt_index);
+    header[3] = htonl(rle_size);
+    header[4] = htonl(srt_size);
+    header[5] = htonl(cm_size);
+
+    state->last_error = BZ3_OK;
+
+    struct encoding_result result;
+    result.buffer = state->buf2;
+    result.size = 24 + cm_size;
+    return result;
+}
+
+/*
+ * Decode the block stored in buf1 (24-byte header followed by the payload).
+ *
+ * The header comes from untrusted input, so every size field is validated
+ * against the capacity of the state's data buffers before it is used as a
+ * loop bound or length.
+ *
+ * Returns { buf1, decoded size } on success. On failure returns { NULL, -1 }
+ * and records BZ3_ERR_MALFORMED_HEADER (size field out of range),
+ * BZ3_ERR_BWT (inverse transform failed) or BZ3_ERR_CRC (checksum mismatch).
+ */
+struct encoding_result decode_block(struct block_encoder_state * state) {
+    const uint32_t * header = (const uint32_t *)state->buf1;
+
+    const u32 expected_crc = ntohl(header[0]);
+    const s32 orig_size = (s32)ntohl(header[1]);
+    const s32 bwt_index = (s32)ntohl(header[2]);
+    const s32 new_size = (s32)ntohl(header[3]);
+    const s32 new_size2 = (s32)ntohl(header[4]);
+    const s32 new_size3 = (s32)ntohl(header[5]);
+
+    /* Capacity of buf1/buf2 as allocated by new_block_encoder_state();
+     * computed in a wider type so large block sizes cannot overflow. */
+    const long long capacity =
+        (long long)state->block_size + state->block_size / 3;
+
+    const int sizes_ok =
+        orig_size >= 0 && orig_size <= capacity &&
+        new_size >= 0 && new_size <= capacity &&
+        (new_size2 == -1 || (new_size2 >= 0 && new_size2 <= capacity)) &&
+        new_size3 >= 0 && new_size3 <= capacity - 24;
+    if (!sizes_ok) {
+        state->last_error = BZ3_ERR_MALFORMED_HEADER;
+        return (struct encoding_result){ .buffer = NULL, .size = -1 };
+    }
+
+    state->bytes_read = orig_size;
+
+    begin(state->cm_state);
+    state->cm_state->in_queue = state->buf1 + 24;
+    state->cm_state->input_ptr = 0;
+    state->cm_state->input_max = new_size3;
+    init(state->cm_state);
+
+    /* new_size2 == -1 means the encoder used MTF instead of SRT. */
+    if (new_size2 != -1) {
+        for (s32 i = 0; i < new_size2; i++) {
+            state->buf2[i] = decode_byte(state->cm_state);
+        }
+        srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
+    } else {
+        for (s32 i = 0; i < new_size; i++) {
+            state->buf2[i] = decode_byte(state->cm_state);
+        }
+        mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
+    }
+
+    if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size,
+                      NULL, bwt_index) < 0) {
+        state->last_error = BZ3_ERR_BWT;
+        return (struct encoding_result){ .buffer = NULL, .size = -1 };
+    }
+
+    mrled(state->buf2, state->buf1, state->bytes_read);
+
+    if (crc32sum(1, state->buf1, state->bytes_read) != expected_crc) {
+        state->last_error = BZ3_ERR_CRC;
+        return (struct encoding_result){ .buffer = NULL, .size = -1 };
+    }
+
+    state->last_error = BZ3_OK;
+    return (struct encoding_result){ .buffer = state->buf1,
+                                     .size = state->bytes_read };
+}
+
+/*
+ * Read up to `count` bytes, retrying after short reads (pipes and sockets
+ * may deliver fewer bytes than requested). Returns the number of bytes
+ * read, which is less than `count` only at end of file, or -1 if read()
+ * reports an error.
+ */
+static s32 read_fully(int filedes, u8 * dst, s32 count) {
+    s32 total = 0;
+    while (total < count) {
+        ssize_t got = read(filedes, dst + total, (size_t)(count - total));
+        if (got < 0) {
+            return -1;
+        }
+        if (got == 0) {
+            break;
+        }
+        total += (s32)got;
+    }
+    return total;
+}
+
+/*
+ * Load one encoded block (24-byte header plus payload) from `filedes` into
+ * the state's input buffer and commit it.
+ *
+ * Returns 24 + payload length on success and 0 when the stream ends before
+ * any header byte is read. Returns -1 on failure, recording
+ * BZ3_ERR_MALFORMED_HEADER (unreadable or incomplete header, or a payload
+ * length that is negative or exceeds the block size) or
+ * BZ3_ERR_TRUNCATED_DATA (payload shorter than the header announced).
+ */
+s32 read_block(int filedes, struct block_encoder_state * state) {
+    s32 header_bytes = read_fully(filedes, state->buf1, 24);
+    if (header_bytes == 0) {
+        return 0;
+    }
+    if (header_bytes != 24) {
+        state->last_error = BZ3_ERR_MALFORMED_HEADER;
+        return -1;
+    }
+
+    /* The sixth header field is the payload length. It must be checked for
+     * negative values too: converted to size_t for read(), a negative length
+     * would become an enormous byte count. */
+    s32 data_size = (s32)ntohl(((uint32_t *)state->buf1)[5]);
+    if (data_size < 0 || data_size > state->block_size) {
+        state->last_error = BZ3_ERR_MALFORMED_HEADER;
+        return -1;
+    }
+
+    if (read_fully(filedes, state->buf1 + 24, data_size) != data_size) {
+        state->last_error = BZ3_ERR_TRUNCATED_DATA;
+        return -1;
+    }
+
+    state->last_error = BZ3_OK;
+    return state->bytes_read = 24 + data_size;
+}
diff --git a/src/main.c b/src/main.c
index a554d68..96767f6 100644
--- a/src/main.c
+++ b/src/main.c
@@ -26,102 +26,21 @@
#include "cm.h"
#include "common.h"
#include "crc32.h"
+#include "libbz3.h"
#include "libsais.h"
#include "mtf.h"
#include "rle.h"
#include "srt.h"
-struct block_encoder_state {
- s32 input_des, output_des;
- u8 *buf1, *buf2;
- s32 bytes_read;
- s32 * sais_array;
- struct srt_state * srt_state;
- struct mtf_state * mtf_state;
- state * cm_state;
-};
-
-void encode_block(struct block_encoder_state * state) {
- u32 crc32 = crc32sum(1, state->buf1, state->bytes_read);
-
- s32 new_size = mrlec(state->buf1, state->bytes_read, state->buf2);
- s32 bwt_index = libsais_bwt(state->buf2, state->buf2, state->sais_array,
- new_size, 16, NULL);
- s32 new_size2;
-
- if (new_size > MiB(3)) {
- new_size2 =
- srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
- } else {
- new_size2 = -1;
- mtf_encode(state->mtf_state, state->buf2, state->buf1, new_size);
- }
-
- begin(state->cm_state);
- state->cm_state->out_queue = state->buf2;
- state->cm_state->output_ptr = 0;
- if (new_size2 != -1)
- for (s32 i = 0; i < new_size2; i++)
- encode_byte(state->cm_state, state->buf1[i]);
- else
- for (s32 i = 0; i < new_size; i++)
- encode_byte(state->cm_state, state->buf1[i]);
- flush(state->cm_state);
- s32 new_size3 = state->cm_state->output_ptr;
-
- write(state->output_des, &crc32, sizeof(u32));
- write(state->output_des, &state->bytes_read, sizeof(s32));
- write(state->output_des, &bwt_index, sizeof(s32));
- write(state->output_des, &new_size, sizeof(s32));
- write(state->output_des, &new_size2, sizeof(s32));
- write(state->output_des, &new_size3, sizeof(s32));
- write(state->output_des, state->buf2, new_size3);
-}
+int main(int argc, char * argv[]) {
+ // -1: decode, 0: unspecified, 1: encode, -2: test
+ int mode = 0;
-int decode_block(struct block_encoder_state * state, s8 test) {
-#define safe_read(fd, buf, size) \
- if (read(fd, buf, size) != size) return 1;
-
- u32 crc32;
- s32 bwt_index, new_size, new_size2, new_size3;
-
- safe_read(state->input_des, &crc32, sizeof(u32));
- safe_read(state->input_des, &state->bytes_read, sizeof(s32));
- safe_read(state->input_des, &bwt_index, sizeof(s32));
- safe_read(state->input_des, &new_size, sizeof(s32));
- safe_read(state->input_des, &new_size2, sizeof(s32));
- safe_read(state->input_des, &new_size3, sizeof(s32));
- safe_read(state->input_des, state->buf1, new_size3);
-
- begin(state->cm_state);
- state->cm_state->in_queue = state->buf1;
- state->cm_state->input_ptr = 0;
- state->cm_state->input_max = new_size3;
- init(state->cm_state);
- if (new_size2 != -1) {
- for (s32 i = 0; i < new_size2; i++)
- state->buf2[i] = decode_byte(state->cm_state);
- srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
- } else {
- for (s32 i = 0; i < new_size; i++)
- state->buf2[i] = decode_byte(state->cm_state);
- mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
- }
- libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size, NULL,
- bwt_index);
- mrled(state->buf2, state->buf1, state->bytes_read);
- if (crc32sum(1, state->buf1, state->bytes_read) != crc32) {
- fprintf(stderr, "CRC32 checksum mismatch.\n");
- return 1;
- }
- if (!test) write(state->output_des, state->buf1, state->bytes_read);
- return 0;
-}
+ // input and output file names
+ char *input = NULL, *output = NULL;
-int main(int argc, char * argv[]) {
- int mode = 0; // -1: encode, 0: unspecified, 1: encode, 2: test
- char *input = NULL, *output = NULL; // input and output file names
- u32 block_size = MiB(8); // the block size
+ // the block size
+ u32 block_size = MiB(8);
for (int i = 1; i < argc; i++) {
if (argv[i][0] == '-') {
@@ -180,17 +99,14 @@ int main(int argc, char * argv[]) {
return 1;
}
- struct block_encoder_state block_encoder_state;
- struct srt_state srt_state;
- struct mtf_state mtf_state;
- state cm_state;
-
- block_encoder_state.cm_state = &cm_state;
- block_encoder_state.srt_state = &srt_state;
- block_encoder_state.mtf_state = &mtf_state;
+ struct block_encoder_state * block_encoder_state =
+ new_block_encoder_state(block_size);
- block_encoder_state.input_des = input_des;
- block_encoder_state.output_des = output_des;
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to create block encoder state: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
switch (mode) {
case 1:
@@ -210,7 +126,9 @@ int main(int argc, char * argv[]) {
read(input_des, &block_size, sizeof(u32));
if (block_size < KiB(65) || block_size > MiB(2047)) {
- fprintf(stderr, "The input file is corrupted. Reason: Invalid block size in the header.\n");
+ fprintf(stderr,
+ "The input file is corrupted. Reason: Invalid block "
+ "size in the header.\n");
return 1;
}
@@ -218,24 +136,63 @@ int main(int argc, char * argv[]) {
}
}
- block_encoder_state.buf1 = malloc(block_size + block_size / 3);
- block_encoder_state.buf2 = malloc(block_size + block_size / 3);
- block_encoder_state.sais_array = malloc(block_size * sizeof(s32) + 16);
-
if (mode == 1)
- while ((block_encoder_state.bytes_read =
- read(input_des, block_encoder_state.buf1, block_size)) > 0)
- encode_block(&block_encoder_state);
- else if (mode == -1)
- while (decode_block(&block_encoder_state, 0) == 0)
- ;
- else if (mode == -2)
- while (decode_block(&block_encoder_state, 1) == 0)
- ;
-
- free(block_encoder_state.buf1);
- free(block_encoder_state.buf2);
- free(block_encoder_state.sais_array);
+ while (commit_read(block_encoder_state,
+ read(input_des, get_buffer(block_encoder_state),
+ block_size)) > 0) {
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to read data: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ struct encoding_result r = encode_block(block_encoder_state);
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to encode the block: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ write(output_des, r.buffer, r.size);
+ }
+ else if (mode == -1) {
+ s32 read_size;
+ while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to read data: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ struct encoding_result r = decode_block(block_encoder_state);
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to encode the block: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ write(output_des, r.buffer, r.size);
+ }
+ } else if (mode == -2) {
+ s32 read_size;
+ while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to read data: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ decode_block(block_encoder_state);
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to decode data: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+ }
+ }
+
+ if (get_last_error(block_encoder_state) != BZ3_OK) {
+ fprintf(stderr, "Failed to read data: %s\n",
+ str_last_error(block_encoder_state));
+ return 1;
+ }
+
+ delete_block_encoder_state(block_encoder_state);
close(input_des);
close(output_des);
