| 1 | # BZip3 Format Specification |
| 2 |
|
| 3 | Version 1 |
| 4 |
|
| 5 | ## Headers |
| 6 |
|
| 7 | The File and Frame formats share a similar structure, differing only in whether they include a |
| 8 | block count field. |
| 9 |
|
| 10 | ### File Header |
| 11 |
|
| 12 | ``` |
| 13 | +----------------+------------------+--------------------+ |
| 14 | | Header | Chunk 1 | Chunk 2 | |
| 15 | | (9 bytes) | (variable size) | (variable size) | |
| 16 | +----------------+------------------+--------------------+ |
| 17 | ``` |
| 18 |
|
| 19 | This format is produced by the `bzip3` command-line tool. |
| 20 |
|
| 21 | ### Frame Header |
| 22 |
|
| 23 | ``` |
| 24 | +----------------+------------------+--------------------+ |
| 25 | | Header | Chunk 1 | Chunk 2 | |
| 26 | | (13 bytes) | (variable size) | (variable size) | |
| 27 | +----------------+------------------+--------------------+ |
| 28 | ``` |
| 29 |
|
| 30 | This format is written by `bz3_compress` and read by `bz3_decompress`. |
| 31 |
|
| 32 | ### Header Structure |
| 33 |
|
| 34 | | Field | Type | Description | File Header | Frame Header | |
| 35 | | -------------- | ------ | ------------------------------- | ----------- | ------------ | |
| 36 | | Signature | u8[5] | Fixed "BZ3v1" ASCII string | ✓ | ✓ | |
| 37 | | Max Block Size | u32_le | Maximum decompressed block size | ✓ | ✓ | |
| 38 | | Block Count | u32_le | Number of blocks in the stream | ✗ | ✓ | |
| 39 |
|
| 40 | ### Validation Rules |
| 41 |
|
| 42 | 1. **Signature**: Must exactly match "BZ3v1" |
| 43 | 2. **Max Block Size**: |
| 44 | - Minimum: 65KiB (66,560 bytes) |
| 45 | - Maximum: 511MiB (535,822,336 bytes) |
| 46 | 3. **Block Count** (Frame Format only): |
| 47 | - Must match the actual number of blocks in the stream |
| 48 | - Should be greater than 0 |
| 49 |
|
| 50 | ### Example Parser |
| 51 |
|
| 52 | ```c |
| 53 | typedef struct { /* Header fields filled in by read_bzip3_header */ |
| 54 | uint32_t max_block_size; /* Maximum decompressed block size */ |
| 55 | uint32_t block_count; // Frame Format only: number of blocks in the stream |
| 56 | } bzip3_header_t; |
| 57 |
|
| 58 | bool read_bzip3_header(FILE* fp, bzip3_header_t* header, bool is_frame_format) { |
| 59 | // Reads and validates a header from fp into *header; returns false on a short read or any invalid field. |
| 60 | char signature[6] = {0}; // 5 signature bytes + NUL terminator so strcmp can be used |
| 61 | // Read signature |
| 62 | if (fread(signature, 1, 5, fp) != 5) |
| 63 | return false; |
| 64 | |
| 65 | if (strcmp(signature, "BZ3v1") != 0) |
| 66 | return false; |
| 67 | |
| 68 | // Read max block size |
| 69 | uint8_t size_bytes[4]; |
| 70 | if (fread(size_bytes, 1, 4, fp) != 4) |
| 71 | return false; |
| 72 | |
| 73 | header->max_block_size = read_neutral_s32(size_bytes); |
| 74 | // Valid range is 65 KiB (66,560 bytes) to 511 MiB (535,822,336 bytes), inclusive |
| 75 | if (header->max_block_size < 66560 || |
| 76 | header->max_block_size > 535822336) |
| 77 | return false; |
| 78 | |
| 79 | // Read block count if Frame Format (left at 0 for the File Format, which has no such field) |
| 80 | header->block_count = 0; |
| 81 | if (is_frame_format) { |
| 82 | uint8_t count_bytes[4]; |
| 83 | if (fread(count_bytes, 1, 4, fp) != 4) |
| 84 | return false; |
| 85 | |
| 86 | header->block_count = read_neutral_s32(count_bytes); |
| 87 | if (header->block_count == 0) |
| 88 | return false; |
| 89 | } |
| 90 | |
| 91 | return true; |
| 92 | } |
| 93 | ``` |
| 94 |
|
| 95 | All integers in BZip3 are stored unaligned, in little-endian byte order. |
| 96 | A portable implementation of the helpers that read and write them is shown below. |
| 97 |
|
| 98 | ```c |
| 99 | // Decode a little-endian 32-bit integer from four bytes at any alignment |
| 100 | static s32 read_neutral_s32(u8 * data) { |
| 101 | u32 value = 0; |
| 102 | for (int i = 3; i >= 0; i--) // fold in bytes from most to least significant |
| 103 | value = (value << 8) | (u32)data[i]; |
| 104 | return value; |
| 105 | } |
| 106 |
|
| 107 | // Writing a 32-bit integer as four little-endian bytes (no alignment required) |
| 108 | static void write_neutral_s32(u8 * data, s32 value) { |
| 109 | data[0] = (u32)value & 0xFF; // cast to unsigned first: >> on a negative s32 is implementation-defined |
| 110 | data[1] = ((u32)value >> 8) & 0xFF; |
| 111 | data[2] = ((u32)value >> 16) & 0xFF; |
| 112 | data[3] = ((u32)value >> 24) & 0xFF; |
| 113 | } |
| 114 | ``` |
| 115 |
|
| 116 | ## Block Format |
| 117 |
|
| 118 | After the header, both File and Frame formats contain a sequence of blocks that follow the Block |
| 119 | Format specification. Each block is encapsulated in a chunk structure that defines its size. |
| 120 |
|
| 121 | The blocks (***without chunk header***) can be encoded/decoded using the `bz3_encode_block` |
| 122 | and `bz3_decode_block` APIs. |
| 123 |
|
| 124 | ### Chunk Structure |
| 125 |
|
| 126 | ```c |
| 127 | // Chunk: two size fields followed by the block they describe |
| 128 | struct Chunk { |
| 129 | u32_le compressedSize; // Size in bytes of `block` below; the two size fields here are not counted |
| 130 | u32_le origSize; // Original uncompressed size |
| 131 | |
| 132 | if (origSize < 64) { /* below 64 bytes the data is stored uncompressed */ |
| 133 | SmallBlock block; |
| 134 | } else { |
| 135 | Block block; |
| 136 | } |
| 137 | }; |
| 138 | ``` |
| 139 |
|
| 140 | ### Small Block Format (< 64 bytes) |
| 141 |
|
| 142 | For blocks smaller than 64 bytes, no compression is attempted. The data is stored as-is, preceded only by a checksum and a fixed marker value: |
| 143 |
|
| 144 | ```c |
| 145 | struct SmallBlock { /* layout used when origSize < 64 */ |
| 146 | u32_le crc32; // CRC32 checksum |
| 147 | u32_le literal; // Always 0xFFFFFFFF for small blocks; sits where a regular Block has `bwtIndex`, for which this value is invalid |
| 148 | u8 data[parent.compressedSize - 8]; // Uncompressed data; the 8 bytes subtracted are `crc32` and `literal` |
| 149 | }; |
| 150 | ``` |
| 151 |
|
| 152 | ### Regular Block Format (≥ 64 bytes) |
| 153 |
|
| 154 | Larger blocks use a more complex format that supports multiple compression features: |
| 155 |
|
| 156 | ```c |
| 157 | struct Block { /* layout used when origSize >= 64 */ |
| 158 | u32_le crc32; // CRC32 checksum of uncompressed data |
| 159 | u32_le bwtIndex; // Burrows-Wheeler transform index |
| 160 | u8 model; // Compression model flags: 0x02 = LZP used, 0x04 = RLE used |
| 161 | |
| 162 | if ((model & 0x02) != 0) |
| 163 | u32_le lzpSize; // Size after LZP compression; present only when the LZP flag is set |
| 164 | if ((model & 0x04) != 0) |
| 165 | u32_le rleSize; // Size after RLE compression; present only when the RLE flag is set |
| 166 | |
| 167 | u8 data[parent.compressedSize - (popcnt(model) * 4 + 9)]; /* rest of the block: 9 fixed bytes (crc32, bwtIndex, model) plus 4 per set bit in model -- assumes only the two flag bits above are ever set */ |
| 168 | }; |
| 169 | ``` |
| 170 |
|
| 171 | #### Compression Model |
| 172 |
|
| 173 | The `model` byte in regular blocks indicates which compression features were used: |
| 174 |
|
| 175 | - `0x02`: LZP (Lempel Ziv Prediction) filter |
| 176 | - `0x04`: RLE (Run-Length Encoding) filter |
| 177 |
|
| 178 | ## External Resources |
| 179 |
|
| 180 | - [BZip3 Pattern for ImHex](https://github.com/WerWolv/ImHex-Patterns/pull/329) |
| 181 |
|