:: commit e038aba22a93239e1c59b708dbc9a1f994d26ff5

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-05 19:22

parents: cf66208737

refurbished api

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..4bae5fb
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,34 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "(gdb) Launch",
+            "type": "cppdbg",
+            "request": "launch",
+            "program": "${workspaceFolder}/bzip3",
+            "args": ["-e", "${workspaceFolder}/corpus/cm.c", "${workspaceFolder}/corpus/cm.bz3"],
+            "stopAtEntry": false,
+            "cwd": "${fileDirname}",
+            "environment": [],
+            "externalConsole": false,
+            "MIMode": "gdb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for gdb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                },
+                {
+                    "description":  "Set Disassembly Flavor to Intel",
+                    "text": "-gdb-set disassembly-flavor intel",
+                    "ignoreFailures": true
+                }
+            ]
+        }
+
+
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6e20f5f..3f99eae 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -7,6 +7,13 @@
         "common.h": "c",
         "libsais.h": "c",
         "cstring": "c",
-        "variant": "c"
+        "variant": "c",
+        "strstream": "c",
+        "*.tcc": "c",
+        "bitset": "c",
+        "fstream": "c",
+        "istream": "c",
+        "sstream": "c",
+        "streambuf": "c"
     }
 }
\ No newline at end of file
diff --git a/include/libbz3.h b/include/libbz3.h
index 3431268..99e65d1 100644
--- a/include/libbz3.h
+++ b/include/libbz3.h
@@ -10,64 +10,41 @@
 #define BZ3_ERR_CRC -3
 #define BZ3_ERR_MALFORMED_HEADER -4
 #define BZ3_ERR_TRUNCATED_DATA -5
+#define BZ3_ERR_DATA_TOO_BIG -6
 
-struct block_encoder_state;
-
-struct encoding_result {
-    u8 * buffer;
-    s32 size;
-};
+struct bz3_state;
 
 /**
  * @brief Get the last error number associated with a given state.
  */
-s8 get_last_error(struct block_encoder_state * state);
+s8 bz3_last_error(struct bz3_state * state);
 
 /**
  * @brief Return a user-readable message explaining the cause of the error.
  */
-const char * str_last_error(struct block_encoder_state * state);
-
-/**
- * @brief Get the input buffer associated with given state. Fill it with data
- * of length not exceeding the block size and call commit_read() to commit
- * the read operation with the number of bytes read.
- */
-u8 * get_buffer(struct block_encoder_state * state);
-
-/**
- * @brief Commit the amount of bytes inserted into the buffer.
- */
-s32 commit_read(struct block_encoder_state * state, s32 bytes_read);
+const char * bz3_strerror(struct bz3_state * state);
 
 /**
  * @brief Construct a new block encoder state.
  */
-struct block_encoder_state * new_block_encoder_state(s32 block_size);
+struct bz3_state * bz3_new(s32 block_size);
 
 /**
  * @brief Free the memory occupied by a block encoder state.
  */
-void delete_block_encoder_state(struct block_encoder_state * state);
-
-/**
- * @brief Read a block of data from provided file descriptor, put it in
- * the input buffer and commit the read.
- *
- * @param filedes
- * @param state
- * @return s32
- */
-s32 read_block(int filedes, struct block_encoder_state * state);
+void bz3_free(struct bz3_state * state);
 
 /**
  * @brief Encode a single block.
+ * Returns the number of bytes written to `buffer', or -1 on error.
+ * `buffer' must be able to hold at least `size + size / 4' bytes.
  */
-struct encoding_result encode_block(struct block_encoder_state * state);
+s32 bz3_encode_block(struct bz3_state * state, u8 * buffer, s32 size);
 
 /**
  * @brief Decode a single block.
+ * `buffer' must be able to hold at least `size + size / 4' bytes.
  */
-struct encoding_result decode_block(struct block_encoder_state * state);
+s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 size, s32 orig_size);
 
 #endif
diff --git a/include/libsais.h b/include/libsais.h
index b0cae1e..dc3b704 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -138,8 +138,7 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32
  * @param I [0..(n-1)/r] The output auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                        s32 r, s32 * I);
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I);
 
 /**
  * Creates the libsais reverse BWT context that allows reusing allocated memory
@@ -182,8 +181,7 @@ s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i)
  * @param i The primary index.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                      s32 i);
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i);
 
 /**
  * Constructs the original string from a given burrows-wheeler transformed
@@ -214,8 +212,8 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s3
  * @param I [0..(n-1)/r] The input auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                          s32 r, const s32 * I);
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+                          const s32 * I);
 
 /**
  * Constructs the permuted longest common prefix array (PLCP) of a given string
diff --git a/src/crc32.c b/src/crc32.c
index 8bed740..33d0b19 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -20,42 +20,34 @@
 #include "crc32.h"
 
 static const u32 crc32Table[256] = {
-    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L,
-    0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L,
-    0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, 0xD7C45070L,
-    0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L,
-    0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L,
-    0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L,
-    0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
-    0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L,
-    0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL,
-    0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL,
-    0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L,
-    0x80FDE395L, 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL,
-    0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L,
-    0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
-    0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L,
-    0xB7072F64L, 0xA457DC90L, 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L,
-    0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL,
-    0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L,
-    0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L,
-    0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, 0x758FE5D6L, 0x87E466D5L,
-    0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
-    0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL,
-    0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L,
-    0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, 0x590AB964L,
-    0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L,
-    0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L,
-    0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL,
-    0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL,
+    0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL,
+    0xF165B798L, 0x030E349BL, 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L,
+    0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL,
+    0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL,
+    0x1642AE59L, 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
+    0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L,
+    0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL,
+    0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L,
+    0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L,
+    0x44694011L, 0x5739B3E5L, 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
+    0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L,
+    0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L,
+    0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L, 0x502036A5L,
+    0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL,
+    0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
+    0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L,
+    0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L,
+    0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL,
+    0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L,
+    0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L,
+    0xE03E9C81L, 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
     0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
 };
 
diff --git a/src/libbz3.c b/src/libbz3.c
index 2043edc..51d5da0 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -20,9 +20,9 @@
 #define LZP_DICTIONARY 18
 #define LZP_MIN_MATCH 40
 
-struct block_encoder_state {
-    u8 *buf1, *buf2;
-    s32 bytes_read, block_size;
+struct bz3_state {
+    u8 *swap_buffer;
+    s32 block_size;
     s32 * sais_array;
     struct srt_state * srt_state;
     struct mtf_state * mtf_state;
@@ -30,9 +30,9 @@ struct block_encoder_state {
     s8 last_error;
 };
 
-s8 get_last_error(struct block_encoder_state * state) { return state->last_error; }
+s8 bz3_last_error(struct bz3_state * state) { return state->last_error; }
 
-const char * str_last_error(struct block_encoder_state * state) {
+const char * bz3_strerror(struct bz3_state * state) {
     switch (state->last_error) {
         case BZ3_OK:
             return "No error";
@@ -46,47 +46,36 @@ const char * str_last_error(struct block_encoder_state * state) {
             return "Malformed header";
         case BZ3_ERR_TRUNCATED_DATA:
             return "Truncated data";
+        case BZ3_ERR_DATA_TOO_BIG:
+            return "Too much data";
         default:
             return "Unknown error";
     }
 }
 
-u8 * get_buffer(struct block_encoder_state * state) { return state->buf1; }
+struct bz3_state * bz3_new(s32 block_size) {
+    struct bz3_state * bz3_state = malloc(sizeof(struct bz3_state));
 
-s32 commit_read(struct block_encoder_state * state, s32 bytes_read) {
-    if (bytes_read > state->block_size) {
-        state->last_error = BZ3_ERR_OUT_OF_BOUNDS;
-        return -1;
-    }
-    state->last_error = BZ3_OK;
-    return state->bytes_read = bytes_read;
-}
-
-struct block_encoder_state * new_block_encoder_state(s32 block_size) {
-    struct block_encoder_state * block_encoder_state = malloc(sizeof(struct block_encoder_state));
-
-    if (!block_encoder_state) {
+    if (!bz3_state) {
         return NULL;
     }
 
-    block_encoder_state->cm_state = malloc(sizeof(state));
-    block_encoder_state->srt_state = malloc(sizeof(struct srt_state));
-    block_encoder_state->mtf_state = malloc(sizeof(struct mtf_state));
+    bz3_state->cm_state = malloc(sizeof(state));
+    bz3_state->srt_state = malloc(sizeof(struct srt_state));
+    bz3_state->mtf_state = malloc(sizeof(struct mtf_state));
 
-    block_encoder_state->buf1 = malloc(block_size + block_size / 4);
-    block_encoder_state->buf2 = malloc(block_size + block_size / 4);
-    block_encoder_state->sais_array = malloc(block_size * sizeof(s32) + 16);
+    bz3_state->swap_buffer = malloc(block_size + block_size / 4);
+    bz3_state->sais_array = malloc(block_size * sizeof(s32) + 16);
 
-    block_encoder_state->block_size = block_size;
+    bz3_state->block_size = block_size;
 
-    block_encoder_state->last_error = BZ3_OK;
+    bz3_state->last_error = BZ3_OK;
 
-    return block_encoder_state;
+    return bz3_state;
 }
 
-void delete_block_encoder_state(struct block_encoder_state * state) {
-    free(state->buf1);
-    free(state->buf2);
+void bz3_free(struct bz3_state * state) {
+    free(state->swap_buffer);
     free(state->sais_array);
     free(state->srt_state);
     free(state->mtf_state);
@@ -96,19 +85,22 @@ void delete_block_encoder_state(struct block_encoder_state * state) {
 
 #define swap(x, y) { u8 * tmp = x; x = y; y = tmp; }
 
-struct encoding_result encode_block(struct block_encoder_state * state) {
-    u8 * b1 = state->buf1, * b2 = state->buf2;
-    s32 data_size = state->bytes_read;
+s32 bz3_encode_block(struct bz3_state * state, u8 * buffer, s32 data_size) {
+    u8 * b1 = buffer, * b2 = state->swap_buffer; s32 initial_size = data_size;
+
+    if(data_size > state->block_size) {
+        state->last_error = BZ3_ERR_DATA_TOO_BIG;
+        return -1;
+    }
     
     u32 crc32 = crc32sum(1, b1, data_size);
 
     // Ignore small blocks. They won't benefit from the entropy coding step.
     if(data_size < 64) {
-        ((s32 *) (b2))[0] = htonl(data_size + 8);
-        ((u32 *) (b2))[1] = htonl(crc32);
-        ((s32 *) (b2))[2] = htonl(-1);
-        memcpy(b2 + 12, b1, data_size);
-        return (struct encoding_result) { .buffer = b2, .size = data_size + 12 };
+        ((u32 *) (b1))[0] = htonl(crc32);
+        ((s32 *) (b1))[1] = htonl(-1);
+        memmove(b1 + 8, b1, data_size);
+        return data_size + 8;
     }
 
     // Back to front:
@@ -138,75 +130,88 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
     s32 bwt_idx = libsais_bwt(b1, b2, state->sais_array, data_size, 16, NULL);
     if(bwt_idx < 0) {
         state->last_error = BZ3_ERR_BWT;
-        return (struct encoding_result) { .buffer = NULL, .size = -1 };
+        return -1;
     }
-    swap(b1, b2);
+    
+    // Important: b2 is the input now, b1 is the output.
+    // This avoids an expensive memory copy.
     
     s32 srt_size;
     if((model & 1) == 0) {
         if(data_size > MiB(3)) {
-            srt_size = srt_encode(state->srt_state, b1, b2, data_size);
+            srt_size = srt_encode(state->srt_state, b2, b1, data_size);
             swap(b1, b2);
             data_size = srt_size;
             model |= 4;
         } else {
-            mtf_encode(state->mtf_state, b1, b2, data_size);
+            mtf_encode(state->mtf_state, b2, b1, data_size);
             swap(b1, b2);
             model |= 8;
         }
     }
 
     // Compute the amount of overhead dwords.
-    s32 overhead = 4; // CRC32 + BWT index + original size + new size
+    s32 overhead = 2; // CRC32 + BWT index
     if((model & 2) || (model & 16)) overhead++; // LZP
     if(model & 4) overhead++; // sorted rank transform
 
     begin(state->cm_state);
-    state->cm_state->out_queue = b2 + overhead * 4 + 1;
+    state->cm_state->out_queue = b1 + overhead * 4 + 1;
     state->cm_state->output_ptr = 0;
-    for (s32 i = 0; i < data_size; i++) encode_byte(state->cm_state, b1[i]);
+    for (s32 i = 0; i < data_size; i++) encode_byte(state->cm_state, b2[i]);
     flush(state->cm_state);
     data_size = state->cm_state->output_ptr;
 
-    // Write the header. Starting with common entries:
-    ((s32 *) (b2))[0] = htonl(data_size + overhead * 4 - 3);
-    ((u32 *) (b2))[1] = htonl(crc32);
-    ((s32 *) (b2))[2] = htonl(bwt_idx);
-    ((s32 *) (b2))[3] = htonl(state->bytes_read);
-    b2[16] = model;
+    // Write the header. Starting with common entries.
+    ((u32 *) (b1))[0] = htonl(crc32);
+    ((s32 *) (b1))[1] = htonl(bwt_idx);
+    b1[8] = model;
 
     s32 p = 0;
-    if((model & 2) || (model & 16)) ((s32 *)(b2 + 17))[p++] = htonl(lzp_size);
-    if(model & 4) ((s32 *)(b2 + 17))[p++] = htonl(srt_size);
+    if((model & 2) || (model & 16)) ((s32 *)(b1 + 9))[p++] = htonl(lzp_size);
+    if(model & 4) ((s32 *)(b1 + 9))[p++] = htonl(srt_size);
+
+    state->last_error = BZ3_OK;
+
+    // XXX: Find a better solution than copying the encoded block back into `buffer'.
+    if(b1 != buffer)
+        memcpy(buffer, b1, data_size + overhead * 4 + 1);
 
-    return (struct encoding_result) { .buffer = b2, .size = data_size + overhead * 4 + 1 };
+    return data_size + overhead * 4 + 1;
 }
 
-struct encoding_result decode_block(struct block_encoder_state * state) {
+s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 data_size, s32 orig_size) {
     // Read the header.
-    s32 data_len = ntohl(((s32 *) state->buf1)[0]) - 1;
-    u32 crc32 = ntohl(((u32 *) state->buf1)[1]);
-    s32 bwt_idx = ntohl(((s32 *) state->buf1)[2]);
+    u32 crc32 = ntohl(((u32 *) buffer)[0]);
+    s32 bwt_idx = ntohl(((s32 *) buffer)[1]);
+
+    if(bwt_idx == -1) {
+        memmove(buffer, buffer + 8, data_size - 8);
+        return data_size - 8;
+    }
 
-    if(bwt_idx == -1)
-        return (struct encoding_result) { .buffer = state->buf1 + 12, .size = data_len - 7 };
+    if(orig_size > state->block_size) {
+        state->last_error = BZ3_ERR_DATA_TOO_BIG;
+        return -1;
+    }
 
-    s32 orig_size = ntohl(((s32 *) state->buf1)[3]);
-    s8 model = state->buf1[16];
+    s8 model = buffer[8];
     s32 lzp_size = -1, srt_size = -1, p = 0;
 
-    if((model & 2) || (model & 16)) lzp_size = ntohl(((s32 *) (state->buf1 + 17))[p++]);
-    if(model & 4) srt_size = ntohl(((s32 *) (state->buf1 + 17))[p++]);
+    if((model & 2) || (model & 16)) lzp_size = ntohl(((s32 *) (buffer + 9))[p++]);
+    if(model & 4) srt_size = ntohl(((s32 *) (buffer + 9))[p++]);
+
+    p += 2;
 
-    data_len -= p * 4;
+    data_size -= p * 4 + 1;
 
     // Decode the data.
-    u8 * b1 = state->buf1, * b2 = state->buf2;
+    u8 * b1 = buffer, * b2 = state->swap_buffer;
 
     begin(state->cm_state);
-    state->cm_state->in_queue = b1 + 17 + p * 4;
+    state->cm_state->in_queue = b1 + p * 4 + 1;
     state->cm_state->input_ptr = 0;
-    state->cm_state->input_max = data_len;
+    state->cm_state->input_max = data_size;
     init(state->cm_state);
 
     s32 size_src;
@@ -234,7 +239,7 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
     // Undo BWT
     if (libsais_unbwt(b1, b2, state->sais_array, size_src, NULL, bwt_idx) < 0) {
         state->last_error = BZ3_ERR_BWT;
-        return (struct encoding_result) { .buffer = NULL, .size = -1 };
+        return -1;
     }
     swap(b1, b2);
 
@@ -248,28 +253,18 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
         swap(b1, b2);
     }
 
-    return (struct encoding_result) { .buffer = b1, .size = size_src };
-}
-
-#undef swap
+    state->last_error = BZ3_OK;
 
-s32 read_block(int filedes, struct block_encoder_state * state) {
-    s32 bytes_read = read(filedes, state->buf1, 4);
-    if (bytes_read == 0) return 0;
-    if (bytes_read != 4) {
-        state->last_error = BZ3_ERR_MALFORMED_HEADER;
-        return -1;
-    }
-    s32 data_size = ntohl(((uint32_t *)state->buf1)[0]);
-    if (data_size > state->block_size) {
-        state->last_error = BZ3_ERR_MALFORMED_HEADER;
-        return -1;
-    }
-    bytes_read = read(filedes, state->buf1 + 4, data_size);
-    if (bytes_read != data_size) {
-        state->last_error = BZ3_ERR_TRUNCATED_DATA;
+    // XXX: Find a better solution than copying the decoded data back into `buffer'.
+    if(b1 != buffer)
+        memcpy(buffer, b1, size_src);
+    
+    if(crc32 != crc32sum(1, buffer, size_src)) {
+        state->last_error = BZ3_ERR_CRC;
         return -1;
     }
-    state->last_error = BZ3_OK;
-    return state->bytes_read = 4 + data_size;
+
+    return size_src;
 }
+
+#undef swap
diff --git a/src/libsais.c b/src/libsais.c
index c2d97a8..7800bff 100644
--- a/src/libsais.c
+++ b/src/libsais.c
@@ -132,19 +132,15 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
 #endif
 
 #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
-    #if defined(_LITTLE_ENDIAN) ||                                                                \
-        (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) ||         \
-        (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) ||     \
-        (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
-        (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) &&                           \
-         __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    #if defined(_LITTLE_ENDIAN) || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) || \
+        (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) ||                        \
+        (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) ||                    \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
         #define __LITTLE_ENDIAN__
-    #elif defined(_BIG_ENDIAN) ||                                                           \
-        (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) ||         \
-        (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) ||     \
-        (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
-        (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) &&                        \
-         __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    #elif defined(_BIG_ENDIAN) || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) || \
+        (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) ||                       \
+        (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) ||                   \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
         #define __BIG_ENDIAN__
     #elif defined(_WIN32)
         #define __LITTLE_ENDIAN__
@@ -166,15 +162,13 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
 #endif
 
 static void * libsais_align_up(const void * address, size_t alignment) {
-    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) &
-                    (-((ptrdiff_t)alignment)));
+    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
 }
 
 static void * libsais_alloc_aligned(size_t size, size_t alignment) {
     void * address = malloc(size + sizeof(short) + alignment - 1);
     if (address != NULL) {
-        void * aligned_address =
-            libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
+        void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
         ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
 
         return aligned_address;
@@ -190,10 +184,10 @@ static void libsais_free_aligned(void * aligned_address) {
 }
 
 static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) {
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
-        (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
-    sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned(
-        (size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+        (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+    sa_sint_t * RESTRICT thread_buckets =
+        (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
     LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
         (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
 
@@ -224,12 +218,9 @@ static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) {
 }
 
 static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) {
-    LIBSAIS_CONTEXT * RESTRICT ctx =
-        (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
-    sa_sint_t * RESTRICT buckets =
-        (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
-        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+    LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
     if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) {
         ctx->buckets = buckets;
@@ -252,14 +243,12 @@ static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
         libsais_free_aligned(ctx);
     }
 }
-static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                           sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start,
-                                           fast_sint_t omp_block_size) {
+static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m,
+                                           fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 128;
 
-        fast_sint_t i, j = omp_block_start + omp_block_size,
-                       c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+        fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
 
         while (j < n && (c1 = T[j]) == c0) {
             ++j;
@@ -300,9 +289,8 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RE
     }
 }
 
-static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                               sa_sint_t n, sa_sint_t threads,
-                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                               sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -312,18 +300,15 @@ static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start,
-                                           omp_block_size);
+            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
         }
     }
 }
 
-static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
-                                                 sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -364,8 +349,8 @@ static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
     return n - 1 - m;
 }
 
-static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
-                                                           sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                           sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -405,8 +390,8 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RES
 
     return n - 1 - m;
 }
-static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
-                                              sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
@@ -450,10 +435,8 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_s
 
     buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
 }
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T,
-                                                          sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                          sa_sint_t * RESTRICT buckets,
-                                                          fast_sint_t omp_block_start,
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                          sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
                                                           fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
@@ -517,9 +500,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T,
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t threads,
+                                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
     {
@@ -531,21 +515,19 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start,
-                                                         omp_block_size);
+            m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size);
         }
     }
 
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                              fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -613,9 +595,9 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                              fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -683,9 +665,11 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
+                                                                        sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                        sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                        fast_sint_t omp_block_start,
+                                                                        fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -704,14 +688,10 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
         for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
 
             c1 = T[i - 0];
             s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -763,9 +743,8 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
-                                                                       sa_sint_t * RESTRICT SA,
-                                                                       sa_sint_t n, sa_sint_t k,
-                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
                                                                        sa_sint_t threads) {
     sa_sint_t m = 0;
     {
@@ -782,9 +761,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
-                                                                       sa_sint_t * RESTRICT SA,
-                                                                       sa_sint_t n, sa_sint_t k,
-                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
                                                                        sa_sint_t threads) {
     sa_sint_t m = 0;
     {
@@ -800,9 +778,11 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                                 sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                                 sa_sint_t k,
+                                                                                 sa_sint_t * RESTRICT buckets,
+                                                                                 sa_sint_t threads) {
     sa_sint_t m = 0;
     {
         (void)(threads);
@@ -817,9 +797,10 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k,
+                                                                  sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
     (void)(thread_state);
 
@@ -828,9 +809,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k,
+                                                                  sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
     (void)(thread_state);
 
@@ -839,15 +821,14 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
     return m;
 }
 
-static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     (void)(thread_state);
 
-    {
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets,
-                                                                        threads);
-    }
+    { libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); }
 }
 
 static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
@@ -875,8 +856,7 @@ static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets,
-                                                        sa_sint_t * RESTRICT freq) {
+static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
     sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
@@ -886,9 +866,8 @@ static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buc
         for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += 1) {
             bucket_start[j] = sum;
-            sum +=
-                (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
-                           buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
+            sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+                              buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
             bucket_end[j] = sum;
         }
     } else {
@@ -904,8 +883,7 @@ static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buc
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k,
-                                                            sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
     sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
@@ -920,8 +898,7 @@ static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k,
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k,
-                                                            sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
     sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
@@ -938,15 +915,13 @@ static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k,
 static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i;
     sa_sint_t sum0 = 0;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
-         i += BUCKETS_INDEX2(1, 0)) {
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
         sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k,
-                                                            sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i, j;
     for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0), j += 1) {
@@ -976,8 +951,9 @@ static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTR
     }
 }
 
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) {
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const u8 * RESTRICT T,
+                                                                           sa_sint_t * RESTRICT buckets,
+                                                                           sa_sint_t first_lms_suffix) {
     {
         fast_uint_t s = 0;
         fast_sint_t c0 = T[first_lms_suffix];
@@ -998,8 +974,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
 
         fast_sint_t i, j;
         sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
-             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
             temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum;
             sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
@@ -1010,16 +985,15 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
     }
 }
 
-static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix) {
+static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k,
+                                                                          sa_sint_t * RESTRICT buckets,
+                                                                          sa_sint_t first_lms_suffix) {
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
 
     fast_sint_t i;
     sa_sint_t sum0 = 0, sum1 = 0;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
-         i += BUCKETS_INDEX2(1, 0)) {
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
         sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
 
@@ -1028,9 +1002,10 @@ static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
     }
 }
 
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix) {
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T,
+                                                                               sa_sint_t k,
+                                                                               sa_sint_t * RESTRICT buckets,
+                                                                               sa_sint_t first_lms_suffix) {
     {
         fast_uint_t s = 0;
         fast_sint_t c0 = T[first_lms_suffix];
@@ -1061,9 +1036,9 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
     }
 }
 
-static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix) {
+static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k,
+                                                                            sa_sint_t * RESTRICT buckets,
+                                                                            sa_sint_t first_lms_suffix) {
     sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
     sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
@@ -1085,14 +1060,12 @@ static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
 }
 
 static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                               sa_sint_t * RESTRICT induction_bucket,
-                                               fast_sint_t omp_block_start,
+                                               sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
                                                fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
-         i >= j; i -= 4) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
@@ -1116,9 +1089,8 @@ static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                   sa_sint_t n, sa_sint_t m,
-                                                   sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
@@ -1127,23 +1099,19 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sin
         fast_sint_t omp_num_threads = 1;
 
         if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE],
-                                               (fast_sint_t)n - (fast_sint_t)m + 1,
+            libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1,
                                                (fast_sint_t)m - 1);
         }
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T,
-                                                   sa_sint_t * RESTRICT SA,
-                                                   sa_sint_t * RESTRICT induction_bucket,
-                                                   fast_sint_t omp_block_start,
+static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
-         i >= j; i -= 4) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
@@ -1172,16 +1140,13 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T,
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
-                                                   sa_sint_t * RESTRICT SA,
-                                                   sa_sint_t * RESTRICT induction_bucket,
-                                                   fast_sint_t omp_block_start,
+static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
-         i >= j; i -= 4) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
@@ -1209,31 +1174,30 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
         SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
     }
 }
-static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
+                                                       sa_sint_t threads,
+                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || m < 65536) {
-        libsais_radix_sort_lms_suffixes_32s_6k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+        libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+                                               (fast_sint_t)m - 1);
     }
     (void)(thread_state);
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
+                                                       sa_sint_t threads,
+                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || m < 65536) {
-        libsais_radix_sort_lms_suffixes_32s_2k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+        libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+                                               (fast_sint_t)m - 1);
     }
     (void)(thread_state);
 }
 
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t * RESTRICT buckets) {
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                        sa_sint_t n, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -1297,15 +1261,12 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRI
     return m;
 }
 
-static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t * RESTRICT induction_bucket,
-                                                  fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+                                                  fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
@@ -1324,15 +1285,12 @@ static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t * RESTRICT induction_bucket,
-                                                  fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+                                                  fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
 
         libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
@@ -1352,33 +1310,28 @@ static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA,
 }
 
 static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                      sa_sint_t * RESTRICT induction_bucket,
-                                                      sa_sint_t threads) {
+                                                      sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
     {
         (void)(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
-        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start,
-                                              omp_block_size);
+        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
     }
 }
 
 static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                      sa_sint_t * RESTRICT induction_bucket,
-                                                      sa_sint_t threads) {
+                                                      sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
     {
         (void)(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
-        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start,
-                                              omp_block_size);
+        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T,
-                                                              sa_sint_t * RESTRICT buckets,
+static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
                                                               sa_sint_t first_lms_suffix,
                                                               sa_sint_t left_suffixes_count) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
@@ -1387,8 +1340,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRIC
 
     fast_sint_t i, j;
     sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
-    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
-         i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
          i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
 
@@ -1400,8 +1352,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRIC
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T,
-                                                                  sa_sint_t k,
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k,
                                                                   sa_sint_t * RESTRICT buckets,
                                                                   sa_sint_t first_lms_suffix,
                                                                   sa_sint_t left_suffixes_count) {
@@ -1430,8 +1381,7 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_
         temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
     }
 
-    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
-         i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
+    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
         sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
         sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
@@ -1451,17 +1401,17 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                               sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                               fast_sint_t omp_block_start,
+                                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -1473,16 +1423,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
         d += (p0 < 0);
         p0 &= SAINT_MAX;
         sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
-        SA[induction_bucket[v0]++] =
-            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = SA[i + 1];
         d += (p1 < 0);
         p1 &= SAINT_MAX;
         sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
-        SA[induction_bucket[v1]++] =
-            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -1491,17 +1439,17 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
         d += (p < 0);
         p &= SAINT_MAX;
         sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
-        SA[induction_bucket[v]++] =
-            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
         distinct_names[v] = d;
     }
 
     return d;
 }
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
-    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                   sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                   sa_sint_t left_suffixes_count, sa_sint_t d,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
@@ -1509,21 +1457,21 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
     distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
     if (threads == 1 || left_suffixes_count < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0,
-                                                          left_suffixes_count);
+        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
     }
     (void)(thread_state);
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T,
+                                                                   sa_sint_t * RESTRICT SA,
+                                                                   sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                                   fast_sint_t omp_block_start,
+                                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -1565,18 +1513,18 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T,
+                                                                   sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                                   sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                                   fast_sint_t omp_block_start,
+                                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
@@ -1609,9 +1557,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             d += (p0 >> (SUFFIX_GROUP_BIT - 1));
             p0 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
-            SA[induction_bucket[T[p0 - 1]]++] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
+                                                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
         }
 
@@ -1622,9 +1569,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             d += (p1 >> (SUFFIX_GROUP_BIT - 1));
             p1 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
-            SA[induction_bucket[T[p1 - 1]]++] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
+                                                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
         }
     }
@@ -1637,9 +1583,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             d += (p >> (SUFFIX_GROUP_BIT - 1));
             p &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
-            SA[induction_bucket[T[p - 1]]++] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
+                                               ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
         }
     }
@@ -1647,16 +1592,13 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T,
-                                                              sa_sint_t * RESTRICT SA,
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                               sa_sint_t * RESTRICT induction_bucket,
-                                                              fast_sint_t omp_block_start,
-                                                              fast_sint_t omp_block_size) {
+                                                              fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
@@ -1680,15 +1622,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t *
         SA[i + 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             SA[i + 0] = 0;
-            SA[induction_bucket[T[p0 - 1]]++] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             SA[i + 1] = 0;
-            SA[induction_bucket[T[p1 - 1]]++] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -1697,30 +1637,28 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t *
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             SA[i] = 0;
-            SA[induction_bucket[T[p - 1]]++] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
         }
     }
 }
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
     buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
     if (threads == 1 || left_suffixes_count < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0,
-                                                              left_suffixes_count);
+        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
     }
     (void)(thread_state);
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t d, sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
@@ -1735,9 +1673,10 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
@@ -1747,8 +1686,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
 }
 
 static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                         const sa_sint_t * RESTRICT buckets,
-                                                         sa_sint_t threads) {
+                                                         const sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
@@ -1757,13 +1695,11 @@ static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA
     (void)(threads);
     (void)(n);
 
-    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0);
-         c -= BUCKETS_INDEX2(1, 0)) {
+    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) {
         fast_sint_t i, j;
         sa_sint_t s = SAINT_MIN;
-        for (i = (fast_sint_t)temp_bucket[c] - 1,
-            j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3;
-             i >= j; i -= 4) {
+        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j;
+             i -= 4) {
             libsais_prefetchw(&SA[i - prefetch_distance]);
 
             sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
@@ -1789,8 +1725,7 @@ static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA
 }
 
 static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                             const sa_sint_t * RESTRICT buckets,
-                                                             sa_sint_t threads) {
+                                                             const sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
@@ -1836,55 +1771,52 @@ static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA
     for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
         libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p0 = SA[i - 0],
+                  q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q0;
         SA[i - 0] = p0 ^ q0;
-        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p1 = SA[i - 1],
+                  q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q1;
         SA[i - 1] = p1 ^ q1;
-        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p2 = SA[i - 2],
+                  q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q2;
         SA[i - 2] = p2 ^ q2;
-        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p3 = SA[i - 3],
+                  q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q3;
         SA[i - 3] = p3 ^ q3;
     }
 
     for (; i >= 0; i -= 1) {
-        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
-                                 ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q;
         SA[i] = p ^ q;
     }
 }
 
-static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k,
-                                                         sa_sint_t * RESTRICT buckets) {
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
     fast_sint_t i;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
-         i += BUCKETS_INDEX2(1, 0)) {
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
         buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
         buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                               sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                               fast_sint_t omp_block_start,
+                                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
@@ -1896,16 +1828,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
         d += (p0 < 0);
         p0 &= SAINT_MAX;
         sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        SA[--induction_bucket[v0]] =
-            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = SA[i - 1];
         d += (p1 < 0);
         p1 &= SAINT_MAX;
         sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        SA[--induction_bucket[v1]] =
-            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -1914,35 +1844,35 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
         d += (p < 0);
         p &= SAINT_MAX;
         sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--induction_bucket[v]] =
-            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
         distinct_names[v] = d;
     }
 
     return d;
 }
-static void libsais_partial_sorting_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_partial_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+                                                              sa_sint_t d, sa_sint_t threads,
+                                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
     fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
     if (threads == 1 || (scan_end - scan_start) < 65536) {
-        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start,
-                                                      scan_end - scan_start);
+        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
     }
     (void)(thread_state);
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T,
+                                                                   sa_sint_t * RESTRICT SA,
+                                                                   sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                                   fast_sint_t omp_block_start,
+                                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
@@ -1984,18 +1914,18 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T,
+                                                                   sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                                   sa_sint_t * RESTRICT buckets, sa_sint_t d,
+                                                                   fast_sint_t omp_block_start,
+                                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
@@ -2027,9 +1957,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             d += (p0 >> (SUFFIX_GROUP_BIT - 1));
             p0 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-            SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+                                                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
         }
 
@@ -2039,9 +1968,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             d += (p1 >> (SUFFIX_GROUP_BIT - 1));
             p1 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-            SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+                                                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
         }
     }
@@ -2053,9 +1981,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             d += (p >> (SUFFIX_GROUP_BIT - 1));
             p &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-            SA[--induction_bucket[T[p - 1]]] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+                                               ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
         }
     }
@@ -2063,16 +1990,13 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T,
-                                                              sa_sint_t * RESTRICT SA,
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                               sa_sint_t * RESTRICT induction_bucket,
-                                                              fast_sint_t omp_block_start,
-                                                              fast_sint_t omp_block_size) {
+                                                              fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
@@ -2095,14 +2019,12 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t *
         sa_sint_t p0 = SA[i - 0];
         if (p0 > 0) {
             SA[i - 0] = 0;
-            SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         if (p1 > 0) {
             SA[i - 1] = 0;
-            SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -2110,30 +2032,29 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t *
         sa_sint_t p = SA[i];
         if (p > 0) {
             SA[i] = 0;
-            SA[--induction_bucket[T[p - 1]]] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
         }
     }
 }
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
-    sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
     fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
     if (threads == 1 || (scan_end - scan_start) < 65536) {
-        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start,
-                                                              scan_end - scan_start);
+        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
     }
     (void)(thread_state);
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t d, sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
     }
@@ -2141,9 +2062,10 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
     }
@@ -2156,8 +2078,7 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 0];
@@ -2189,8 +2110,7 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 0];
@@ -2216,9 +2136,9 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t
     return l;
 }
 
-static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -2228,8 +2148,7 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
             libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
@@ -2237,9 +2156,9 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
     }
 }
 
-static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -2249,8 +2168,7 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
             libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
@@ -2258,63 +2176,52 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
     }
 }
 
-static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                sa_sint_t n, sa_sint_t * RESTRICT buckets,
-                                                sa_sint_t first_lms_suffix,
+static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
                                                 sa_sint_t left_suffixes_count, sa_sint_t threads,
                                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
-        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads,
+                                                                    thread_state);
     libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
-    libsais_partial_sorting_scan_right_to_left_8u_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d,
+                                                      threads, thread_state);
 }
 
-static void libsais_induce_partial_order_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
-        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+                                                    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0,
+                                                                        threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
     libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d,
+                                                          threads, thread_state);
 }
 
-static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0,
-                                                                        threads, thread_state);
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_4k(SA, n);
-    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads,
-                                                          thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
     libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
 }
 
-static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads,
-                                                          thread_state);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads,
-                                                          thread_state);
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
     libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
 }
 
-static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
@@ -2327,16 +2234,14 @@ static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T
     libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
 }
 
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                  sa_sint_t name, fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
+                                                  fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
@@ -2367,17 +2272,15 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_si
     return name;
 }
 
-static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                     fast_sint_t l, fast_sint_t omp_block_start,
-                                                     fast_sint_t omp_block_size) {
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
+                                                     fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     l -= 1;
 
     fast_sint_t i, j;
-    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
-        j = (fast_sint_t)m + omp_block_start + 3;
-         i >= j; i -= 4) {
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
+         i -= 4) {
         libsais_prefetch(&SA[i - prefetch_distance]);
 
         sa_sint_t s0 = SA[i - 0];
@@ -2405,9 +2308,8 @@ static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa
     return l;
 }
 
-static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t name = 0;
     {
         (void)(threads);
@@ -2418,8 +2320,7 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
 
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
             name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
@@ -2429,9 +2330,8 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
     return name;
 }
 
-static void libsais_gather_marked_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+                                                      sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -2441,20 +2341,18 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
 
         fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs,
-                                              omp_block_start, omp_block_size);
+            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
         }
     }
 }
 
-static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                                 sa_sint_t fs, sa_sint_t threads,
+                                                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
     sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
@@ -2470,8 +2368,7 @@ static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
     return name;
 }
 
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                               sa_sint_t name,
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
                                                                fast_sint_t omp_block_start,
                                                                fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -2480,8 +2377,7 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTR
 
     fast_sint_t i, j;
     sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
@@ -2513,16 +2409,14 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTR
     return name;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                   fast_sint_t omp_block_start,
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
     sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = (fast_sint_t)m + omp_block_start,
-        j = (fast_sint_t)m + omp_block_start + omp_block_size - 3;
-         i < j; i += 4) {
+    for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j;
+         i += 4) {
         libsais_prefetchw(&SA[i + prefetch_distance]);
 
         p0 = SA[i + 0];
@@ -2547,8 +2441,7 @@ static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_s
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                  fast_sint_t omp_block_start,
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
@@ -2569,9 +2462,9 @@ static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_si
     }
 }
 
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t name = 0;
     {
         (void)(threads);
@@ -2582,20 +2475,18 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
 
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start,
-                                                                 omp_block_size);
+            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
         }
     }
 
     return name - 1;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t m, sa_sint_t threads) {
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                       sa_sint_t threads) {
     {
         (void)(threads);
 
@@ -2606,8 +2497,8 @@ static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                      sa_sint_t m, sa_sint_t threads) {
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                      sa_sint_t threads) {
     {
         (void)(threads);
 
@@ -2623,8 +2514,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    sa_sint_t name =
-        libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+    sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
     if (name < m) {
         libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
     }
@@ -2632,8 +2522,9 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
     return name;
 }
 
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T,
+                                                                            sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                            sa_sint_t m, sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
@@ -2644,8 +2535,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
         memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
 
         fast_sint_t i, j;
-        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3;
-             i < j; i += 4) {
+        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) {
             libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
             libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
@@ -2743,15 +2633,13 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
 }
 
 static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-                                             fast_sint_t omp_block_start,
-                                             fast_sint_t omp_block_size) {
+                                             fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT SAnm = &SA[n - m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
@@ -2770,8 +2658,7 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t
     }
 }
 
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-                                                 sa_sint_t threads) {
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
     {
         (void)(threads);
 
@@ -2781,8 +2668,7 @@ static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sin
     }
 }
 
-static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                   sa_sint_t m,
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
                                                    const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
@@ -2803,8 +2689,7 @@ static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_s
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t k, sa_sint_t m,
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
                                                        const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
@@ -2825,17 +2710,15 @@ static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t k, sa_sint_t m,
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
                                                        const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
     if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
-             c -= BUCKETS_INDEX2(1, 0)) {
-            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
-                            (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+            fast_sint_t l =
+                (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
             if (l > 0) {
                 fast_sint_t i = buckets[c];
                 if (j - i > 0) {
@@ -2850,9 +2733,8 @@ static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T,
-                                                       sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                       sa_sint_t m, sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t c = k - 1;
@@ -2908,8 +2790,7 @@ static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRIC
     memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
                                                         const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
@@ -2929,8 +2810,7 @@ static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
                                                         const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
@@ -2950,15 +2830,13 @@ static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
                                                         const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
     if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
-             c -= BUCKETS_INDEX2(1, 0)) {
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
             fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
             if (l > 0) {
                 fast_sint_t i = buckets[c];
@@ -2975,14 +2853,12 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA,
 }
 
 static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                    sa_sint_t * RESTRICT induction_bucket,
-                                                    fast_sint_t omp_block_start,
+                                                    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
                                                     fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3001,16 +2877,14 @@ static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_si
         if (p0 > 0) {
             p0--;
             SA[i + 0] = T[p0] | SAINT_MIN;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
             SA[i + 1] = T[p1] | SAINT_MIN;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3020,23 +2894,18 @@ static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_si
         if (p > 0) {
             p--;
             SA[i] = T[p] | SAINT_MIN;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
-                                                        sa_sint_t * RESTRICT I,
-                                                        sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+                                                        sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3055,8 +2924,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
         if (p0 > 0) {
             p0--;
             SA[i + 0] = T[p0] | SAINT_MIN;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
             if ((p0 & rm) == 0) {
                 I[p0 / (rm + 1)] = induction_bucket[T[p0]];
             }
@@ -3066,8 +2934,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
         if (p1 > 0) {
             p1--;
             SA[i + 1] = T[p1] | SAINT_MIN;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
             if ((p1 & rm) == 0) {
                 I[p1 / (rm + 1)] = induction_bucket[T[p1]];
             }
@@ -3080,8 +2947,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
         if (p > 0) {
             p--;
             SA[i] = T[p] | SAINT_MIN;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
             if ((p & rm) == 0) {
                 I[p / (rm + 1)] = induction_bucket[T[p]];
             }
@@ -3089,16 +2955,13 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA,
+static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                         sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
+                                                        fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3116,15 +2979,13 @@ static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
         SA[i + 0] = p0 ^ SAINT_MIN;
         if (p0 > 0) {
             p0--;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3133,22 +2994,18 @@ static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
         SA[i] = p ^ SAINT_MIN;
         if (p > 0) {
             p--;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T,
-                                                         sa_sint_t * RESTRICT SA,
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                          sa_sint_t * RESTRICT induction_bucket,
-                                                         fast_sint_t omp_block_start,
-                                                         fast_sint_t omp_block_size) {
+                                                         fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
@@ -3172,15 +3029,13 @@ static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTR
         SA[i + 0] = p0 ^ SAINT_MIN;
         if (p0 > 0) {
             p0--;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3189,18 +3044,15 @@ static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTR
         SA[i] = p ^ SAINT_MIN;
         if (p > 0) {
             p--;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 }
-static void libsais_final_bwt_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_bwt_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+                                                        sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                        LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
         libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
@@ -3208,13 +3060,12 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
     (void)(thread_state);
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                            fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if ((((sa_sint_t)n - 1) & rm) == 0) {
         I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
@@ -3226,13 +3077,12 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
     (void)(thread_state);
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                            fast_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                            sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
@@ -3240,12 +3090,11 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
     (void)(thread_state);
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[induction_bucket[T[n - 1]]++] =
-        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
@@ -3253,17 +3102,14 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(
     (void)(thread_state);
 }
 
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
-                                                         sa_sint_t * RESTRICT SA,
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                          sa_sint_t * RESTRICT induction_bucket,
-                                                         fast_sint_t omp_block_start,
-                                                         fast_sint_t omp_block_size) {
+                                                         fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
     sa_sint_t index = -1;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - prefetch_distance - 0];
@@ -3316,17 +3162,13 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
     return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
-                                                        sa_sint_t * RESTRICT I,
-                                                        sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+                                                        sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - prefetch_distance - 0];
@@ -3383,16 +3225,13 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA,
+static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                         sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
+                                                        fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - prefetch_distance - 0];
@@ -3410,15 +3249,13 @@ static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
         SA[i - 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            SA[--induction_bucket[T[p0]]] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            SA[--induction_bucket[T[p1]]] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3427,22 +3264,18 @@ static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            SA[--induction_bucket[T[p]]] =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
         }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T,
-                                                         sa_sint_t * RESTRICT SA,
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                          sa_sint_t * RESTRICT induction_bucket,
-                                                         fast_sint_t omp_block_start,
-                                                         fast_sint_t omp_block_size) {
+                                                         fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
@@ -3466,15 +3299,13 @@ static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTR
         SA[i - 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            SA[--induction_bucket[T[p0]]] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            SA[--induction_bucket[T[p1]]] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3483,15 +3314,14 @@ static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTR
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            SA[--induction_bucket[T[p]]] =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
         }
     }
 }
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t index = -1;
 
     if (threads == 1 || n < 65536) {
@@ -3501,30 +3331,29 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
     return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                            sa_sint_t rm, sa_sint_t * RESTRICT I,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
     }
     (void)(thread_state);
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
     }
     (void)(thread_state);
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
     }
@@ -3532,90 +3361,75 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
 }
 
 static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-                                           sa_sint_t * RESTRICT bucket_start,
-                                           sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) {
+                                           sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end,
+                                           sa_sint_t threads) {
     fast_sint_t c;
     (void)(threads);
     (void)(n);
 
     for (c = 0; c < k; ++c) {
         if (bucket_end[c] > bucket_start[c]) {
-            memset(&SA[bucket_start[c]], 0,
-                   ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
+            memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
         }
     }
 }
 
-static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                   sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
-                                                   sa_sint_t * RESTRICT I,
+static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I,
                                                    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (!bwt) {
-        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE],
-                                                        threads, thread_state);
+        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
         if (threads > 1 && n >= 65536) {
             libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
                                            &buckets[7 * ALPHABET_SIZE], threads);
         }
-        libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
-                                                        threads, thread_state);
+        libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
         return 0;
     } else if (I != NULL) {
-        libsais_final_bwt_aux_scan_left_to_right_8u_omp(
-            T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+        libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads,
+                                                        thread_state);
         if (threads > 1 && n >= 65536) {
             libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
                                            &buckets[7 * ALPHABET_SIZE], threads);
         }
-        libsais_final_bwt_aux_scan_right_to_left_8u_omp(
-            T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+        libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads,
+                                                        thread_state);
         return 0;
     } else {
-        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads,
-                                                    thread_state);
+        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
         if (threads > 1 && n >= 65536) {
             libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
                                            &buckets[7 * ALPHABET_SIZE], threads);
         }
-        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
-                                                           threads, thread_state);
+        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads,
+                                                           thread_state);
     }
 }
 
-static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads,
-                                                     thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads,
-                                                     thread_state);
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state);
 }
 
-static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads,
-                                                     thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads,
-                                                     thread_state);
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state);
 }
 
-static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads,
-                                                     thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads,
-                                                     thread_state);
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
 }
 
-static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
@@ -3626,17 +3440,17 @@ static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_s
     libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
 }
 
-static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                        sa_sint_t m, sa_sint_t f,
+                                                                        fast_sint_t omp_block_start,
+                                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     sa_sint_t i, j;
-    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start +
-                                             (sa_sint_t)omp_block_size -
-                                             2 * (sa_sint_t)prefetch_distance - 3;
+    for (i = (sa_sint_t)omp_block_start,
+        j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3;
          i < j; i += 4) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
@@ -3706,9 +3520,8 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
     return f;
 }
 
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
-                                                                  sa_sint_t m, fast_sint_t * pl,
-                                                                  fast_sint_t * pr,
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                                  fast_sint_t * pl, fast_sint_t * pr,
                                                                   fast_sint_t omp_block_start,
                                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -3717,9 +3530,8 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RE
     sa_sint_t * RESTRICT SAr = &SA[0];
 
     fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
-    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
-        j = (fast_sint_t)m + omp_block_start + 3;
-         i >= j; i -= 4) {
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
+         i -= 4) {
         libsais_prefetch(&SA[i - prefetch_distance]);
 
         sa_sint_t p0 = SA[i - 0];
@@ -3768,21 +3580,19 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
 
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start,
-                                                                       omp_block_size);
+            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
         }
     }
 
     return f;
 }
 
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                                      sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
+                                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -3792,35 +3602,30 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
 
         fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
 
         if (omp_num_threads == 1) {
             fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
-            libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start,
-                                                                  omp_block_size);
+            libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
         }
     }
 
-    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m],
-           &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
+    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f],
+           (size_t)f * sizeof(sa_sint_t));
 }
 
-static sa_sint_t libsais_compact_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t f =
-        libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
-    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads,
-                                                              thread_state);
+static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
 
     return f;
 }
 
-static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t n, sa_sint_t m, fast_sint_t l,
-                                                  fast_sint_t omp_block_start,
+static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                  sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start,
                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
@@ -3828,9 +3633,8 @@ static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sin
 
     sa_sint_t i, j;
     fast_sint_t tmp = *SAnm++;
-    for (i = (sa_sint_t)omp_block_start,
-        j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6;
-         i < j; i += 4) {
+    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j;
+         i += 4) {
         libsais_prefetch(&T[i + prefetch_distance]);
 
         sa_sint_t c0 = T[i + 0];
@@ -3874,10 +3678,8 @@ static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sin
     }
 }
 
-static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                     sa_sint_t m, fast_sint_t l,
-                                                     fast_sint_t omp_block_start,
-                                                     fast_sint_t omp_block_size) {
+static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
+                                                     fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
@@ -3913,9 +3715,9 @@ static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa
     }
 }
 
-static void libsais_merge_unique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -3925,8 +3727,7 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
             libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
@@ -3934,9 +3735,9 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
     }
 }
 
-static void libsais_merge_nonunique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
+                                                         sa_sint_t threads,
+                                                         LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
         (void)(threads);
         (void)(thread_state);
@@ -3946,8 +3747,7 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
 
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
             libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
@@ -3955,22 +3755,22 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
     }
 }
 
-static void libsais_merge_compacted_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                         sa_sint_t m, sa_sint_t f, sa_sint_t threads,
+                                                         LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
     libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
 }
 
-static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs,
+                                                                  sa_sint_t f, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads,
-                                                                   thread_state);
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
 
         memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
@@ -3983,9 +3783,10 @@ static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
     }
 }
 
-static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
-    sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
@@ -4002,55 +3803,48 @@ static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
     }
 }
 
-static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                  sa_sint_t k, sa_sint_t fs, sa_sint_t threads,
-                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+                                  sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
     if (k > 0 && fs / k >= 6) {
         sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 6
-                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment],
-                                                (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 6 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets,
-                                                                       threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
         if (m > 1) {
             memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
 
             sa_sint_t first_lms_suffix = SA[n - m];
             sa_sint_t left_suffixes_count =
-                libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets,
-                                                                              first_lms_suffix);
+                libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
 
-            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads,
-                                                       thread_state);
+            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
             libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
 
             if (threads > 1 && n >= 65536) {
                 memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
             }
 
-            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix,
-                                                                  left_suffixes_count);
-            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix,
-                                                    left_suffixes_count, threads, thread_state);
+            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
+            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
+                                                    threads, thread_state);
 
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
-                SA, n, m, threads, thread_state);
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
             if (names < m) {
-                sa_sint_t f =
-                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
-                                     threads, thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
-                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+                                                                      thread_state);
             } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
@@ -4071,36 +3865,31 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
         sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 4
-                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment],
-                                                (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 4 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
-                                                                       threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
         if (m > 1) {
-            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets,
-                                                                            SA[n - m]);
+            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
 
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
-                                                       thread_state);
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
             libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
 
             libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
             libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
 
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
-                SA, n, m, threads, thread_state);
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
             if (names < m) {
-                sa_sint_t f =
-                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
-                                     threads, thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
-                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+                                                                      thread_state);
             } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
@@ -4117,35 +3906,30 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
         sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 2
-                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment],
-                                                (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 2 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
-                                                                       threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
         if (m > 1) {
             libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
 
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
-                                                       thread_state);
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
             libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
 
             libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
             libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
 
-            sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
             if (names < m) {
-                sa_sint_t f =
-                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
-                                     threads, thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
-                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+                                                                      thread_state);
             } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
@@ -4162,14 +3946,12 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
         return 0;
     } else {
         sa_sint_t * buffer =
-            fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096)
-                   : (sa_sint_t *)NULL;
+            fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
 
         sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             fs - alignment >= k
-                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment],
-                                                (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t))
             : fs >= k ? &SA[n + fs - k]
                       : buffer;
 
@@ -4186,28 +3968,24 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
         if (m > 1) {
             libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
 
-            sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
             if (names < m) {
                 if (buffer != NULL) {
                     libsais_free_aligned(buffer);
                     buckets = NULL;
                 }
 
-                sa_sint_t f =
-                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
-                                     threads, thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads,
-                                                                      thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
 
                 if (buckets == NULL) {
-                    buckets = buffer =
-                        (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
+                    buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
                 }
                 if (buckets == NULL) {
                     return -2;
@@ -4226,14 +4004,12 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
     }
 }
 
-static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
-                                 sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r,
-                                 sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq,
-                                 sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
+                                 sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads,
+                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
-    sa_sint_t m =
-        libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
+    sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
 
     libsais_initialize_buckets_start_and_end_8u(buckets, freq);
 
@@ -4250,16 +4026,13 @@ static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
             memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
         }
 
-        libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix,
-                                                          left_suffixes_count);
-        libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix,
-                                            left_suffixes_count, threads, thread_state);
+        libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
+        libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads,
+                                            thread_state);
 
-        sa_sint_t names =
-            libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+        sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
         if (names < m) {
-            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads,
-                                 thread_state) != 0) {
+            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) {
                 return -2;
             }
 
@@ -4275,17 +4048,14 @@ static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
     return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
 }
 
-static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
-                              sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
-        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
-    sa_sint_t * RESTRICT buckets =
-        (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
+                              sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
 
-    sa_sint_t index =
-        buckets != NULL && (thread_state != NULL || threads == 1)
-            ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
-            : -2;
+    sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
+                          ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
+                          : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_thread_state(thread_state);
@@ -4293,26 +4063,22 @@ static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint
     return index;
 }
 
-static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs,
-                            sa_sint_t threads) {
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
-        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
-    sa_sint_t index = thread_state != NULL || threads == 1
-                          ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
-                          : -2;
+    sa_sint_t index =
+        thread_state != NULL || threads == 1 ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state) : -2;
 
     libsais_free_thread_state(thread_state);
 
     return index;
 }
 
-static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA,
-                                  sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
-                                  sa_sint_t fs, sa_sint_t * freq) {
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt,
+                                  sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) {
     return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
-               ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq,
-                                 (sa_sint_t)ctx->threads, ctx->thread_state)
+               ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads,
+                                 ctx->thread_state)
                : -2;
 }
 
@@ -4421,8 +4187,8 @@ s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
 }
 
 s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
-        ((r & (r - 1)) != 0) || (I == NULL)) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) ||
+        (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -4478,10 +4244,9 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32
     return index;
 }
 
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                        s32 r, s32 * I) {
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
-        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+        ((r & (r - 1)) != 0) || (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -4511,13 +4276,10 @@ static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads)
         (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
     sa_uint_t * RESTRICT bucket2 =
         (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    u16 * RESTRICT fastbits =
-        (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
+    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
     sa_uint_t * RESTRICT buckets =
         threads > 1 ? (sa_uint_t *)libsais_alloc_aligned(
-                          (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
-                              sizeof(sa_uint_t),
-                          4096)
+                          (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096)
                     : NULL;
 
     if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) {
@@ -4546,8 +4308,7 @@ static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
     }
 }
 
-static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n,
-                                            sa_uint_t * RESTRICT count) {
+static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) {
     const fast_sint_t prefetch_distance = 256;
 
     const u8 * RESTRICT T_p = T;
@@ -4566,14 +4327,12 @@ static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n
             copy0[T_p[0]]++;
         }
 
-        fast_uint_t x = ((const u32 *)(const void *)T_p)[0],
-                    y = ((const u32 *)(const void *)T_p)[1];
+        fast_uint_t x = ((const u32 *)(const void *)T_p)[0], y = ((const u32 *)(const void *)T_p)[1];
 
         for (; T_p < (u8 *)((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) {
             libsais_prefetch(&T_p[prefetch_distance]);
 
-            fast_uint_t z = ((const u32 *)(const void *)T_p)[2],
-                        w = ((const u32 *)(const void *)T_p)[3];
+            fast_uint_t z = ((const u32 *)(const void *)T_p)[2], w = ((const u32 *)(const void *)T_p)[3];
             copy0[(u8)x]++;
             x >>= 8;
             copy1[(u8)x]++;
@@ -4806,10 +4565,8 @@ static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
     }
 }
 
-static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T,
-                                                          sa_uint_t * RESTRICT bucket1,
-                                                          sa_uint_t * RESTRICT bucket2,
-                                                          fast_uint_t index) {
+static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT bucket1,
+                                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
     fast_uint_t sum, c;
     for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
         fast_uint_t prev = sum;
@@ -4839,8 +4596,8 @@ static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T,
     libsais_unbwt_transpose_bucket2(bucket2);
 }
 
-static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                             fast_uint_t lastc, fast_uint_t shift) {
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, fast_uint_t lastc,
+                                             fast_uint_t shift) {
     fast_uint_t v, w, sum, c, d;
     for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
         if (c == lastc) {
@@ -4860,10 +4617,9 @@ static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 *
     }
 }
 
-static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P,
-                                          sa_uint_t * RESTRICT bucket1,
-                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index,
-                                          fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
+static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1,
+                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start,
+                                          fast_sint_t omp_block_end) {
     {
         fast_sint_t i = omp_block_start, j = (fast_sint_t)index;
         if (omp_block_end < j) {
@@ -4875,10 +4631,7 @@ static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RES
             fast_sint_t t = (fast_sint_t)(index - p);
 
             if (t != 0) {
-                fast_uint_t w =
-                    (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
-                     << 8) +
-                    c;
+                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
                 P[bucket2[w]++] = (sa_uint_t)i;
             }
         }
@@ -4895,10 +4648,7 @@ static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RES
             fast_sint_t t = (fast_sint_t)(index - p);
 
             if (t != 0) {
-                fast_uint_t w =
-                    (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
-                     << 8) +
-                    c;
+                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
                 P[bucket2[w]++] = (sa_uint_t)i;
             }
         }
@@ -4930,9 +4680,8 @@ static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRIC
     libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
     libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
 }
-static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
+static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
 
     fast_uint_t i, p0 = *i0;
@@ -4951,9 +4700,8 @@ static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i0 = p0;
 }
 
-static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
                                    fast_uint_t * i1, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
@@ -4983,9 +4731,8 @@ static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i1 = p1;
 }
 
-static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
                                    fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
@@ -5025,11 +4772,9 @@ static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i2 = p2;
 }
 
-static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
-                                   fast_uint_t k) {
+static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -5078,11 +4823,10 @@ static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i3 = p3;
 }
 
-static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
-                                   fast_uint_t * i4, fast_uint_t k) {
+static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -5141,11 +4885,10 @@ static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i4 = p4;
 }
 
-static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
-                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) {
+static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -5214,12 +4957,10 @@ static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i5 = p5;
 }
 
-static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
-                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
-                                   fast_uint_t k) {
+static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -5298,12 +5039,10 @@ static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i6 = p6;
 }
 
-static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
-                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
-                                   fast_uint_t * i7, fast_uint_t k) {
+static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -5393,9 +5132,8 @@ static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r,
-                                 const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
-                                 u16 * RESTRICT fastbits, fast_sint_t blocks,
-                                 fast_uint_t reminder) {
+                                 const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                 fast_sint_t blocks, fast_uint_t reminder) {
     fast_uint_t shift = 0;
     while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
         shift++;
@@ -5403,10 +5141,9 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sin
     fast_uint_t offset = 0;
 
     while (blocks > 8) {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
-                    i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, &i7, (fast_uint_t)r >> 1);
         I += 8;
         blocks -= 8;
         offset += 8 * (fast_uint_t)r;
@@ -5417,59 +5154,50 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sin
         libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
     } else if (blocks == 2) {
         fast_uint_t i0 = I[0], i1 = I[1];
-        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               reminder >> 1);
+        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1);
         libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 3) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
-        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, reminder >> 1);
-        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
+        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 4) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
-        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, reminder >> 1);
-        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3,
+                               reminder >> 1);
+        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 5) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
-        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, &i4, reminder >> 1);
-        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
+                               reminder >> 1);
+        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 6) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
-        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, &i4, &i5, reminder >> 1);
-        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               reminder >> 1);
+        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 7) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
-        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, &i4, &i5, &i6, reminder >> 1);
-        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, reminder >> 1);
+        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
-                    i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1);
-        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, &i7, reminder >> 1);
+        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1));
     }
 }
 
-static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                     sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I,
-                                     sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                     sa_sint_t threads) {
+static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                     sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
+                                     u16 * RESTRICT fastbits, sa_sint_t threads) {
     fast_uint_t lastc = T[0];
     fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
     fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
@@ -5482,22 +5210,19 @@ static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_
         fast_sint_t omp_block_stride = blocks / omp_num_threads;
         fast_sint_t omp_block_reminder = blocks % omp_num_threads;
         fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
-        fast_sint_t omp_block_start =
-            omp_block_stride * omp_thread_num +
-            (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
+        fast_sint_t omp_block_start = omp_block_stride * omp_thread_num +
+                                      (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
 
-        libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
-                             fastbits, omp_block_size,
+        libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size,
                              omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
     }
 
     U[n - 1] = (u8)lastc;
 }
 
-static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                    sa_sint_t n, const sa_sint_t * freq, sa_sint_t r,
-                                    const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
-                                    u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
+static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                    const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I,
+                                    sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
                                     sa_sint_t threads) {
     (void)(buckets);
 
@@ -5507,9 +5232,8 @@ static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_u
     return 0;
 }
 
-static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n,
-                                    const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I,
-                                    sa_sint_t threads) {
+static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
+                                    sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) {
     fast_uint_t shift = 0;
     while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
         shift++;
@@ -5517,20 +5241,16 @@ static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint
 
     sa_uint_t * RESTRICT bucket2 =
         (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    u16 * RESTRICT fastbits =
-        (u16 *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
+    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
     sa_uint_t * RESTRICT buckets =
         threads > 1 && n >= 262144
             ? (sa_uint_t *)libsais_alloc_aligned(
-                  (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
-                      sizeof(sa_uint_t),
-                  4096)
+                  (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096)
             : NULL;
 
-    sa_sint_t index =
-        bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
-            ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
-            : -2;
+    sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+                          ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+                          : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_aligned(fastbits);
@@ -5539,35 +5259,29 @@ static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint
     return index;
 }
 
-static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U,
-                                        sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
-                                        sa_sint_t r, const sa_uint_t * I) {
-    return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL &&
-                   (ctx->buckets != NULL || ctx->threads == 1)
-               ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits,
-                                    ctx->buckets, (sa_sint_t)ctx->threads)
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U, sa_uint_t * P,
+                                        sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) {
+    return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1)
+               ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets,
+                                    (sa_sint_t)ctx->threads)
                : -2;
 }
 
 void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); }
 
-void libsais_unbwt_free_ctx(void * ctx) {
-    libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
-}
+void libsais_unbwt_free_ctx(void * ctx) { libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); }
 
 s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
     return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
 }
 
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                      s32 i) {
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
     return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
 }
 
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
-                      const s32 * I) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
+s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) ||
+        (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (I[0] != n) {
@@ -5589,10 +5303,10 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s3
     return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
 }
 
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                          s32 r, const s32 * I) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+                          const s32 * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) ||
+        (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (I[0] != n) {
@@ -5611,18 +5325,16 @@ s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n
         }
     }
 
-    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq,
-                                  r, (const sa_uint_t *)I);
+    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r,
+                                  (const sa_uint_t *)I);
 }
-static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
-                                sa_sint_t n, fast_sint_t omp_block_start,
-                                fast_sint_t omp_block_size) {
+static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
     sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
 
@@ -5646,8 +5358,8 @@ static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTR
     }
 }
 
-static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
-                                    sa_sint_t n, sa_sint_t threads) {
+static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                    sa_sint_t threads) {
     {
         (void)(threads);
 
@@ -5656,8 +5368,7 @@ static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * R
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
     }
@@ -5668,8 +5379,7 @@ static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLC
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j;
-         i += 1) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) {
         libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]);
 
         fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
@@ -5692,8 +5402,7 @@ static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLC
     }
 }
 
-static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
-                                     sa_sint_t threads) {
+static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) {
     {
         (void)(threads);
 
@@ -5702,21 +5411,18 @@ static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
     }
 }
 
 static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
-                                sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start,
-                                fast_sint_t omp_block_size) {
+                                sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
 
@@ -5745,8 +5451,7 @@ static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_si
 
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
     }
diff --git a/src/lzp.c b/src/lzp.c
index 6813788..0ba6dd5 100644
--- a/src/lzp.c
+++ b/src/lzp.c
@@ -16,8 +16,8 @@ static inline s32 num_blocks(s32 n) {
     return 4;
 }
 
-static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out,
-                            u8 * out_end, s32 * restrict lut, s32 mask, s32 m_len) {
+static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out, u8 * out_end,
+                            s32 * restrict lut, s32 mask, s32 m_len) {
     const u8 *ins = in, *outs = out;
     const u8 * out_eob = out_end - 8;
     const u8 * heur = in;
@@ -34,8 +34,7 @@ static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * rest
         lut[idx] = in - ins;
         if (val > 0) {
             const u8 * restrict ref = ins + val;
-            if (memcmp(in + m_len - 4, ref + m_len - 4, sizeof(u32)) == 0 &&
-                memcmp(in, ref, sizeof(u32)) == 0) {
+            if (memcmp(in + m_len - 4, ref + m_len - 4, sizeof(u32)) == 0 && memcmp(in, ref, sizeof(u32)) == 0) {
                 if (heur > in && *(u32 *)heur != *(u32 *)(ref + (heur - in))) goto not_found;
 
                 s32 len = 4;
@@ -91,8 +90,7 @@ static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * rest
     return out >= out_eob ? -1 : (s32)(out - outs);
 }
 
-static s32 lzp_decode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out, s32 hash,
-                            s32 m_len) {
+static s32 lzp_decode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out, s32 hash, s32 m_len) {
     if (in_end - in < 4) return -1;
 
     s32 * restrict lut = calloc(1 << hash, sizeof(s32));
@@ -148,8 +146,7 @@ s32 lzp_compress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
 
         if (!lut) return -1;
 
-        s32 r =
-            lzp_encode_block(in, in + n, out + 1, out + n - 1, lut, (s32)(1 << hash) - 1, m_len);
+        s32 r = lzp_encode_block(in, in + n, out + 1, out + n - 1, lut, (s32)(1 << hash) - 1, m_len);
 
         free(lut);
 
@@ -180,8 +177,8 @@ s32 lzp_compress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
             if (!lut)
                 r = -1;
             else
-                r = lzp_encode_block(in + ins, in + ins + insz, out + out_ptr,
-                                     out + out_ptr + outsz, lut, (s32)(1 << hash) - 1, m_len);
+                r = lzp_encode_block(in + ins, in + ins + insz, out + out_ptr, out + out_ptr + outsz, lut,
+                                     (s32)(1 << hash) - 1, m_len);
 
             free(lut);
         }
@@ -220,8 +217,7 @@ s32 lzp_decompress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
         s32 outsz = *(s32 *)(in + 1 + 8 * b_id + 0);
 
         if (insz != outsz) {
-            dec[b_id] =
-                lzp_decode_block(in + in_ptr, in + in_ptr + insz, out + out_ptr, hash, m_len);
+            dec[b_id] = lzp_decode_block(in + in_ptr, in + in_ptr + insz, out + out_ptr, hash, m_len);
         } else {
             dec[b_id] = insz;
             memcpy(out + out_ptr, in + in_ptr, insz);
diff --git a/src/main.c b/src/main.c
index 23a4fd6..84ec081 100644
--- a/src/main.c
+++ b/src/main.c
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <arpa/inet.h>
 
 #include "common.h"
 #include "libbz3.h"
@@ -65,26 +66,26 @@ int main(int argc, char * argv[]) {
         return 1;
     }
 
-    int input_des, output_des;
+    FILE * input_des, * output_des;
 
     if (input != NULL) {
-        input_des = open(input, O_RDONLY);
-        if (input_des == -1) {
-            perror("open");
+        input_des = fopen(input, "rb");
+        if (input_des == NULL) {
+            perror("fopen");
             return 1;
         }
     } else {
-        input_des = STDIN_FILENO;
+        input_des = stdin;
     }
 
     if (output != NULL) {
-        output_des = open(output, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-        if (output_des == -1) {
+        output_des = fopen(output, "wb");
+        if (output_des == NULL) {
             perror("open");
             return 1;
         }
     } else {
-        output_des = STDOUT_FILENO;
+        output_des = stdout;
     }
 
     if (block_size < KiB(65) || block_size > MiB(2047)) {
@@ -94,20 +95,25 @@ int main(int argc, char * argv[]) {
 
     switch (mode) {
         case 1:
-            write(output_des, "BZ3v1", 5);
-            write(output_des, &block_size, sizeof(u32));
+            fwrite("BZ3v1", 5, 1, output_des);
+
+            block_size = htonl(block_size);
+            fwrite(&block_size, sizeof(u32), 1, output_des);
+            block_size = ntohl(block_size);
             break;
         case -1:
         case -2: {
             char signature[5];
 
-            read(input_des, signature, 5);
+            fread(signature, 5, 1, input_des);
             if (strncmp(signature, "BZ3v1", 5) != 0) {
                 fprintf(stderr, "Invalid signature.\n");
                 return 1;
             }
 
-            read(input_des, &block_size, sizeof(u32));
+            fread(&block_size, sizeof(u32), 1, input_des);
+
+            block_size = ntohl(block_size);
 
             if (block_size < KiB(65) || block_size > MiB(2047)) {
                 fprintf(stderr,
@@ -120,65 +126,77 @@ int main(int argc, char * argv[]) {
         }
     }
 
-    struct block_encoder_state * block_encoder_state = new_block_encoder_state(block_size);
+    struct bz3_state * state = bz3_new(block_size);
 
-    if (block_encoder_state == NULL) {
+    if (state == NULL) {
         fprintf(stderr, "Failed to create a block encoder state.\n");
         return 1;
     }
 
+    u8 * buffer = malloc(block_size + block_size / 4);
+
     if (mode == 1) {
-        while (commit_read(block_encoder_state, read(input_des, get_buffer(block_encoder_state), block_size)) > 0) {
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
-                return 1;
-            }
-            struct encoding_result r = encode_block(block_encoder_state);
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to encode the block: %s\n",
-                        str_last_error(block_encoder_state));
+        s32 read_count;
+        while (!feof(input_des)) {
+            read_count = fread(buffer, 1, block_size, input_des);
+
+            s32 new_size = bz3_encode_block(state, buffer, read_count);
+            if (new_size == -1) {
+                fprintf(stderr, "Failed to encode a block: %s\n", bz3_strerror(state));
                 return 1;
             }
-            write(output_des, r.buffer, r.size);
+
+            read_count = htonl(read_count); new_size = ntohl(new_size);
+            fwrite(&new_size, 4, 1, output_des);
+            fwrite(&read_count, 4, 1, output_des);
+            fwrite(buffer, ntohl(new_size), 1, output_des);
         }
-    }
-    else if (mode == -1) {
-        s32 read_size;
-        while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
+    } else if (mode == -1) {
+        s32 new_size, old_size;
+        while (!feof(input_des)) {
+            if(fread(&new_size, 1, 4, input_des) != 4) {
+                // Assume that the file has no more data.
+                break;
+            }
+            if(fread(&old_size, 1, 4, input_des) != 4) {
+                fprintf(stderr, "I/O error.\n");
                 return 1;
             }
-            struct encoding_result r = decode_block(block_encoder_state);
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to decode the block: %s\n",
-                        str_last_error(block_encoder_state));
+            new_size = ntohl(new_size); old_size = ntohl(old_size);
+            fread(buffer, 1, new_size, input_des);
+            if(bz3_decode_block(state, buffer, new_size, old_size) == -1) {
+                fprintf(stderr, "Failed to decode a block: %s\n", bz3_strerror(state));
                 return 1;
             }
-            write(output_des, r.buffer, r.size);
+            fwrite(buffer, old_size, 1, output_des);
         }
     } else if (mode == -2) {
-        s32 read_size;
-        while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
-                return 1;
+        s32 new_size, old_size;
+        while (!feof(input_des)) {
+            if(fread(&new_size, 4, 1, input_des) != 4) {
+                fprintf(stderr, "I/O error.\n");
+            }
+            if(fread(&old_size, 4, 1, input_des) != 4) {
+                fprintf(stderr, "I/O error.\n");
             }
-            decode_block(block_encoder_state);
-            if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to decode data: %s\n", str_last_error(block_encoder_state));
+            new_size = ntohl(new_size); old_size = ntohl(old_size);
+            fread(buffer, 1, new_size, input_des);
+            if(bz3_decode_block(state, buffer, new_size, old_size) == -1) {
+                fprintf(stderr, "Failed to decode a block: %s\n", bz3_strerror(state));
                 return 1;
             }
         }
     }
 
-    if (get_last_error(block_encoder_state) != BZ3_OK) {
-        fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
+    if (bz3_last_error(state) != BZ3_OK) {
+        fprintf(stderr, "Failed to read data: %s\n", bz3_strerror(state));
         return 1;
     }
 
-    delete_block_encoder_state(block_encoder_state);
+    free(buffer);
+
+    bz3_free(state);
 
-    close(input_des);
-    close(output_des);
+    fclose(input_des);
+    fclose(output_des);
 }
diff --git a/src/srt.c b/src/srt.c
index d292abe..c5c1e38 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -32,8 +32,7 @@ static s32 preprocess(const u32 * freqs, u8 * symbols) {
         for (u32 i = h; i < nb_symbols; i++) {
             const s32 t = symbols[i];
             s32 b = i - h;
-            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] ||
-                                (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
+            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
                 symbols[b + h] = symbols[b];
                 b -= h;
             }
tab: 248 wrap: offon