:: commit ef181ceee2b3795b84c8d5b04ce910468fc850c5

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-03 08:52

parents: 9e1b132db4

Add LZP preprocessing for text data; reformat sources to a 100-column limit

diff --git a/.clang-format b/.clang-format
index ef36a0a..9b1bf1e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -7,5 +7,6 @@ PointerAlignment: Middle
 TabWidth: '4'
 UseTab: Never
 Cpp11BracedListStyle: false
+ColumnLimit: 100
 
 ...
diff --git a/Makefile b/Makefile
index e4e8a8b..397feff 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
 .PHONY: all clean format install
 
 OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
-        obj/cm.o obj/libbz3.o obj/txt.o
+        obj/cm.o obj/libbz3.o obj/txt.o obj/lzp.o
 
 all: bzip3
 
diff --git a/include/libsais.h b/include/libsais.h
index 6286a5d..b0cae1e 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -77,8 +77,7 @@ s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs);
  * @param freq [0..255] The output symbol frequency table (can be NULL).
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
-                s32 * freq);
+s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq);
 
 /**
  * Constructs the burrows-wheeler transformed string (BWT) of a given string.
@@ -107,8 +106,7 @@ s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
  * @param I [0..(n-1)/r] The output auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                    s32 r, s32 * I);
+s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I);
 
 /**
  * Constructs the burrows-wheeler transformed string (BWT) of a given string
@@ -123,8 +121,7 @@ s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
  * @param freq [0..255] The output symbol frequency table (can be NULL).
  * @return The primary index if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                    s32 fs, s32 * freq);
+s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
 
 /**
  * Constructs the burrows-wheeler transformed string (BWT) of a given string
@@ -141,8 +138,8 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
  * @param I [0..(n-1)/r] The output auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                        s32 fs, s32 * freq, s32 r, s32 * I);
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
+                        s32 r, s32 * I);
 
 /**
  * Creates the libsais reverse BWT context that allows reusing allocated memory
@@ -170,8 +167,7 @@ void libsais_unbwt_free_ctx(void * ctx);
  * @param i The primary index.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                  s32 i);
+s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i);
 
 /**
  * Constructs the original string from a given burrows-wheeler transformed
@@ -186,8 +182,8 @@ s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
  * @param i The primary index.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                      const s32 * freq, s32 i);
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+                      s32 i);
 
 /**
  * Constructs the original string from a given burrows-wheeler transformed
@@ -202,8 +198,7 @@ s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
  * @param I [0..(n-1)/r] The input auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                      s32 r, const s32 * I);
+s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I);
 
 /**
  * Constructs the original string from a given burrows-wheeler transformed
@@ -219,8 +214,8 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
  * @param I [0..(n-1)/r] The input auxiliary indexes.
  * @return 0 if no error occurred, -1 or -2 otherwise.
  */
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
-                          s32 n, const s32 * freq, s32 r, const s32 * I);
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+                          s32 r, const s32 * I);
 
 /**
  * Constructs the permuted longest common prefix array (PLCP) of a given string
diff --git a/include/lzp.h b/include/lzp.h
new file mode 100644
index 0000000..d07c956
--- /dev/null
+++ b/include/lzp.h
@@ -0,0 +1,11 @@
+
+#ifndef _LZP_H
+#define _LZP_H
+
+#include "common.h"
+
+s32 lzp_compress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min);
+
+s32 lzp_decompress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min);
+
+#endif
diff --git a/src/cm.c b/src/cm.c
index a939436..05ecea7 100644
--- a/src/cm.c
+++ b/src/cm.c
@@ -17,7 +17,7 @@ static void encodebit0(state * s, u32 p) {
 
     // Write identical bits.
     while ((s->low ^ s->high) < (1 << 24)) {
-        write_out(s, s->low >> 24); // Same as s->high >> 24
+        write_out(s, s->low >> 24);  // Same as s->high >> 24
         s->low <<= 8;
         s->high = (s->high << 8) | 0xFF;
     }
diff --git a/src/crc32.c b/src/crc32.c
index 2b88e98..8bed740 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -20,58 +20,43 @@
 #include "crc32.h"
 
 static const u32 crc32Table[256] = {
-    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL,
-    0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL,
-    0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L,
-    0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L,
-    0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL,
-    0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L,
-    0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL,
-    0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L,
-    0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L,
-    0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, 0x417B1DBCL,
-    0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L,
-    0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L,
-    0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL,
-    0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL,
-    0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L,
-    0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL,
-    0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L,
-    0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL,
-    0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL,
-    0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL,
-    0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L,
-    0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L,
-    0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL,
-    0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL,
-    0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L,
-    0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL,
-    0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L,
-    0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L,
-    0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L, 0x0D3D3E1AL,
-    0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L,
-    0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L,
-    0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L,
-    0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L,
-    0x07198540L, 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL,
-    0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L,
-    0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L,
-    0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL,
-    0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L,
-    0x988C474DL, 0x6AE7C44EL, 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L,
-    0xAD7D5351L
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L,
+    0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L,
+    0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, 0xD7C45070L,
+    0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L,
+    0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L,
+    0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L,
+    0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
+    0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L,
+    0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL,
+    0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL,
+    0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L,
+    0x80FDE395L, 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL,
+    0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L,
+    0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
+    0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L,
+    0xB7072F64L, 0xA457DC90L, 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L,
+    0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL,
+    0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L,
+    0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L,
+    0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, 0x758FE5D6L, 0x87E466D5L,
+    0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
+    0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL,
+    0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L,
+    0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, 0x590AB964L,
+    0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L,
+    0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L,
+    0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL,
+    0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
 };
 
 u32 crc32sum(u32 crc, u8 * buf, size_t size) {
diff --git a/src/libbz3.c b/src/libbz3.c
index f1aed9a..f1aacb7 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -10,10 +10,14 @@
 #include "common.h"
 #include "crc32.h"
 #include "libsais.h"
+#include "lzp.h"
 #include "mtf.h"
-#include "txt.h"
 #include "rle.h"
 #include "srt.h"
+#include "txt.h"
+
+#define LZP_DICTIONARY 16
+#define LZP_MIN_MATCH 40
 
 struct block_encoder_state {
     u8 *buf1, *buf2;
@@ -25,9 +29,7 @@ struct block_encoder_state {
     s8 last_error;
 };
 
-s8 get_last_error(struct block_encoder_state * state) {
-    return state->last_error;
-}
+s8 get_last_error(struct block_encoder_state * state) { return state->last_error; }
 
 const char * str_last_error(struct block_encoder_state * state) {
     switch (state->last_error) {
@@ -60,8 +62,7 @@ s32 commit_read(struct block_encoder_state * state, s32 bytes_read) {
 }
 
 struct block_encoder_state * new_block_encoder_state(s32 block_size) {
-    struct block_encoder_state * block_encoder_state =
-        malloc(sizeof(struct block_encoder_state));
+    struct block_encoder_state * block_encoder_state = malloc(sizeof(struct block_encoder_state));
 
     if (!block_encoder_state) {
         return NULL;
@@ -97,18 +98,19 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
 
     int txt = is_text(state->buf1, state->bytes_read);
 
-    if(txt) {
-        s32 bwt_index = libsais_bwt(state->buf1, state->buf1, state->sais_array,
-                                    state->bytes_read, 16, NULL);
-        if(bwt_index < 0) {
+    if (txt) {
+        s32 lzp_size = lzp_compress(state->buf1, state->buf2, state->bytes_read, LZP_DICTIONARY,
+                                    LZP_MIN_MATCH);
+        s32 bwt_index =
+            libsais_bwt(state->buf2, state->buf1, state->sais_array, lzp_size, 16, NULL);
+        if (bwt_index < 0) {
             state->last_error = BZ3_ERR_BWT;
-            return (struct encoding_result) { NULL, -1 };
+            return (struct encoding_result){ NULL, -1 };
         }
         begin(state->cm_state);
         state->cm_state->out_queue = state->buf2 + 24;
         state->cm_state->output_ptr = 0;
-        for(s32 i = 0; i < state->bytes_read; i++)
-            encode_byte(state->cm_state, state->buf1[i]);
+        for (s32 i = 0; i < lzp_size; i++) encode_byte(state->cm_state, state->buf1[i]);
         flush(state->cm_state);
         s32 new_size = state->cm_state->output_ptr;
 
@@ -116,15 +118,15 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
         ((uint32_t *)state->buf2)[1] = htonl(state->bytes_read);
         ((uint32_t *)state->buf2)[2] = htonl(bwt_index);
         ((uint32_t *)state->buf2)[3] = 0xFFFFFFFF;
-        ((uint32_t *)state->buf2)[4] = 0xFFFFFFFF;
+        ((uint32_t *)state->buf2)[4] = htonl(lzp_size);
         ((uint32_t *)state->buf2)[5] = htonl(new_size);
-        
+
         state->last_error = BZ3_OK;
-        return (struct encoding_result) { state->buf2, 24 + new_size };
+        return (struct encoding_result){ state->buf2, 24 + new_size };
     } else {
         s32 new_size = mrlec(state->buf1, state->bytes_read, state->buf2);
-        s32 bwt_index = libsais_bwt(state->buf2, state->buf2, state->sais_array,
-                                    new_size, 16, NULL);
+        s32 bwt_index =
+            libsais_bwt(state->buf2, state->buf2, state->sais_array, new_size, 16, NULL);
         if (bwt_index < 0) {
             state->last_error = BZ3_ERR_BWT;
             return (struct encoding_result){ NULL, -1 };
@@ -132,8 +134,7 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
         s32 new_size2;
 
         if (new_size > MiB(3)) {
-            new_size2 =
-                srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
+            new_size2 = srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
         } else {
             new_size2 = -1;
             mtf_encode(state->mtf_state, state->buf2, state->buf1, new_size);
@@ -143,11 +144,9 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
         state->cm_state->out_queue = state->buf2 + 24;
         state->cm_state->output_ptr = 0;
         if (new_size2 != -1)
-            for (s32 i = 0; i < new_size2; i++)
-                encode_byte(state->cm_state, state->buf1[i]);
+            for (s32 i = 0; i < new_size2; i++) encode_byte(state->cm_state, state->buf1[i]);
         else
-            for (s32 i = 0; i < new_size; i++)
-                encode_byte(state->cm_state, state->buf1[i]);
+            for (s32 i = 0; i < new_size; i++) encode_byte(state->cm_state, state->buf1[i]);
         flush(state->cm_state);
         s32 new_size3 = state->cm_state->output_ptr;
 
@@ -158,8 +157,7 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
         ((uint32_t *)state->buf2)[4] = htonl(new_size2);
         ((uint32_t *)state->buf2)[5] = htonl(new_size3);
         state->last_error = BZ3_OK;
-        return (struct encoding_result){ .buffer = state->buf2,
-                                        .size = 24 + new_size3 };
+        return (struct encoding_result){ .buffer = state->buf2, .size = 24 + new_size3 };
     }
 }
 
@@ -174,23 +172,21 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
     new_size2 = ntohl(((uint32_t *)state->buf1)[4]);
     new_size3 = ntohl(((uint32_t *)state->buf1)[5]);
 
-    if(new_size2 != 0xFFFFFFFF || new_size != 0xFFFFFFFF) {
+    if (new_size != 0xFFFFFFFF) {
         begin(state->cm_state);
         state->cm_state->in_queue = state->buf1 + 24;
         state->cm_state->input_ptr = 0;
         state->cm_state->input_max = new_size3;
         init(state->cm_state);
         if (new_size2 != -1) {
-            for (s32 i = 0; i < new_size2; i++)
-                state->buf2[i] = decode_byte(state->cm_state);
+            for (s32 i = 0; i < new_size2; i++) state->buf2[i] = decode_byte(state->cm_state);
             srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
         } else {
-            for (s32 i = 0; i < new_size; i++)
-                state->buf2[i] = decode_byte(state->cm_state);
+            for (s32 i = 0; i < new_size; i++) state->buf2[i] = decode_byte(state->cm_state);
             mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
         }
-        if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size,
-                        NULL, bwt_index) < 0) {
+        if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size, NULL, bwt_index) <
+            0) {
             state->last_error = BZ3_ERR_BWT;
             return (struct encoding_result){ NULL, -1 };
         }
@@ -200,28 +196,26 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
             return (struct encoding_result){ .buffer = NULL, .size = -1 };
         }
         state->last_error = BZ3_OK;
-        return (struct encoding_result){ .buffer = state->buf1,
-                                        .size = state->bytes_read };
+        return (struct encoding_result){ .buffer = state->buf1, .size = state->bytes_read };
     } else {
         begin(state->cm_state);
         state->cm_state->in_queue = state->buf1 + 24;
         state->cm_state->input_ptr = 0;
         state->cm_state->input_max = new_size3;
         init(state->cm_state);
-        for (s32 i = 0; i < state->bytes_read; i++)
-            state->buf2[i] = decode_byte(state->cm_state);
-        if (libsais_unbwt(state->buf2, state->buf1, state->sais_array, state->bytes_read,
-                        NULL, bwt_index) < 0) {
+        for (s32 i = 0; i < new_size2; i++) state->buf2[i] = decode_byte(state->cm_state);
+        if (libsais_unbwt(state->buf2, state->buf1, state->sais_array, new_size2, NULL, bwt_index) <
+            0) {
             state->last_error = BZ3_ERR_BWT;
             return (struct encoding_result){ NULL, -1 };
         }
-        if (crc32sum(1, state->buf1, state->bytes_read) != crc32) {
+        lzp_decompress(state->buf1, state->buf2, new_size2, LZP_DICTIONARY, LZP_MIN_MATCH);
+        if (crc32sum(1, state->buf2, state->bytes_read) != crc32) {
             state->last_error = BZ3_ERR_CRC;
             return (struct encoding_result){ .buffer = NULL, .size = -1 };
         }
         state->last_error = BZ3_OK;
-        return (struct encoding_result){ .buffer = state->buf1,
-                                         .size = state->bytes_read };
+        return (struct encoding_result){ .buffer = state->buf2, .size = state->bytes_read };
     }
 }
 
diff --git a/src/libsais.c b/src/libsais.c
index d2f6733..fd5f71a 100644
--- a/src/libsais.c
+++ b/src/libsais.c
@@ -100,8 +100,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
     #if __has_builtin(__builtin_prefetch)
         #define HAS_BUILTIN_PREFECTCH
     #endif
-#elif defined(__GNUC__) && \
-    (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
+#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
     #define HAS_BUILTIN_PREFECTCH
 #endif
 
@@ -109,20 +108,16 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
     #if __has_builtin(__builtin_bswap16)
         #define HAS_BUILTIN_BSWAP16
     #endif
-#elif defined(__GNUC__) && \
-    (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
+#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
     #define HAS_BUILTIN_BSWAP16
 #endif
 
 #if defined(HAS_BUILTIN_PREFECTCH)
-    #define libsais_prefetch(address) \
-        __builtin_prefetch((const void *)(address), 0, 0)
-    #define libsais_prefetchw(address) \
-        __builtin_prefetch((const void *)(address), 1, 0)
+    #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+    #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
 #elif defined(_M_IX86) || defined(_M_AMD64)
     #include <intrin.h>
-    #define libsais_prefetch(address) \
-        _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+    #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
     #define libsais_prefetchw(address) _m_prefetchw((const void *)(address))
 #elif defined(_M_ARM)
     #include <intrin.h>
@@ -137,24 +132,18 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
 #endif
 
 #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
-    #if defined(_LITTLE_ENDIAN) ||                                      \
-        (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) &&               \
-         BYTE_ORDER == LITTLE_ENDIAN) ||                                \
-        (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) &&             \
-         _BYTE_ORDER == _LITTLE_ENDIAN) ||                              \
-        (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) &&           \
-         __BYTE_ORDER == __LITTLE_ENDIAN) ||                            \
-        (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+    #if defined(_LITTLE_ENDIAN) ||                                                                \
+        (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) ||         \
+        (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) ||     \
+        (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) &&                           \
          __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
         #define __LITTLE_ENDIAN__
-    #elif defined(_BIG_ENDIAN) ||                                    \
-        (defined(BYTE_ORDER) && defined(BIG_ENDIAN) &&               \
-         BYTE_ORDER == BIG_ENDIAN) ||                                \
-        (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) &&             \
-         _BYTE_ORDER == _BIG_ENDIAN) ||                              \
-        (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) &&           \
-         __BYTE_ORDER == __BIG_ENDIAN) ||                            \
-        (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+    #elif defined(_BIG_ENDIAN) ||                                                           \
+        (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) ||         \
+        (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) ||     \
+        (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) &&                        \
          __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
         #define __BIG_ENDIAN__
     #elif defined(_WIN32)
@@ -184,11 +173,9 @@ static void * libsais_align_up(const void * address, size_t alignment) {
 static void * libsais_alloc_aligned(size_t size, size_t alignment) {
     void * address = malloc(size + sizeof(short) + alignment - 1);
     if (address != NULL) {
-        void * aligned_address = libsais_align_up(
-            (void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))),
-            alignment);
-        ((short *)aligned_address)[-1] =
-            (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
+        void * aligned_address =
+            libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
+        ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
 
         return aligned_address;
     }
@@ -198,25 +185,19 @@ static void * libsais_alloc_aligned(size_t size, size_t alignment) {
 
 static void libsais_free_aligned(void * aligned_address) {
     if (aligned_address != NULL) {
-        free((void *)((ptrdiff_t)aligned_address -
-                      ((short *)aligned_address)[-1]));
+        free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
     }
 }
 
 static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) {
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
-        (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
-            (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
+        (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
     sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned(
         (size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
-    LIBSAIS_THREAD_CACHE * RESTRICT thread_cache =
-        (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
-            (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE *
-                sizeof(LIBSAIS_THREAD_CACHE),
-            4096);
-
-    if (thread_state != NULL && thread_buckets != NULL &&
-        thread_cache != NULL) {
+    LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
+        (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+    if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) {
         fast_sint_t t;
         for (t = 0; t < threads; ++t) {
             thread_state[t].state.buckets = thread_buckets;
@@ -245,13 +226,12 @@ static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) {
 static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) {
     LIBSAIS_CONTEXT * RESTRICT ctx =
         (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
-    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
-        8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    sa_sint_t * RESTRICT buckets =
+        (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
     LIBSAIS_THREAD_STATE * RESTRICT thread_state =
         threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
-    if (ctx != NULL && buckets != NULL &&
-        (thread_state != NULL || threads == 1)) {
+    if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) {
         ctx->buckets = buckets;
         ctx->threads = threads;
         ctx->thread_state = thread_state;
@@ -275,9 +255,9 @@ static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
 
 #if defined(_OPENMP)
 
-static sa_sint_t libsais_count_negative_marked_suffixes(
-    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     sa_sint_t count = 0;
 
     fast_sint_t i;
@@ -288,9 +268,9 @@ static sa_sint_t libsais_count_negative_marked_suffixes(
     return count;
 }
 
-static sa_sint_t libsais_count_zero_marked_suffixes(
-    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA,
+                                                    fast_sint_t omp_block_start,
+                                                    fast_sint_t omp_block_size) {
     sa_sint_t count = 0;
 
     fast_sint_t i;
@@ -303,14 +283,12 @@ static sa_sint_t libsais_count_zero_marked_suffixes(
 
 static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
                                           LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                          fast_sint_t omp_block_start,
-                                          fast_sint_t omp_block_size) {
+                                          fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&cache[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
@@ -329,15 +307,15 @@ static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_compact_and_place_cached_suffixes(
-    sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA,
+                                                      LIBSAIS_THREAD_CACHE * RESTRICT cache,
+                                                      fast_sint_t omp_block_start,
+                                                      fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
-        l = omp_block_start;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+         i += 4) {
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
         cache[l] = cache[i + 0];
@@ -355,12 +333,10 @@ static void libsais_compact_and_place_cached_suffixes(
         l += cache[l].symbol >= 0;
     }
 
-    libsais_place_cached_suffixes(SA, cache, omp_block_start,
-                                  l - omp_block_start);
+    libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
 }
 
-static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     fast_sint_t s;
@@ -369,8 +345,7 @@ static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00,
     }
 }
 
-static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -380,8 +355,7 @@ static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00,
     }
 }
 
-static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -392,8 +366,7 @@ static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00,
     }
 }
 
-static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -401,13 +374,11 @@ static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00,
     sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
     fast_sint_t s;
     for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] =
-            bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
     }
 }
 
-static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -416,13 +387,12 @@ static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00,
     sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
     fast_sint_t s;
     for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
-                      bucket04[s] + bucket05[s];
+        bucket00[s] =
+            bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s];
     }
 }
 
-static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -432,13 +402,12 @@ static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00,
     sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
     fast_sint_t s;
     for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
-                      bucket04[s] + bucket05[s] + bucket06[s];
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+                      bucket05[s] + bucket06[s];
     }
 }
 
-static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -449,13 +418,12 @@ static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00,
     sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
     fast_sint_t s;
     for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
-                      bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s];
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+                      bucket05[s] + bucket06[s] + bucket07[s];
     }
 }
 
-static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00,
-                                            fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
                                             fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -467,20 +435,16 @@ static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00,
     sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
     fast_sint_t s;
     for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
-                      bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] +
-                      bucket08[s];
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+                      bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s];
     }
 }
 
-static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets,
-                                          fast_sint_t bucket_size,
-                                          fast_sint_t bucket_stride,
-                                          fast_sint_t num_buckets) {
+static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size,
+                                          fast_sint_t bucket_stride, fast_sint_t num_buckets) {
     while (num_buckets >= 9) {
-        libsais_accumulate_counts_s32_9(
-            buckets - (num_buckets - 9) * bucket_stride, bucket_size,
-            bucket_stride);
+        libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size,
+                                        bucket_stride);
         num_buckets -= 8;
     }
 
@@ -488,42 +452,33 @@ static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets,
         case 1:
             break;
         case 2:
-            libsais_accumulate_counts_s32_2(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride);
             break;
         case 3:
-            libsais_accumulate_counts_s32_3(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride);
             break;
         case 4:
-            libsais_accumulate_counts_s32_4(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride);
             break;
         case 5:
-            libsais_accumulate_counts_s32_5(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride);
             break;
         case 6:
-            libsais_accumulate_counts_s32_6(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride);
             break;
         case 7:
-            libsais_accumulate_counts_s32_7(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride);
             break;
         case 8:
-            libsais_accumulate_counts_s32_8(buckets, bucket_size,
-                                            bucket_stride);
+            libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride);
             break;
     }
 }
 
 #endif
 
-static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
-                                           sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                           fast_sint_t m,
-                                           fast_sint_t omp_block_start,
+static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                           sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start,
                                            fast_sint_t omp_block_size) {
     if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 128;
@@ -537,8 +492,7 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3;
-             i >= j; i -= 4) {
+        for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - prefetch_distance]);
 
             c1 = T[i - 0];
@@ -571,9 +525,9 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
     }
 }
 
-static void libsais_gather_lms_suffixes_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                               sa_sint_t n, sa_sint_t threads,
+                                               LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
                                                   omp_get_dynamic() == 0)
@@ -591,13 +545,12 @@ static void libsais_gather_lms_suffixes_8u_omp(
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1,
-                                           omp_block_start, omp_block_size);
+            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start,
+                                           omp_block_size);
         }
 #if defined(_OPENMP)
         else {
@@ -606,15 +559,14 @@ static void libsais_gather_lms_suffixes_8u_omp(
                 m += thread_state[t].state.m;
             }
 
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m,
-                                           omp_block_start, omp_block_size);
+            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start,
+                                           omp_block_size);
 
     #pragma omp barrier
 
             if (thread_state[omp_thread_num].state.m > 0) {
                 SA[(fast_sint_t)n - 1 - m] =
-                    (sa_sint_t)thread_state[omp_thread_num]
-                        .state.last_lms_suffix;
+                    (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
             }
         }
 #endif
@@ -622,8 +574,7 @@ static void libsais_gather_lms_suffixes_8u_omp(
 }
 
 static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
-                                                 sa_sint_t * RESTRICT SA,
-                                                 sa_sint_t n) {
+                                                 sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -664,8 +615,8 @@ static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
     return n - 1 - m;
 }
 
-static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
+                                                           sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -708,9 +659,8 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(
 
 #if defined(_OPENMP)
 
-static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets) {
+static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
@@ -723,14 +673,10 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
     for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
 
         c1 = T[i - 0];
         s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -761,9 +707,8 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
 
 #endif
 
-static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
-                                              sa_sint_t n, sa_sint_t k,
-                                              sa_sint_t * RESTRICT buckets) {
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
@@ -776,14 +721,10 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
     for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
-        libsais_prefetchw(
-            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
 
         c1 = T[i - 0];
         s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -814,9 +755,8 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
 
 #if defined(_OPENMP)
 
-static void libsais_count_compacted_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets) {
+static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
@@ -829,14 +769,10 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(
     for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-            T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-            T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-            T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-            T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
 
         c1 = T[i - 0];
         s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -873,10 +809,11 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(
 
 #endif
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T,
+                                                          sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                          sa_sint_t * RESTRICT buckets,
+                                                          fast_sint_t omp_block_start,
+                                                          fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -940,9 +877,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
@@ -962,23 +898,20 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_8u(
-                T, SA, n, buckets, omp_block_start, omp_block_size);
+            m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start,
+                                                         omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
-                thread_state[omp_thread_num].state.position =
-                    omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.m =
-                    libsais_count_and_gather_lms_suffixes_8u(
-                        T, SA, n, thread_state[omp_thread_num].state.buckets,
-                        omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(
+                    T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start,
+                    omp_block_size);
 
                 if (thread_state[omp_thread_num].state.m > 0) {
                     thread_state[omp_thread_num].state.last_lms_suffix =
@@ -996,18 +929,14 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
                 for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.m;
 
-                    if (t != omp_num_threads - 1 &&
-                        thread_state[t].state.m > 0) {
+                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) {
                         memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position -
-                                   thread_state[t].state.m],
-                               (size_t)thread_state[t].state.m *
-                                   sizeof(sa_sint_t));
+                               &SA[thread_state[t].state.position - thread_state[t].state.m],
+                               (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
                     }
 
                     {
-                        sa_sint_t * RESTRICT temp_bucket =
-                            thread_state[t].state.buckets;
+                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                         fast_sint_t s;
                         for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) {
                             sa_sint_t A = buckets[s], B = temp_bucket[s];
@@ -1025,9 +954,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1043,18 +971,13 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
-             i -= 4) {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
 
             c1 = T[i - 0];
             s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1101,9 +1024,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1119,18 +1041,13 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
-             i -= 4) {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
-            libsais_prefetchw(
-                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
 
             c1 = T[i - 0];
             s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1177,9 +1094,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
 }
 
 static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1195,18 +1111,17 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
-             i -= 4) {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-                T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-                T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-                T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
-                T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
 
             c1 = T[i - 0];
             s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1260,8 +1175,7 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
 
 #if defined(_OPENMP)
 
-static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space,
-                                             fast_sint_t bucket_size,
+static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size,
                                              fast_sint_t num_buckets) {
     fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024);
     if (free_space / (num_buckets - 1) >= bucket_size_1024) {
@@ -1276,9 +1190,8 @@ static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space,
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
     #if defined(_OPENMP)
@@ -1297,27 +1210,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
     #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(
-                T, SA, n, k, buckets, omp_block_start, omp_block_size);
+            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start,
+                                                             omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             fast_sint_t bucket_size = 4 * (fast_sint_t)k;
-            fast_sint_t bucket_stride = libsais_get_bucket_stride(
-                buckets - &SA[n], bucket_size, omp_num_threads);
+            fast_sint_t bucket_stride =
+                libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position =
-                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                 thread_state[omp_thread_num].state.count =
                     libsais_count_and_gather_lms_suffixes_32s_4k(
-                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
-                        omp_block_start, omp_block_size);
+                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
+                        omp_block_size);
             }
 
         #pragma omp barrier
@@ -1327,13 +1238,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
                 for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.count;
 
-                    if (t != omp_num_threads - 1 &&
-                        thread_state[t].state.count > 0) {
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
                         memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position -
-                                   thread_state[t].state.count],
-                               (size_t)thread_state[t].state.count *
-                                   sizeof(sa_sint_t));
+                               &SA[thread_state[t].state.position - thread_state[t].state.count],
+                               (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                     }
                 }
             } else {
@@ -1344,9 +1252,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
                                      ? omp_block_stride
                                      : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start,
-                                              omp_block_size, bucket_stride,
-                                              omp_num_threads + 1);
+                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+                                              bucket_stride, omp_num_threads + 1);
             }
         }
     #endif
@@ -1356,9 +1263,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
     #if defined(_OPENMP)
@@ -1377,27 +1283,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
     #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(
-                T, SA, n, k, buckets, omp_block_start, omp_block_size);
+            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start,
+                                                             omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             fast_sint_t bucket_size = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride = libsais_get_bucket_stride(
-                buckets - &SA[n], bucket_size, omp_num_threads);
+            fast_sint_t bucket_stride =
+                libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position =
-                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                 thread_state[omp_thread_num].state.count =
                     libsais_count_and_gather_lms_suffixes_32s_2k(
-                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
-                        omp_block_start, omp_block_size);
+                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
+                        omp_block_size);
             }
 
         #pragma omp barrier
@@ -1407,13 +1311,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
                 for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.count;
 
-                    if (t != omp_num_threads - 1 &&
-                        thread_state[t].state.count > 0) {
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
                         memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position -
-                                   thread_state[t].state.count],
-                               (size_t)thread_state[t].state.count *
-                                   sizeof(sa_sint_t));
+                               &SA[thread_state[t].state.position - thread_state[t].state.count],
+                               (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                     }
                 }
             } else {
@@ -1424,9 +1325,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
                                      ? omp_block_stride
                                      : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start,
-                                              omp_block_size, bucket_stride,
-                                              omp_num_threads + 1);
+                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+                                              bucket_stride, omp_num_threads + 1);
             }
         }
     #endif
@@ -1436,9 +1336,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
 }
 
 static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
         #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
     #endif
@@ -1455,27 +1354,24 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
     #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-                T, SA, n, k, buckets, omp_block_start, omp_block_size);
+            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets,
+                                                                   omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             fast_sint_t bucket_size = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride = libsais_get_bucket_stride(
-                buckets - &SA[n + n], bucket_size, omp_num_threads);
+            fast_sint_t bucket_stride =
+                libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position =
-                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                 thread_state[omp_thread_num].state.count =
                     libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-                        T, SA + n, n, k,
-                        buckets - (omp_thread_num * bucket_stride),
+                        T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride),
                         omp_block_start, omp_block_size);
             }
 
@@ -1491,8 +1387,7 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
                     memcpy(&SA[n - m],
                            &SA[n + thread_state[omp_thread_num].state.position -
                                thread_state[omp_thread_num].state.count],
-                           (size_t)thread_state[omp_thread_num].state.count *
-                               sizeof(sa_sint_t));
+                           (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
                 }
             }
 
@@ -1503,9 +1398,8 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
                                      ? omp_block_stride
                                      : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start,
-                                              omp_block_size, bucket_stride,
-                                              omp_num_threads);
+                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+                                              bucket_stride, omp_num_threads);
             }
         }
     #endif
@@ -1514,9 +1408,11 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
 
 #endif
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA,
+                                                                       sa_sint_t n, sa_sint_t k,
+                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
@@ -1532,8 +1428,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
         fast_sint_t omp_num_threads = 1;
 #endif
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k,
-                                                             buckets, 0, n);
+            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
         }
 #if defined(_OPENMP)
         else if (omp_thread_num == 0) {
@@ -1547,9 +1442,11 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA,
+                                                                       sa_sint_t n, sa_sint_t k,
+                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
@@ -1565,8 +1462,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
         fast_sint_t omp_num_threads = 1;
 #endif
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k,
-                                                             buckets, 0, n);
+            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
         }
 #if defined(_OPENMP)
         else if (omp_thread_num == 0) {
@@ -1580,10 +1476,9 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
     return m;
 }
 
-static sa_sint_t
-libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
@@ -1599,8 +1494,7 @@ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
         fast_sint_t omp_num_threads = 1;
 #endif
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-                T, SA, n, k, buckets, 0, n);
+            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
         }
 #if defined(_OPENMP)
         else if (omp_thread_num == 0) {
@@ -1615,14 +1509,12 @@ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
 
 #if defined(_OPENMP)
-    sa_sint_t max_threads =
-        (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
     if (max_threads > threads) {
         max_threads = threads;
     }
@@ -1631,29 +1523,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
             max_threads = n / 16 / k;
         }
         m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
-            thread_state);
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
     } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
-            T, SA, n, k, buckets, threads);
+        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
     }
 
     return m;
 }
 
 static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
 
 #if defined(_OPENMP)
-    sa_sint_t max_threads =
-        (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
     if (max_threads > threads) {
         max_threads = threads;
     }
@@ -1662,27 +1550,24 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
             max_threads = n / 8 / k;
         }
         m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
-            thread_state);
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
     } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
-            T, SA, n, k, buckets, threads);
+        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
     }
 
     return m;
 }
 
 static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) /
-                                        ((2 * (fast_sint_t)k + 15) & (-16)));
+    sa_sint_t max_threads =
+        (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
     if (max_threads > threads) {
         max_threads = threads;
     }
@@ -1691,20 +1576,18 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
             max_threads = n / 8 / k;
         }
         libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
-            thread_state);
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
     } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
-            T, SA, n, k, buckets, threads);
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets,
+                                                                        threads);
     }
 }
 
-static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T,
-                                       sa_sint_t n, sa_sint_t k,
+static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
                                        sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
@@ -1729,93 +1612,80 @@ static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T,
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_8u(
-    sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
+static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets,
+                                                        sa_sint_t * RESTRICT freq) {
     sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
     if (freq != NULL) {
         fast_sint_t i, j;
         sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0;
-             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += 1) {
             bucket_start[j] = sum;
-            sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] +
-                              buckets[i + BUCKETS_INDEX4(0, 1)] +
-                              buckets[i + BUCKETS_INDEX4(0, 2)] +
-                              buckets[i + BUCKETS_INDEX4(0, 3)]);
+            sum +=
+                (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+                           buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
             bucket_end[j] = sum;
         }
     } else {
         fast_sint_t i, j;
         sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0;
-             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += 1) {
             bucket_start[j] = sum;
-            sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
-                   buckets[i + BUCKETS_INDEX4(0, 1)] +
-                   buckets[i + BUCKETS_INDEX4(0, 2)] +
-                   buckets[i + BUCKETS_INDEX4(0, 3)];
+            sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+                   buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
             bucket_end[j] = sum;
         }
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_6k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k,
+                                                            sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
     sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
     fast_sint_t i, j;
     sa_sint_t sum = 0;
-    for (i = BUCKETS_INDEX4(0, 0), j = 0;
-         i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+    for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX4(1, 0), j += 1) {
         bucket_start[j] = sum;
-        sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
-               buckets[i + BUCKETS_INDEX4(0, 1)] +
-               buckets[i + BUCKETS_INDEX4(0, 2)] +
-               buckets[i + BUCKETS_INDEX4(0, 3)];
+        sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+               buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
         bucket_end[j] = sum;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_4k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k,
+                                                            sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
     sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     fast_sint_t i, j;
     sa_sint_t sum = 0;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0;
-         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0), j += 1) {
         bucket_start[j] = sum;
-        sum += buckets[i + BUCKETS_INDEX2(0, 0)] +
-               buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         bucket_end[j] = sum;
     }
 }
 
-static void libsais_initialize_buckets_end_32s_2k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i;
     sa_sint_t sum0 = 0;
     for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0)) {
-        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
-                buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_2k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k,
+                                                            sa_sint_t * RESTRICT buckets) {
     fast_sint_t i, j;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0;
-         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0), j += 1) {
         buckets[j] = buckets[i];
     }
@@ -1824,8 +1694,7 @@ static void libsais_initialize_buckets_start_and_end_32s_2k(
     memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
 }
 
-static void libsais_initialize_buckets_start_32s_1k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i;
     sa_sint_t sum = 0;
     for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
@@ -1835,8 +1704,7 @@ static void libsais_initialize_buckets_start_32s_1k(
     }
 }
 
-static void libsais_initialize_buckets_end_32s_1k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i;
     sa_sint_t sum = 0;
     for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
@@ -1846,8 +1714,7 @@ static void libsais_initialize_buckets_end_32s_1k(
 }
 
 static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) {
     {
         fast_uint_t s = 0;
         fast_sint_t c0 = T[first_lms_suffix];
@@ -1872,8 +1739,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
              i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
             temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum;
-            sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
-                   buckets[i + BUCKETS_INDEX4(0, 3)];
+            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
             temp_bucket[j] = sum;
         }
 
@@ -1891,8 +1757,7 @@ static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
     sa_sint_t sum0 = 0, sum1 = 0;
     for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0)) {
-        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
-                buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
 
         buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
@@ -1923,11 +1788,9 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
 
         fast_sint_t i, j;
         sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0;
-             i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
              i += BUCKETS_INDEX4(1, 0), j += 1) {
-            sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
-                   buckets[i + BUCKETS_INDEX4(0, 3)];
+            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
             temp_bucket[j] = sum;
         }
 
@@ -1946,29 +1809,26 @@ static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
 
     fast_sint_t i, j;
     sa_sint_t sum0 = 0, sum1 = 0;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0;
-         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0), j += 1) {
         bucket_start[j] = sum1;
 
         sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
-        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] +
-                buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
         buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
 
         bucket_end[j] = sum1;
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                               sa_sint_t * RESTRICT induction_bucket,
+                                               fast_sint_t omp_block_start,
+                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 3;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
          i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
@@ -1993,13 +1853,12 @@ static void libsais_radix_sort_lms_suffixes_8u(
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t n, sa_sint_t m,
+                                                   sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
-                                                  m >= 65536 &&                \
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && m >= 65536 && \
                                                   omp_get_dynamic() == 0)
 #endif
     {
@@ -2013,16 +1872,15 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
         fast_sint_t omp_num_threads = 1;
 #endif
         if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_8u(
-                T, SA, &buckets[4 * ALPHABET_SIZE],
-                (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+            libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE],
+                                               (fast_sint_t)n - (fast_sint_t)m + 1,
+                                               (fast_sint_t)m - 1);
         }
 #if defined(_OPENMP)
         else {
             {
                 sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT dst_bucket =
-                    thread_state[omp_thread_num].state.buckets;
+                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
 
                 fast_sint_t i, j;
                 for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1);
@@ -2033,9 +1891,8 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
             }
 
             {
-                fast_sint_t t,
-                    omp_block_start = 0,
-                    omp_block_size = thread_state[omp_thread_num].state.m;
+                fast_sint_t t, omp_block_start = 0,
+                               omp_block_size = thread_state[omp_thread_num].state.m;
                 for (t = omp_num_threads - 1; t >= omp_thread_num; --t)
                     omp_block_start += thread_state[t].state.m;
 
@@ -2053,15 +1910,15 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T,
+                                                   sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t * RESTRICT induction_bucket,
+                                                   fast_sint_t omp_block_start,
+                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 3;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
          i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
@@ -2091,15 +1948,15 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
+                                                   sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t * RESTRICT induction_bucket,
+                                                   fast_sint_t omp_block_start,
+                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 3;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
          i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
@@ -2108,14 +1965,10 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
 
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            T[SA[i - prefetch_distance - 0]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            T[SA[i - prefetch_distance - 1]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            T[SA[i - prefetch_distance - 2]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            T[SA[i - prefetch_distance - 3]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
 
         sa_sint_t p0 = SA[i - 0];
         SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
@@ -2135,16 +1988,16 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(
 
 #if defined(_OPENMP)
 
-static void libsais_radix_sort_lms_suffixes_32s_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T,
+                                                             sa_sint_t * RESTRICT SA,
+                                                             LIBSAIS_THREAD_CACHE * RESTRICT cache,
+                                                             fast_sint_t omp_block_start,
+                                                             fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0]]);
@@ -2165,26 +2018,21 @@ static void libsais_radix_sort_lms_suffixes_32s_block_gather(
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
-    sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket,
+                                                              LIBSAIS_THREAD_CACHE * RESTRICT cache,
+                                                              fast_sint_t omp_block_start,
+                                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 3;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
          i >= j; i -= 4) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &induction_bucket[cache[i - prefetch_distance - 0].symbol]);
-        libsais_prefetchw(
-            &induction_bucket[cache[i - prefetch_distance - 1].symbol]);
-        libsais_prefetchw(
-            &induction_bucket[cache[i - prefetch_distance - 2].symbol]);
-        libsais_prefetchw(
-            &induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
 
         cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
         cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
@@ -2197,51 +2045,43 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
-    sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket,
+                                                              LIBSAIS_THREAD_CACHE * RESTRICT cache,
+                                                              fast_sint_t omp_block_start,
+                                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 3;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
          i >= j; i -= 4) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            cache[i - prefetch_distance - 0].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            cache[i - prefetch_distance - 1].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            cache[i - prefetch_distance - 2].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
-            cache[i - prefetch_distance - 3].symbol, 0)]);
-
-        cache[i - 0].symbol =
-            --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
-        cache[i - 1].symbol =
-            --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
-        cache[i - 2].symbol =
-            --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
-        cache[i - 3].symbol =
-            --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+        libsais_prefetchw(
+            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
+        libsais_prefetchw(
+            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
+        libsais_prefetchw(
+            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
+        libsais_prefetchw(
+            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
+
+        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
+        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
     }
 
     for (j -= prefetch_distance + 3; i >= j; i -= 1) {
-        cache[i].symbol =
-            --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
     }
 }
 
 static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -2256,22 +2096,20 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_32s_6k(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start,
+                                                   omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
+                                                                 omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -2279,15 +2117,14 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
         #pragma omp master
             {
                 libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
-                    induction_bucket, cache - block_start, block_start,
-                    block_size);
+                    induction_bucket, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start,
-                                              omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                              omp_block_size);
             }
         }
     #endif
@@ -2295,13 +2132,11 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
 }
 
 static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -2316,22 +2151,20 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_32s_2k(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start,
+                                                   omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
+                                                                 omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -2339,15 +2172,14 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
         #pragma omp master
             {
                 libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
-                    induction_bucket, cache - block_start, block_start,
-                    block_size);
+                    induction_bucket, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start,
-                                              omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                              omp_block_size);
             }
         }
     #endif
@@ -2357,28 +2189,25 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
 #endif
 
 static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || m < 65536) {
         libsais_radix_sort_lms_suffixes_32s_6k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
-            (fast_sint_t)m - 1);
+            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1;
-             block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end >= m) {
                 block_end = (fast_sint_t)m - 1;
             }
 
             libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache,
-                (fast_sint_t)n - block_end, block_end - block_start, threads);
+                T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -2387,28 +2216,25 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
 }
 
 static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || m < 65536) {
         libsais_radix_sort_lms_suffixes_32s_2k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
-            (fast_sint_t)m - 1);
+            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1;
-             block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end >= m) {
                 block_end = (fast_sint_t)m - 1;
             }
 
             libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache,
-                (fast_sint_t)n - block_end, block_end - block_start, threads);
+                T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -2416,9 +2242,9 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
 #endif
 }
 
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets) {
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t i = n - 2;
@@ -2482,15 +2308,15 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(
     return m;
 }
 
-static void libsais_radix_sort_set_markers_32s_6k(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t * RESTRICT induction_bucket,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
@@ -2509,26 +2335,21 @@ static void libsais_radix_sort_set_markers_32s_6k(
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_4k(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t * RESTRICT induction_bucket,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
-        libsais_prefetch(
-            &induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
-
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
-            i + prefetch_distance + 0, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
-            i + prefetch_distance + 1, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
-            i + prefetch_distance + 2, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
-            i + prefetch_distance + 3, 0)]]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
+        libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
 
         SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
         SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
@@ -2541,9 +2362,9 @@ static void libsais_radix_sort_set_markers_32s_4k(
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_6k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
-    sa_sint_t threads) {
+static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                      sa_sint_t * RESTRICT induction_bucket,
+                                                      sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
 #endif
@@ -2551,8 +2372,7 @@ static void libsais_radix_sort_set_markers_32s_6k_omp(
 #if defined(_OPENMP)
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
                                          ? omp_block_stride
@@ -2564,14 +2384,14 @@ static void libsais_radix_sort_set_markers_32s_6k_omp(
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
 #endif
 
-        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket,
-                                              omp_block_start, omp_block_size);
+        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start,
+                                              omp_block_size);
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
-    sa_sint_t threads) {
+static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                      sa_sint_t * RESTRICT induction_bucket,
+                                                      sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
 #endif
@@ -2579,8 +2399,7 @@ static void libsais_radix_sort_set_markers_32s_4k_omp(
 #if defined(_OPENMP)
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
                                          ? omp_block_stride
@@ -2592,14 +2411,15 @@ static void libsais_radix_sort_set_markers_32s_4k_omp(
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
 #endif
 
-        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket,
-                                              omp_block_start, omp_block_size);
+        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start,
+                                              omp_block_size);
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
+static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T,
+                                                              sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t first_lms_suffix,
+                                                              sa_sint_t left_suffixes_count) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
 
     buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
@@ -2611,8 +2431,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(
          i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
 
-        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] +
-                buckets[i + BUCKETS_INDEX4(0, 2)];
+        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
         sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
 
         buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
@@ -2620,15 +2439,16 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T,
+                                                                  sa_sint_t k,
+                                                                  sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t first_lms_suffix,
+                                                                  sa_sint_t left_suffixes_count) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
     fast_sint_t i, j;
     sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
-    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0),
-        j = BUCKETS_INDEX2(0, 0);
+    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
          i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0);
          i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
@@ -2671,18 +2491,16 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2695,8 +2513,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
         p0 &= SAINT_MAX;
         sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
         SA[induction_bucket[v0]++] =
-            (p0 - 1) |
-            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = SA[i + 1];
@@ -2704,8 +2521,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
         p1 &= SAINT_MAX;
         sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
         SA[induction_bucket[v1]++] =
-            (p1 - 1) |
-            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -2725,9 +2541,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
 #if defined(_OPENMP)
 
 static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
     LIBSAIS_THREAD_STATE * RESTRICT state) {
     const fast_sint_t prefetch_distance = 32;
 
@@ -2738,9 +2553,8 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
 
     fast_sint_t i, j, count = 0;
     sa_sint_t d = 1;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2751,15 +2565,13 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
         sa_sint_t p0 = cache[count].index = SA[i + 0];
         d += (p0 < 0);
         p0 &= SAINT_MAX;
-        sa_sint_t v0 = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+        sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
         induction_bucket[v0]++;
         distinct_names[v0] = d;
         sa_sint_t p1 = cache[count].index = SA[i + 1];
         d += (p1 < 0);
         p1 &= SAINT_MAX;
-        sa_sint_t v1 = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+        sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
         induction_bucket[v1]++;
         distinct_names[v1] = d;
     }
@@ -2768,8 +2580,7 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
         sa_sint_t p = cache[count].index = SA[i];
         d += (p < 0);
         p &= SAINT_MAX;
-        sa_sint_t v = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+        sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
         induction_bucket[v]++;
         distinct_names[v] = d;
     }
@@ -2779,8 +2590,8 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
 }
 
 static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count, sa_sint_t d) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
@@ -2794,16 +2605,14 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
         d += (p0 < 0);
         sa_sint_t v0 = cache[i + 0].symbol;
         SA[induction_bucket[v0]++] =
-            (p0 - 1) |
-            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = cache[i + 1].index;
         d += (p1 < 0);
         sa_sint_t v1 = cache[i + 1].symbol;
         SA[induction_bucket[v1]++] =
-            (p1 - 1) |
-            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -2818,14 +2627,13 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -2840,33 +2648,30 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_8u(
-                T, SA, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start,
+                                                              omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
                     T, SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache, omp_block_start,
-                    omp_block_size, &thread_state[omp_thread_num]);
+                    thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
+                    &thread_state[omp_thread_num]);
             }
 
         #pragma omp barrier
 
         #pragma omp master
             {
-                sa_sint_t * RESTRICT induction_bucket =
-                    &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names =
-                    &buckets[2 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
                 fast_sint_t t;
                 for (t = 0; t < omp_num_threads; ++t) {
@@ -2877,15 +2682,13 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
 
                     fast_sint_t c;
                     for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c],
-                                  B = temp_induction_bucket[c];
+                        sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
                         induction_bucket[c] = A + B;
                         temp_induction_bucket[c] = A;
                     }
 
                     for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = distinct_names[c],
-                                  B = temp_distinct_names[c], D = B + d;
+                        sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
                         distinct_names[c] = B > 0 ? D : A;
                         temp_distinct_names[c] = A;
                     }
@@ -2914,14 +2717,13 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
 #endif
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
-    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] =
-        (n - 1) | SAINT_MIN;
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
     distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
     if (threads == 1 || left_suffixes_count < 65536) {
@@ -2936,9 +2738,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
                 block_start++;
             } else {
                 fast_sint_t block_max_end =
-                    block_start +
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start + ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end > left_suffixes_count) {
                     block_max_end = left_suffixes_count;
                 }
@@ -2953,17 +2754,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
                         sa_sint_t p = SA[block_start];
                         d += (p < 0);
                         p &= SAINT_MAX;
-                        sa_sint_t v =
-                            BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+                        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
                         SA[induction_bucket[v]++] =
-                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
-                                       << (SAINT_BIT - 1));
+                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
                         distinct_names[v] = d;
                     }
                 } else {
                     d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(
-                        T, SA, buckets, d, block_start, block_size, threads,
-                        thread_state);
+                        T, SA, buckets, d, block_start, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -2977,14 +2775,12 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
          i < j; i += 2) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
@@ -3004,16 +2800,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
         d += (p2 < 0);
         p2 &= SAINT_MAX;
         sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
-        SA[buckets[v2]++] =
-            (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+        SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
         buckets[2 + v2] = d;
 
         sa_sint_t p3 = SA[i + 1];
         d += (p3 < 0);
         p3 &= SAINT_MAX;
         sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
-        SA[buckets[v3]++] =
-            (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+        SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
         buckets[2 + v3] = d;
     }
 
@@ -3022,8 +2816,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
         d += (p < 0);
         p &= SAINT_MAX;
         sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
-        SA[buckets[v]++] =
-            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
         buckets[2 + v] = d;
     }
 
@@ -3040,8 +2833,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
          i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
@@ -3076,10 +2868,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             p0 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
             SA[induction_bucket[T[p0 - 1]]++] =
-                (p0 - 1) |
-                ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v0] != d)
-                 << (SUFFIX_GROUP_BIT - 1));
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
         }
 
@@ -3091,10 +2881,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             p1 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
             SA[induction_bucket[T[p1 - 1]]++] =
-                (p1 - 1) |
-                ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v1] != d)
-                 << (SUFFIX_GROUP_BIT - 1));
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
         }
     }
@@ -3108,8 +2896,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
             p &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
             SA[induction_bucket[T[p - 1]]++] =
-                (p - 1) |
-                ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
+                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
                 ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
         }
@@ -3118,15 +2905,15 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T,
+                                                              sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t * RESTRICT induction_bucket,
+                                                              fast_sint_t omp_block_start,
+                                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
          i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
@@ -3152,16 +2939,14 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(
         if (p0 > 0) {
             SA[i + 0] = 0;
             SA[induction_bucket[T[p0 - 1]]++] =
-                (p0 - 1) |
-                ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             SA[i + 1] = 0;
             SA[induction_bucket[T[p1 - 1]]++] =
-                (p1 - 1) |
-                ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -3179,15 +2964,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(
 #if defined(_OPENMP)
 
 static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -3225,15 +3008,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
 }
 
 static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3283,15 +3064,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
 }
 
 static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3309,8 +3088,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
 
         sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
         if (p0 > 0) {
-            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1])
-                                             << (SAINT_BIT - 1));
+            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
             symbol0 = T[p0 - 1];
             p0 = 0;
         }
@@ -3318,8 +3096,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
         SA[i + 0] = p0 & SAINT_MAX;
         sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
         if (p1 > 0) {
-            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1])
-                                             << (SAINT_BIT - 1));
+            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
             symbol1 = T[p1 - 1];
             p1 = 0;
         }
@@ -3330,8 +3107,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
     for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t symbol = SAINT_MIN, p = SA[i];
         if (p > 0) {
-            cache[i].index =
-                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+            cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
             symbol = T[p - 1];
             p = 0;
         }
@@ -3347,8 +3123,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
@@ -3357,8 +3132,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
         sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index;
         d += (p0 < 0);
         cache[i + 0].symbol = buckets[v0]++;
-        cache[i + 0].index =
-            (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+        cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
         buckets[2 + v0] = d;
         if (cache[i + 0].symbol < omp_block_end) {
             sa_sint_t s = cache[i + 0].symbol,
@@ -3369,8 +3143,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
         sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index;
         d += (p1 < 0);
         cache[i + 1].symbol = buckets[v1]++;
-        cache[i + 1].index =
-            (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+        cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
         buckets[2 + v1] = d;
         if (cache[i + 1].symbol < omp_block_end) {
             sa_sint_t s = cache[i + 1].symbol,
@@ -3383,12 +3156,10 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
         sa_sint_t v = cache[i].symbol, p = cache[i].index;
         d += (p < 0);
         cache[i].symbol = buckets[v]++;
-        cache[i].index =
-            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
         buckets[2 + v] = d;
         if (cache[i].symbol < omp_block_end) {
-            sa_sint_t s = cache[i].symbol,
-                      q = (cache[s].index = cache[i].index) & SAINT_MAX;
+            sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
             cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
         }
     }
@@ -3397,17 +3168,16 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -3427,16 +3197,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
             d += (p0 >> (SUFFIX_GROUP_BIT - 1));
             cache[i + 0].symbol = induction_bucket[v0 >> 1]++;
             cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v0] != d)
-                                  << (SUFFIX_GROUP_BIT - 1));
+                                 ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
             if (cache[i + 0].symbol < omp_block_end) {
                 sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
                 if (np > 0) {
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
                     np = 0;
                 }
                 cache[i + 0].index = np & SAINT_MAX;
@@ -3449,16 +3217,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
             d += (p1 >> (SUFFIX_GROUP_BIT - 1));
             cache[i + 1].symbol = induction_bucket[v1 >> 1]++;
             cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v1] != d)
-                                  << (SUFFIX_GROUP_BIT - 1));
+                                 ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
             if (cache[i + 1].symbol < omp_block_end) {
                 sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
                 if (np > 0) {
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
                     np = 0;
                 }
                 cache[i + 1].index = np & SAINT_MAX;
@@ -3472,17 +3238,15 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
             sa_sint_t p = cache[i].index;
             d += (p >> (SUFFIX_GROUP_BIT - 1));
             cache[i].symbol = induction_bucket[v >> 1]++;
-            cache[i].index =
-                (p - 1) | (v << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
+                             ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
             if (cache[i].symbol < omp_block_end) {
                 sa_sint_t ni = cache[i].symbol, np = cache[i].index;
                 if (np > 0) {
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
                     np = 0;
                 }
                 cache[i].index = np & SAINT_MAX;
@@ -3500,8 +3264,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -3518,8 +3281,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
                 sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
                 if (np > 0) {
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                     np = 0;
                 }
@@ -3534,8 +3296,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
                 sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
                 if (np > 0) {
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                     np = 0;
                 }
@@ -3552,8 +3313,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
                 sa_sint_t ni = cache[i].symbol, np = cache[i].index;
                 if (np > 0) {
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                     np = 0;
                 }
@@ -3564,13 +3324,11 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
     fast_sint_t block_size, sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -3585,22 +3343,20 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_32s_6k(
-                T, SA, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d,
+                                                                  omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -3608,15 +3364,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
         #pragma omp master
             {
                 d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
-                    T, buckets, d, cache - block_start, block_start,
-                    block_size);
+                    T, buckets, d, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start,
-                                              omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                              omp_block_size);
             }
         }
     #endif
@@ -3627,12 +3382,10 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -3647,22 +3400,20 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_32s_4k(
-                T, SA, k, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d,
+                                                                  omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -3670,15 +3421,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
         #pragma omp master
             {
                 d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
-                    T, k, buckets, d, cache - block_start, block_start,
-                    block_size);
+                    T, k, buckets, d, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -3688,12 +3438,11 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
 }
 
 static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -3708,22 +3457,20 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_partial_sorting_scan_left_to_right_32s_1k(
-                T, SA, buckets, omp_block_start, omp_block_size);
+            libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start,
+                                                              omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -3737,8 +3484,8 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -3749,23 +3496,20 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] =
-        (n - 1) | SAINT_MIN;
+    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
     buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
     if (threads == 1 || left_suffixes_count < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_32s_6k(
-            T, SA, buckets, d, 0, left_suffixes_count);
+        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0,
+                                                              left_suffixes_count);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < left_suffixes_count;
-             block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) {
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end > left_suffixes_count) {
                 block_end = left_suffixes_count;
             }
@@ -3783,27 +3527,24 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     SA[induction_bucket[T[n - 1]]++] =
-        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) |
-        SUFFIX_GROUP_MARKER;
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
     distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
 
     if (threads == 1 || n < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets,
-                                                              d, 0, n);
+        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
         for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end > n) {
                 block_end = n;
             }
@@ -3822,10 +3563,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
 
 static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[buckets[T[n - 1]]++] =
-        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
         libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
@@ -3834,15 +3573,14 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
     else {
         fast_sint_t block_start, block_end;
         for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end > n) {
                 block_end = n;
             }
 
             libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
-                T, SA, buckets, thread_state[0].state.cache, block_start,
-                block_end - block_start, threads);
+                T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start,
+                threads);
         }
     }
 #else
@@ -3850,9 +3588,9 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
 #endif
 }
 
-static void libsais_partial_sorting_shift_markers_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets,
-    sa_sint_t threads) {
+static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                         const sa_sint_t * RESTRICT buckets,
+                                                         sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
@@ -3860,8 +3598,7 @@ static void libsais_partial_sorting_shift_markers_8u_omp(
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) \
-        num_threads(threads) if (threads > 1 && n >= 65536)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
 #else
     UNUSED(threads);
     UNUSED(n);
@@ -3897,9 +3634,9 @@ static void libsais_partial_sorting_shift_markers_8u_omp(
     }
 }
 
-static void libsais_partial_sorting_shift_markers_32s_6k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets,
-    sa_sint_t threads) {
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                             const sa_sint_t * RESTRICT buckets,
+                                                             sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
@@ -3907,8 +3644,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) \
-        num_threads(threads) if (threads > 1 && k >= 65536)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && k >= 65536)
 #else
     UNUSED(threads);
 #endif
@@ -3942,8 +3678,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(
     }
 }
 
-static void libsais_partial_sorting_shift_markers_32s_4k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i;
@@ -3951,63 +3686,54 @@ static void libsais_partial_sorting_shift_markers_32s_4k(
     for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
         libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t p0 = SA[i - 0],
-                  q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
-                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q0;
         SA[i - 0] = p0 ^ q0;
-        sa_sint_t p1 = SA[i - 1],
-                  q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
-                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q1;
         SA[i - 1] = p1 ^ q1;
-        sa_sint_t p2 = SA[i - 2],
-                  q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
-                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q2;
         SA[i - 2] = p2 ^ q2;
-        sa_sint_t p3 = SA[i - 3],
-                  q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
-                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q3;
         SA[i - 3] = p3 ^ q3;
     }
 
     for (; i >= 0; i -= 1) {
-        sa_sint_t p = SA[i],
-                  q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
-                      ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
+                                 ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
         s = s ^ q;
         SA[i] = p ^ q;
     }
 }
 
-static void libsais_partial_sorting_shift_buckets_32s_6k(
-    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k,
+                                                         sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
     fast_sint_t i;
     for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
          i += BUCKETS_INDEX2(1, 0)) {
-        buckets[2 * i + BUCKETS_INDEX4(0, 0)] =
-            temp_bucket[i + BUCKETS_INDEX2(0, 0)];
-        buckets[2 * i + BUCKETS_INDEX4(0, 1)] =
-            temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
+        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
     }
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
@@ -4021,8 +3747,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
         p0 &= SAINT_MAX;
         sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
         SA[--induction_bucket[v0]] =
-            (p0 - 1) |
-            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = SA[i - 1];
@@ -4030,8 +3755,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
         p1 &= SAINT_MAX;
         sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
         SA[--induction_bucket[v1]] =
-            (p1 - 1) |
-            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -4051,9 +3775,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
 #if defined(_OPENMP)
 
 static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
     LIBSAIS_THREAD_STATE * RESTRICT state) {
     const fast_sint_t prefetch_distance = 32;
 
@@ -4064,8 +3787,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
 
     fast_sint_t i, j, count = 0;
     sa_sint_t d = 1;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
@@ -4077,15 +3799,13 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
         sa_sint_t p0 = cache[count].index = SA[i - 0];
         d += (p0 < 0);
         p0 &= SAINT_MAX;
-        sa_sint_t v0 = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
         induction_bucket[v0]++;
         distinct_names[v0] = d;
         sa_sint_t p1 = cache[count].index = SA[i - 1];
         d += (p1 < 0);
         p1 &= SAINT_MAX;
-        sa_sint_t v1 = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
         induction_bucket[v1]++;
         distinct_names[v1] = d;
     }
@@ -4094,8 +3814,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
         sa_sint_t p = cache[count].index = SA[i];
         d += (p < 0);
         p &= SAINT_MAX;
-        sa_sint_t v = cache[count++].symbol =
-            BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
         induction_bucket[v]++;
         distinct_names[v] = d;
     }
@@ -4105,8 +3824,8 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
 }
 
 static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count, sa_sint_t d) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
@@ -4120,16 +3839,14 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
         d += (p0 < 0);
         sa_sint_t v0 = cache[i + 0].symbol;
         SA[--induction_bucket[v0]] =
-            (p0 - 1) |
-            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
         distinct_names[v0] = d;
 
         sa_sint_t p1 = cache[i + 1].index;
         d += (p1 < 0);
         sa_sint_t v1 = cache[i + 1].symbol;
         SA[--induction_bucket[v1]] =
-            (p1 - 1) |
-            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
         distinct_names[v1] = d;
     }
 
@@ -4144,14 +3861,13 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -4166,33 +3882,30 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_8u(
-                T, SA, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start,
+                                                              omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
                     T, SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache, omp_block_start,
-                    omp_block_size, &thread_state[omp_thread_num]);
+                    thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
+                    &thread_state[omp_thread_num]);
             }
 
         #pragma omp barrier
 
         #pragma omp master
             {
-                sa_sint_t * RESTRICT induction_bucket =
-                    &buckets[0 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names =
-                    &buckets[2 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
                 fast_sint_t t;
                 for (t = omp_num_threads - 1; t >= 0; --t) {
@@ -4203,15 +3916,13 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
 
                     fast_sint_t c;
                     for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c],
-                                  B = temp_induction_bucket[c];
+                        sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
                         induction_bucket[c] = A - B;
                         temp_induction_bucket[c] = A;
                     }
 
                     for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = distinct_names[c],
-                                  B = temp_distinct_names[c], D = B + d;
+                        sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
                         distinct_names[c] = B > 0 ? D : A;
                         temp_distinct_names[c] = A;
                     }
@@ -4240,16 +3951,15 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
 #endif
 
 static void libsais_partial_sorting_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
-    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
     fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
     if (threads == 1 || (scan_end - scan_start) < 65536) {
-        libsais_partial_sorting_scan_right_to_left_8u(
-            T, SA, buckets, d, scan_start, scan_end - scan_start);
+        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start,
+                                                      scan_end - scan_start);
     }
 #if defined(_OPENMP)
     else {
@@ -4262,9 +3972,8 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
                 block_start--;
             } else {
                 fast_sint_t block_max_end =
-                    block_start -
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start - ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end < scan_start) {
                     block_max_end = scan_start - 1;
                 }
@@ -4279,17 +3988,14 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
                         sa_sint_t p = SA[block_start];
                         d += (p < 0);
                         p &= SAINT_MAX;
-                        sa_sint_t v =
-                            BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+                        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                         SA[--induction_bucket[v]] =
-                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
-                                       << (SAINT_BIT - 1));
+                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
                         distinct_names[v] = d;
                     }
                 } else {
                     d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(
-                        T, SA, buckets, d, block_end + 1, block_size, threads,
-                        thread_state);
+                        T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -4301,14 +4007,12 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
@@ -4328,16 +4032,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
         d += (p2 < 0);
         p2 &= SAINT_MAX;
         sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
-        SA[--buckets[v2]] =
-            (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
         buckets[2 + v2] = d;
 
         sa_sint_t p3 = SA[i - 1];
         d += (p3 < 0);
         p3 &= SAINT_MAX;
         sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
-        SA[--buckets[v3]] =
-            (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
         buckets[2 + v3] = d;
     }
 
@@ -4346,8 +4048,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
         d += (p < 0);
         p &= SAINT_MAX;
         sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--buckets[v]] =
-            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
         buckets[2 + v] = d;
     }
 
@@ -4364,8 +4065,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
@@ -4399,10 +4099,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             p0 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
             SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) |
-                ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v0] != d)
-                 << (SUFFIX_GROUP_BIT - 1));
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
         }
 
@@ -4413,10 +4111,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             p1 &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
             SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) |
-                ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v1] != d)
-                 << (SUFFIX_GROUP_BIT - 1));
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
         }
     }
@@ -4429,8 +4125,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
             p &= ~SUFFIX_GROUP_MARKER;
             sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
             SA[--induction_bucket[T[p - 1]]] =
-                (p - 1) |
-                ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
                 ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
         }
@@ -4439,15 +4134,15 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T,
+                                                              sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t * RESTRICT induction_bucket,
+                                                              fast_sint_t omp_block_start,
+                                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
@@ -4472,15 +4167,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(
         if (p0 > 0) {
             SA[i - 0] = 0;
             SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) |
-                ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         if (p1 > 0) {
             SA[i - 1] = 0;
             SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) |
-                ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -4497,15 +4190,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(
 #if defined(_OPENMP)
 
 static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -4543,15 +4234,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
 }
 
 static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -4598,15 +4287,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
 }
 
 static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -4625,16 +4312,14 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
         sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
         if (p0 > 0) {
             SA[i + 0] = 0;
-            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1])
-                                             << (SAINT_BIT - 1));
+            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
             symbol0 = T[p0 - 1];
         }
         cache[i + 0].symbol = symbol0;
         sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
         if (p1 > 0) {
             SA[i + 1] = 0;
-            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1])
-                                             << (SAINT_BIT - 1));
+            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
             symbol1 = T[p1 - 1];
         }
         cache[i + 1].symbol = symbol1;
@@ -4644,8 +4329,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
         sa_sint_t symbol = SAINT_MIN, p = SA[i];
         if (p > 0) {
             SA[i] = 0;
-            cache[i].index =
-                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+            cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
             symbol = T[p - 1];
         }
         cache[i].symbol = symbol;
@@ -4659,8 +4343,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
@@ -4670,8 +4353,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
         sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index;
         d += (p0 < 0);
         cache[i - 0].symbol = --buckets[v0];
-        cache[i - 0].index =
-            (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+        cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
         buckets[2 + v0] = d;
         if (cache[i - 0].symbol >= omp_block_start) {
             sa_sint_t s = cache[i - 0].symbol,
@@ -4682,8 +4364,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
         sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index;
         d += (p1 < 0);
         cache[i - 1].symbol = --buckets[v1];
-        cache[i - 1].index =
-            (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+        cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
         buckets[2 + v1] = d;
         if (cache[i - 1].symbol >= omp_block_start) {
             sa_sint_t s = cache[i - 1].symbol,
@@ -4696,12 +4377,10 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
         sa_sint_t v = cache[i].symbol, p = cache[i].index;
         d += (p < 0);
         cache[i].symbol = --buckets[v];
-        cache[i].index =
-            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
         buckets[2 + v] = d;
         if (cache[i].symbol >= omp_block_start) {
-            sa_sint_t s = cache[i].symbol,
-                      q = (cache[s].index = cache[i].index) & SAINT_MAX;
+            sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
             cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
         }
     }
@@ -4710,17 +4389,16 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
@@ -4741,8 +4419,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
             d += (p0 >> (SUFFIX_GROUP_BIT - 1));
             cache[i - 0].symbol = --induction_bucket[v0 >> 1];
             cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v0] != d)
-                                  << (SUFFIX_GROUP_BIT - 1));
+                                 ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
             if (cache[i - 0].symbol >= omp_block_start) {
                 sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
@@ -4750,8 +4427,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
                     cache[i - 0].index = 0;
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                 }
             }
         }
@@ -4762,8 +4438,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
             d += (p1 >> (SUFFIX_GROUP_BIT - 1));
             cache[i - 1].symbol = --induction_bucket[v1 >> 1];
             cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v1] != d)
-                                  << (SUFFIX_GROUP_BIT - 1));
+                                 ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
             if (cache[i - 1].symbol >= omp_block_start) {
                 sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
@@ -4771,8 +4446,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
                     cache[i - 1].index = 0;
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                 }
             }
         }
@@ -4784,9 +4458,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
             sa_sint_t p = cache[i].index;
             d += (p >> (SUFFIX_GROUP_BIT - 1));
             cache[i].symbol = --induction_bucket[v >> 1];
-            cache[i].index =
-                (p - 1) | (v << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
+                             ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
             if (cache[i].symbol >= omp_block_start) {
                 sa_sint_t ni = cache[i].symbol, np = cache[i].index;
@@ -4794,8 +4467,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
                     cache[i].index = 0;
                     cache[ni].index = np;
                     np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol =
-                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                 }
             }
         }
@@ -4811,8 +4483,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
@@ -4831,8 +4502,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
                 if (np > 0) {
                     cache[i - 0].index = 0;
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                 }
             }
@@ -4846,8 +4516,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
                 if (np > 0) {
                     cache[i - 1].index = 0;
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                 }
             }
@@ -4863,8 +4532,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
                 if (np > 0) {
                     cache[i].index = 0;
                     cache[ni].index =
-                        (np - 1) |
-                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np - 1];
                 }
             }
@@ -4873,13 +4541,11 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
     fast_sint_t block_size, sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -4894,22 +4560,20 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_32s_6k(
-                T, SA, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d,
+                                                                  omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -4917,15 +4581,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
         #pragma omp master
             {
                 d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
-                    T, buckets, d, cache - block_start, block_start,
-                    block_size);
+                    T, buckets, d, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start,
-                                              omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                              omp_block_size);
             }
         }
     #endif
@@ -4936,12 +4599,10 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -4956,22 +4617,20 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_32s_4k(
-                T, SA, k, buckets, d, omp_block_start, omp_block_size);
+            d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d,
+                                                                  omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -4979,15 +4638,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
         #pragma omp master
             {
                 d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
-                    T, k, buckets, d, cache - block_start, block_start,
-                    block_size);
+                    T, k, buckets, d, cache - block_start, block_start, block_size);
             }
 
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -4997,12 +4655,11 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
 }
 
 static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -5017,22 +4674,20 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_partial_sorting_scan_right_to_left_32s_1k(
-                T, SA, buckets, omp_block_start, omp_block_size);
+            libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start,
+                                                              omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -5046,8 +4701,8 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -5058,23 +4713,20 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
-    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+    sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
     fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
     if (threads == 1 || (scan_end - scan_start) < 65536) {
-        d = libsais_partial_sorting_scan_right_to_left_32s_6k(
-            T, SA, buckets, d, scan_start, scan_end - scan_start);
+        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start,
+                                                              scan_end - scan_start);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = scan_end - 1; block_start >= scan_start;
-             block_start = block_end) {
-            block_end = block_start -
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end < scan_start) {
                 block_end = scan_start - 1;
             }
@@ -5092,27 +4744,24 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
 }
 
 static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
-        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets,
-                                                              d, 0, n);
+        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
-             block_start = block_end) {
-            block_end = block_start -
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end < 0) {
                 block_end = -1;
             }
 
             d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
-                T, SA, k, buckets, d, thread_state[0].state.cache,
-                block_end + 1, block_start - block_end, threads);
+                T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1,
+                block_start - block_end, threads);
         }
     }
 #else
@@ -5124,25 +4773,22 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
 
 static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
         libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
-             block_start = block_end) {
-            block_end = block_start -
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end < 0) {
                 block_end = -1;
             }
 
             libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
-                T, SA, buckets, thread_state[0].state.cache, block_end + 1,
-                block_start - block_end, threads);
+                T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end,
+                threads);
         }
     }
 #else
@@ -5150,15 +4796,14 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
 #endif
 }
 
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(
-    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
+                                                                      fast_sint_t omp_block_start,
+                                                                      fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
-        l = omp_block_start;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 0];
@@ -5184,15 +4829,14 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(
     return l;
 }
 
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(
-    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
+                                                                      fast_sint_t omp_block_start,
+                                                                      fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
-        l = omp_block_start;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
         sa_sint_t s0 = SA[i + 0];
@@ -5237,21 +4881,19 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_partial_sorting_gather_lms_suffixes_32s_4k(
-                SA, omp_block_start, omp_block_size);
+            libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.position = omp_block_start;
                 thread_state[omp_thread_num].state.count =
-                    libsais_partial_sorting_gather_lms_suffixes_32s_4k(
-                        SA, omp_block_start, omp_block_size) -
+                    libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start,
+                                                                       omp_block_size) -
                     omp_block_start;
             }
 
@@ -5262,10 +4904,8 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
                 fast_sint_t t, position = 0;
                 for (t = 0; t < omp_num_threads; ++t) {
                     if (t > 0 && thread_state[t].state.count > 0) {
-                        memmove(&SA[position],
-                                &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count *
-                                    sizeof(sa_sint_t));
+                        memmove(&SA[position], &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                     }
 
                     position += thread_state[t].state.count;
@@ -5295,21 +4935,19 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_partial_sorting_gather_lms_suffixes_32s_1k(
-                SA, omp_block_start, omp_block_size);
+            libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.position = omp_block_start;
                 thread_state[omp_thread_num].state.count =
-                    libsais_partial_sorting_gather_lms_suffixes_32s_1k(
-                        SA, omp_block_start, omp_block_size) -
+                    libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start,
+                                                                       omp_block_size) -
                     omp_block_start;
             }
 
@@ -5320,10 +4958,8 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
                 fast_sint_t t, position = 0;
                 for (t = 0; t < omp_num_threads; ++t) {
                     if (t > 0 && thread_state[t].state.count > 0) {
-                        memmove(&SA[position],
-                                &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count *
-                                    sizeof(sa_sint_t));
+                        memmove(&SA[position], &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                     }
 
                     position += thread_state[t].state.count;
@@ -5334,103 +4970,91 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
     }
 }
 
-static void libsais_induce_partial_order_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
-    sa_sint_t left_suffixes_count, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    memset(&buckets[2 * ALPHABET_SIZE], 0,
-           2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                sa_sint_t first_lms_suffix,
+                                                sa_sint_t left_suffixes_count, sa_sint_t threads,
+                                                LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
     sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
         T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
     libsais_partial_sorting_scan_right_to_left_8u_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
-        thread_state);
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
 }
 
 static void libsais_induce_partial_order_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
-    sa_sint_t left_suffixes_count, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
         T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
     libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
     libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
-        thread_state);
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
 }
 
-static void libsais_induce_partial_order_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
-        T, SA, n, k, buckets, 0, threads, thread_state);
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0,
+                                                                        threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_4k(SA, n);
-    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
-        T, SA, n, k, buckets, d, threads, thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads,
-                                                           thread_state);
-}
-
-static void libsais_induce_partial_order_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
-        T, SA, n, &buckets[1 * k], threads, thread_state);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
-        T, SA, n, &buckets[0 * k], threads, thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
-                                                           thread_state);
-}
-
-static void libsais_induce_partial_order_32s_1k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads,
+                                                          thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads,
+                                                          thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads,
+                                                          thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
-        T, SA, n, buckets, threads, thread_state);
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
 
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_end_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
-        T, SA, n, buckets, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
 
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
-                                                           thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
 }
 
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t m, sa_sint_t name,
-                                                  fast_sint_t omp_block_start,
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                  sa_sint_t name, fast_sint_t omp_block_start,
                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
 
         sa_sint_t p0 = SA[i + 0];
         SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
@@ -5455,9 +5079,9 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA,
     return name;
 }
 
-static fast_sint_t libsais_gather_marked_suffixes_8u(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                     fast_sint_t l, fast_sint_t omp_block_start,
+                                                     fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     l -= 1;
@@ -5514,20 +5138,17 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
 #endif
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : m - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start,
-                                                    omp_block_size);
+            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(SA, omp_block_start,
-                                                           omp_block_size);
+                    libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -5539,13 +5160,11 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
                 }
 
                 if (omp_thread_num == omp_num_threads - 1) {
-                    name =
-                        (sa_sint_t)(count +
-                                    thread_state[omp_thread_num].state.count);
+                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                 }
 
-                libsais_renumber_lms_suffixes_8u(
-                    SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+                libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start,
+                                                 omp_block_size);
             }
         }
 #endif
@@ -5555,8 +5174,8 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
 }
 
 static void libsais_gather_marked_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
@@ -5571,36 +5190,29 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1
-                ? omp_block_stride
-                : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : ((fast_sint_t)n >> 1) - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_gather_marked_suffixes_8u(SA, m,
-                                              (fast_sint_t)n + (fast_sint_t)fs,
+            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs,
                                               omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 if (omp_thread_num < omp_num_threads - 1) {
-                    thread_state[omp_thread_num].state.position =
-                        libsais_gather_marked_suffixes_8u(
-                            SA, m,
-                            (fast_sint_t)m + omp_block_start + omp_block_size,
-                            omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
+                        SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start,
+                        omp_block_size);
                     thread_state[omp_thread_num].state.count =
                         (fast_sint_t)m + omp_block_start + omp_block_size -
                         thread_state[omp_thread_num].state.position;
                 } else {
-                    thread_state[omp_thread_num].state.position =
-                        libsais_gather_marked_suffixes_8u(
-                            SA, m, (fast_sint_t)n + (fast_sint_t)fs,
-                            omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
+                        SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
                     thread_state[omp_thread_num].state.count =
                         (fast_sint_t)n + (fast_sint_t)fs -
                         thread_state[omp_thread_num].state.position;
@@ -5615,12 +5227,9 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
 
                 for (t = omp_num_threads - 1; t >= 0; --t) {
                     position -= thread_state[t].state.count;
-                    if (t != omp_num_threads - 1 &&
-                        thread_state[t].state.count > 0) {
-                        memmove(&SA[position],
-                                &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count *
-                                    sizeof(sa_sint_t));
+                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
+                        memmove(&SA[position], &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                     }
                 }
             }
@@ -5630,15 +5239,13 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
 }
 
 static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    sa_sint_t name =
-        libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+    sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
     if (name < m) {
-        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads,
-                                                  thread_state);
+        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
     } else {
         fast_sint_t i;
         for (i = 0; i < m; i += 1) {
@@ -5649,28 +5256,24 @@ static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
     return name;
 }
 
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                               sa_sint_t name,
+                                                               fast_sint_t omp_block_start,
+                                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
     sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(
-            &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
 
         p0 = SA[i + 0];
         SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
@@ -5696,8 +5299,7 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(
     return name;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
-                                                   sa_sint_t m,
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
                                                    fast_sint_t omp_block_start,
                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -5731,8 +5333,7 @@ static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t m,
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
                                                   fast_sint_t omp_block_start,
                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -5740,8 +5341,7 @@ static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA,
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
         libsais_prefetchw(&SAm[i + prefetch_distance]);
 
         SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
@@ -5776,20 +5376,18 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
 #endif
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : m - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            name = libsais_renumber_distinct_lms_suffixes_32s_4k(
-                SA, m, 1, omp_block_start, omp_block_size);
+            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start,
+                                                                 omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(SA, omp_block_start,
-                                                           omp_block_size);
+                    libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -5801,13 +5399,11 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
                 }
 
                 if (omp_thread_num == omp_num_threads - 1) {
-                    name =
-                        (sa_sint_t)(count +
-                                    thread_state[omp_thread_num].state.count);
+                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                 }
 
-                libsais_renumber_distinct_lms_suffixes_32s_4k(
-                    SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+                libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count,
+                                                              omp_block_start, omp_block_size);
             }
         }
 #endif
@@ -5816,9 +5412,8 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
     return name - 1;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
-                                                       sa_sint_t n, sa_sint_t m,
-                                                       sa_sint_t threads) {
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t m, sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
@@ -5826,27 +5421,23 @@ static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
 #if defined(_OPENMP)
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1
-                ? omp_block_stride
-                : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : ((fast_sint_t)n >> 1) - omp_block_start;
 #else
         UNUSED(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
 #endif
-        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start,
-                                               omp_block_size);
+        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA,
-                                                      sa_sint_t n, sa_sint_t m,
-                                                      sa_sint_t threads) {
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
@@ -5854,21 +5445,18 @@ static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA,
 #if defined(_OPENMP)
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1
-                ? omp_block_stride
-                : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : ((fast_sint_t)n >> 1) - omp_block_start;
 #else
         UNUSED(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
 #endif
-        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start,
-                                              omp_block_size);
+        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
     }
 }
 
@@ -5877,8 +5465,8 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
-        SA, m, threads, thread_state);
+    sa_sint_t name =
+        libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
     if (name < m) {
         libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
     }
@@ -5887,8 +5475,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
 }
 
 static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t threads) {
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
@@ -5896,32 +5483,22 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
     {
         libsais_gather_lms_suffixes_32s(T, SA, n);
 
-        memset(&SA[m], 0,
-               ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
 
         fast_sint_t i, j;
-        for (i = (fast_sint_t)n - (fast_sint_t)m,
-            j = (fast_sint_t)n - 1 - prefetch_distance - 3;
+        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3;
              i < j; i += 4) {
             libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
 
-            SAm[((sa_uint_t)SA[i + 0]) >> 1] =
-                SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 1]) >> 1] =
-                SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 2]) >> 1] =
-                SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 3]) >> 1] =
-                SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
         }
 
         for (j += prefetch_distance + 3; i < j; i += 1) {
@@ -5941,11 +5518,9 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
         for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
             libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
             libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
-            libsais_prefetchw(
-                &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
             libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
 
             fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
@@ -6009,8 +5584,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
     return name - 1;
 }
 
-static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
-                                             sa_sint_t n, sa_sint_t m,
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
                                              fast_sint_t omp_block_start,
                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -6018,9 +5592,8 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
     const sa_sint_t * RESTRICT SAnm = &SA[n - m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
@@ -6039,8 +5612,7 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
-                                                 sa_sint_t n, sa_sint_t m,
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
                                                  sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
@@ -6050,10 +5622,9 @@ static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : m - omp_block_start;
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 #else
         UNUSED(threads);
 
@@ -6061,68 +5632,63 @@ static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
         fast_sint_t omp_block_size = m;
 #endif
 
-        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start,
-                                         omp_block_size);
+        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_place_lms_suffixes_interval_8u(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t m,
+                                                   const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
     fast_sint_t c, j = n;
     for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
-        fast_sint_t l =
-            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
-            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
         if (l > 0) {
             fast_sint_t i = bucket_end[c];
             if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                    (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_4k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     fast_sint_t c, j = n;
     for (c = (fast_sint_t)k - 2; c >= 0; --c) {
-        fast_sint_t l =
-            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
-            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
         if (l > 0) {
             fast_sint_t i = bucket_end[c];
             if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                    (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_2k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
     if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
-             c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+             c -= BUCKETS_INDEX2(1, 0)) {
             fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
                             (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
             if (l > 0) {
@@ -6131,8 +5697,7 @@ static void libsais_place_lms_suffixes_interval_32s_2k(
                     memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                 }
 
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                        (size_t)l * sizeof(sa_sint_t));
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
             }
         }
     }
@@ -6140,9 +5705,9 @@ static void libsais_place_lms_suffixes_interval_32s_2k(
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_1k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t m, sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T,
+                                                       sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                       sa_sint_t m, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t c = k - 1;
@@ -6158,32 +5723,28 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
         sa_sint_t p0 = SA[i - 0];
         if (T[p0] != c) {
             c = T[p0];
-            memset(&SA[buckets[c]], 0,
-                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
             l = buckets[c];
         }
         SA[--l] = p0;
         sa_sint_t p1 = SA[i - 1];
         if (T[p1] != c) {
             c = T[p1];
-            memset(&SA[buckets[c]], 0,
-                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
             l = buckets[c];
         }
         SA[--l] = p1;
         sa_sint_t p2 = SA[i - 2];
         if (T[p2] != c) {
             c = T[p2];
-            memset(&SA[buckets[c]], 0,
-                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
             l = buckets[c];
         }
         SA[--l] = p2;
         sa_sint_t p3 = SA[i - 3];
         if (T[p3] != c) {
             c = T[p3];
-            memset(&SA[buckets[c]], 0,
-                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
             l = buckets[c];
         }
         SA[--l] = p3;
@@ -6193,8 +5754,7 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
         sa_sint_t p = SA[i];
         if (T[p] != c) {
             c = T[p];
-            memset(&SA[buckets[c]], 0,
-                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
             l = buckets[c];
         }
         SA[--l] = p;
@@ -6203,9 +5763,9 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
     memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_6k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
     fast_sint_t c, j = n;
@@ -6217,17 +5777,16 @@ static void libsais_place_lms_suffixes_histogram_32s_6k(
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                    (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_4k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     fast_sint_t c, j = n;
@@ -6239,23 +5798,22 @@ static void libsais_place_lms_suffixes_histogram_32s_4k(
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                    (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_2k(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
-    const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
     if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
-             c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+             c -= BUCKETS_INDEX2(1, 0)) {
             fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
             if (l > 0) {
                 fast_sint_t i = buckets[c];
@@ -6263,8 +5821,7 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(
                     memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                 }
 
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
-                        (size_t)l * sizeof(sa_sint_t));
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
             }
         }
     }
@@ -6272,16 +5829,15 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_final_bwt_scan_left_to_right_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                    sa_sint_t * RESTRICT induction_bucket,
+                                                    fast_sint_t omp_block_start,
+                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6325,16 +5881,17 @@ static void libsais_final_bwt_scan_left_to_right_8u(
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
+                                                        sa_sint_t * RESTRICT I,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6387,16 +5944,16 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6437,15 +5994,15 @@ static void libsais_final_sorting_scan_left_to_right_8u(
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T,
+                                                         sa_sint_t * RESTRICT SA,
+                                                         sa_sint_t * RESTRICT induction_bucket,
+                                                         fast_sint_t omp_block_start,
+                                                         fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
          i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
@@ -6496,17 +6053,16 @@ static void libsais_final_sorting_scan_left_to_right_32s(
 #if defined(_OPENMP)
 
 static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t i, j, count = 0;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6526,8 +6082,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
             p0--;
             SA[i + 0] = T[p0] | SAINT_MIN;
             buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 & SAINT_MAX;
@@ -6535,8 +6090,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
             p1--;
             SA[i + 1] = T[p1] | SAINT_MIN;
             buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -6547,8 +6101,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
             p--;
             SA[i] = T[p] | SAINT_MIN;
             buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 
@@ -6556,17 +6109,16 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
 }
 
 static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t i, j, count = 0;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6585,16 +6137,14 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
         if (p0 > 0) {
             p0--;
             buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i + 1];
         SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
             buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -6604,8 +6154,7 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
         if (p > 0) {
             p--;
             buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 
@@ -6613,8 +6162,8 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
 }
 
 static void libsais_final_order_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
@@ -6633,9 +6182,8 @@ static void libsais_final_order_scan_left_to_right_8u_block_place(
 }
 
 static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count) {
+    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
@@ -6644,45 +6192,38 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
 
         SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
         if ((cache[i + 0].index & rm) == 0) {
-            I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i + 0].symbol];
+            I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol];
         }
         SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
         if ((cache[i + 1].index & rm) == 0) {
-            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i + 1].symbol];
+            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol];
         }
         SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
         if ((cache[i + 2].index & rm) == 0) {
-            I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i + 2].symbol];
+            I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol];
         }
         SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
         if ((cache[i + 3].index & rm) == 0) {
-            I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i + 3].symbol];
+            I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol];
         }
     }
 
     for (j += 3; i < j; i += 1) {
         SA[buckets[cache[i].symbol]++] = cache[i].index;
         if ((cache[i].index & rm) == 0) {
-            I[(cache[i].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i].symbol];
+            I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol];
         }
     }
 }
 
 static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6702,8 +6243,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
         SA[i + 0] = p0 ^ SAINT_MIN;
         if (p0 > 0) {
             p0--;
-            cache[i + 0].index =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
             symbol0 = T[p0];
         }
         cache[i + 0].symbol = symbol0;
@@ -6711,8 +6251,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
         SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
-            cache[i + 1].index =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
             symbol1 = T[p1];
         }
         cache[i + 1].symbol = symbol1;
@@ -6723,8 +6262,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
         SA[i] = p ^ SAINT_MIN;
         if (p > 0) {
             p--;
-            cache[i].index =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
             symbol = T[p];
         }
         cache[i].symbol = symbol;
@@ -6738,8 +6276,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
-         i += 2) {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -6758,8 +6295,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -6774,8 +6310,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -6792,8 +6327,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -6802,14 +6336,13 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
 }
 
 static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -6824,15 +6357,14 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_bwt_scan_left_to_right_8u(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
+                                                    omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -6840,8 +6372,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_bwt_scan_left_to_right_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -6850,8 +6381,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -6875,14 +6405,13 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
 }
 
 static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -6897,16 +6426,14 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_bwt_aux_scan_left_to_right_8u(
-                T, SA, rm, I, induction_bucket, omp_block_start,
-                omp_block_size);
+            libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket,
+                                                        omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -6914,8 +6441,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_bwt_scan_left_to_right_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -6924,8 +6450,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -6949,14 +6474,13 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
 }
 
 static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -6971,15 +6495,14 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_left_to_right_8u(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
+                                                        omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -6987,8 +6510,7 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_sorting_scan_left_to_right_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -6997,8 +6519,7 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -7022,12 +6543,11 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
 }
 
 static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -7042,22 +6562,20 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_left_to_right_32s(
-                T, SA, buckets, omp_block_start, omp_block_size);
+            libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start,
+                                                         omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_final_sorting_scan_left_to_right_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -7071,8 +6589,8 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -7087,8 +6605,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
         ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
-         << (SAINT_BIT - 1));
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
         libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
@@ -7101,9 +6618,8 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
                 block_start++;
             } else {
                 fast_sint_t block_max_end =
-                    block_start +
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start + ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end > n) {
                     block_max_end = n;
                 }
@@ -7121,14 +6637,12 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
                             p--;
                             SA[block_start] = T[p] | SAINT_MIN;
                             SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
-                                     << (SAINT_BIT - 1));
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
                         }
                     }
                 } else {
                     libsais_final_bwt_scan_left_to_right_8u_block_omp(
-                        T, SA, induction_bucket, block_start, block_size,
-                        threads, thread_state);
+                        T, SA, induction_bucket, block_start, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -7141,21 +6655,18 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
 
 static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
     const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
         ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
-         << (SAINT_BIT - 1));
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if ((((sa_sint_t)n - 1) & rm) == 0) {
-        I[((sa_sint_t)n - 1) / (rm + 1)] =
-            induction_bucket[T[(sa_sint_t)n - 1]];
+        I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
     }
 
     if (threads == 1 || n < 65536) {
-        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I,
-                                                    induction_bucket, 0, n);
+        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
@@ -7165,9 +6676,8 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
                 block_start++;
             } else {
                 fast_sint_t block_max_end =
-                    block_start +
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start + ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end > n) {
                     block_max_end = n;
                 }
@@ -7185,8 +6695,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
                             p--;
                             SA[block_start] = T[p] | SAINT_MIN;
                             SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
-                                     << (SAINT_BIT - 1));
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
                             if ((p & rm) == 0) {
                                 I[p / (rm + 1)] = induction_bucket[T[p]];
                             }
@@ -7194,8 +6703,8 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
                     }
                 } else {
                     libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
-                        T, SA, rm, I, induction_bucket, block_start, block_size,
-                        threads, thread_state);
+                        T, SA, rm, I, induction_bucket, block_start, block_size, threads,
+                        thread_state);
                     block_start = block_end;
                 }
             }
@@ -7212,12 +6721,10 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
         ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
-         << (SAINT_BIT - 1));
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0,
-                                                    n);
+        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
@@ -7227,9 +6734,8 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
                 block_start++;
             } else {
                 fast_sint_t block_max_end =
-                    block_start +
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start + ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end > n) {
                     block_max_end = n;
                 }
@@ -7246,14 +6752,12 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
                         if (p > 0) {
                             p--;
                             SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
-                                     << (SAINT_BIT - 1));
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
                         }
                     }
                 } else {
                     libsais_final_sorting_scan_left_to_right_8u_block_omp(
-                        T, SA, induction_bucket, block_start, block_size,
-                        threads, thread_state);
+                        T, SA, induction_bucket, block_start, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -7272,22 +6776,20 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(
         (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
     if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0,
-                                                     n);
+        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
         for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start +
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end > n) {
                 block_end = n;
             }
 
             libsais_final_sorting_scan_left_to_right_32s_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache,
-                block_start, block_end - block_start, threads);
+                T, SA, induction_bucket, thread_state[0].state.cache, block_start,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -7295,16 +6797,16 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(
 #endif
 }
 
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                         sa_sint_t * RESTRICT SA,
+                                                         sa_sint_t * RESTRICT induction_bucket,
+                                                         fast_sint_t omp_block_start,
+                                                         fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
     sa_sint_t index = -1;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7358,15 +6860,16 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(
     return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
+                                                        sa_sint_t * RESTRICT I,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7424,15 +6927,15 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u(
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7474,15 +6977,15 @@ static void libsais_final_sorting_scan_right_to_left_8u(
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T,
+                                                         sa_sint_t * RESTRICT SA,
+                                                         sa_sint_t * RESTRICT induction_bucket,
+                                                         fast_sint_t omp_block_start,
+                                                         fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + 2 * prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
@@ -7533,16 +7036,15 @@ static void libsais_final_sorting_scan_right_to_left_32s(
 #if defined(_OPENMP)
 
 static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t i, j, count = 0;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7596,16 +7098,15 @@ static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
 }
 
 static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t i, j, count = 0;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7665,16 +7166,15 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
 }
 
 static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t i, j, count = 0;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
@@ -7694,16 +7194,14 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
         if (p0 > 0) {
             p0--;
             buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
             buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -7713,8 +7211,7 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
         if (p > 0) {
             p--;
             buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
         }
     }
 
@@ -7722,8 +7219,8 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
 }
 
 static void libsais_final_order_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
@@ -7742,9 +7239,8 @@ static void libsais_final_order_scan_right_to_left_8u_block_place(
 }
 
 static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count) {
+    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
@@ -7772,22 +7268,19 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
     for (j += 6; i < j; i += 2) {
         SA[--buckets[cache[i].symbol]] = cache[i].index;
         if ((cache[i + 1].index & rm) == 0) {
-            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
-                buckets[cache[i].symbol] + 1;
+            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1;
         }
     }
 }
 
 static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 1;
-         i < j; i += 2) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -7807,8 +7300,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
         SA[i + 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            cache[i + 0].index =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
             symbol0 = T[p0];
         }
         cache[i + 0].symbol = symbol0;
@@ -7816,8 +7308,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
         SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            cache[i + 1].index =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
             symbol1 = T[p1];
         }
         cache[i + 1].symbol = symbol1;
@@ -7828,8 +7319,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            cache[i].index =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
             symbol = T[p];
         }
         cache[i].symbol = symbol;
@@ -7843,8 +7333,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1,
-        j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
@@ -7864,8 +7353,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -7880,8 +7368,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -7898,8 +7385,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
                 if (np > 0) {
                     np--;
                     cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
-                              << (SAINT_BIT - 1));
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
                     cache[ni].symbol = T[np];
                 }
             }
@@ -7908,14 +7394,13 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
 }
 
 static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -7930,15 +7415,14 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_bwt_scan_right_to_left_8u(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
+                                                    omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -7946,8 +7430,7 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_bwt_scan_right_to_left_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -7956,8 +7439,7 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -7981,14 +7463,13 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
 }
 
 static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -8003,16 +7484,14 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_bwt_aux_scan_right_to_left_8u(
-                T, SA, rm, I, induction_bucket, omp_block_start,
-                omp_block_size);
+            libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket,
+                                                        omp_block_start, omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -8020,8 +7499,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -8030,8 +7508,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -8055,14 +7532,13 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
 }
 
 static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads,
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(                                   \
-            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
-                         omp_get_dynamic() == 0)
+        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
+                                                      block_size >= 64 * ALPHABET_SIZE && \
+                                                      omp_get_dynamic() == 0)
     #endif
     {
     #if defined(_OPENMP)
@@ -8077,15 +7553,14 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_right_to_left_8u(
-                T, SA, induction_bucket, omp_block_start, omp_block_size);
+            libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
+                                                        omp_block_size);
         }
     #if defined(_OPENMP)
         else {
@@ -8093,8 +7568,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
                 thread_state[omp_thread_num].state.count =
                     libsais_final_sorting_scan_right_to_left_8u_block_prepare(
                         T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache,
-                        omp_block_start, omp_block_size);
+                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -8103,8 +7577,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
             {
                 fast_sint_t t;
                 for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket =
-                        thread_state[t].state.buckets;
+                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                     fast_sint_t c;
                     for (c = 0; c < ALPHABET_SIZE; c += 1) {
                         sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -8128,12 +7601,11 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
 }
 
 static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
-    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+    sa_sint_t threads) {
     #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                      block_size >= 16384)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
     #endif
     {
     #if defined(_OPENMP)
@@ -8148,22 +7620,20 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
     #endif
         fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : block_size - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
         if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_right_to_left_32s(
-                T, SA, buckets, omp_block_start, omp_block_size);
+            libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start,
+                                                         omp_block_size);
         }
     #if defined(_OPENMP)
         else {
             {
                 libsais_final_sorting_scan_right_to_left_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start,
-                    omp_block_size);
+                    T, SA, cache - block_start, omp_block_start, omp_block_size);
             }
 
         #pragma omp barrier
@@ -8177,8 +7647,8 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
         #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(
-                    SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+                                                          omp_block_size);
             }
         }
     #endif
@@ -8194,8 +7664,7 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
     sa_sint_t index = -1;
 
     if (threads == 1 || n < 65536) {
-        index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket,
-                                                        0, n);
+        index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
@@ -8205,9 +7674,8 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
                 index = (sa_sint_t)block_start--;
             } else {
                 fast_sint_t block_max_end =
-                    block_start -
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start - ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end < 0) {
                     block_max_end = -1;
                 }
@@ -8231,8 +7699,7 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
                     }
                 } else {
                     libsais_final_bwt_scan_right_to_left_8u_block_omp(
-                        T, SA, induction_bucket, block_end + 1, block_size,
-                        threads, thread_state);
+                        T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -8247,11 +7714,10 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
 
 static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
     const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
-        libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I,
-                                                    induction_bucket, 0, n);
+        libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
@@ -8262,9 +7728,8 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
             } else {
                 fast_sint_t block_max_end =
                     block_start -
-                    ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                               16 * (fast_sint_t)threads) /
-                                              2);
+                    ((fast_sint_t)threads) *
+                        ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2);
                 if (block_max_end < 0) {
                     block_max_end = -1;
                 }
@@ -8291,8 +7756,8 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
                     }
                 } else {
                     libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
-                        T, SA, rm, I, induction_bucket, block_end + 1,
-                        block_size, threads, thread_state);
+                        T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads,
+                        thread_state);
                     block_start = block_end;
                 }
             }
@@ -8308,8 +7773,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
     sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0,
-                                                    n);
+        libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
@@ -8319,9 +7783,8 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
                 block_start--;
             } else {
                 fast_sint_t block_max_end =
-                    block_start -
-                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
-                                              16 * (fast_sint_t)threads);
+                    block_start - ((fast_sint_t)threads) *
+                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
                 if (block_max_end < -1) {
                     block_max_end = -1;
                 }
@@ -8338,14 +7801,12 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
                         if (p > 0) {
                             p--;
                             SA[--induction_bucket[T[p]]] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] > T[p])
-                                     << (SAINT_BIT - 1));
+                                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
                         }
                     }
                 } else {
                     libsais_final_sorting_scan_right_to_left_8u_block_omp(
-                        T, SA, induction_bucket, block_end + 1, block_size,
-                        threads, thread_state);
+                        T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -8361,23 +7822,20 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
     sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0,
-                                                     n);
+        libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
     else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
-             block_start = block_end) {
-            block_end = block_start -
-                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
             if (block_end < 0) {
                 block_end = -1;
             }
 
             libsais_final_sorting_scan_right_to_left_32s_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache,
-                block_end + 1, block_start - block_end, threads);
+                T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1,
+                block_start - block_end, threads);
         }
     }
 #else
@@ -8385,16 +7843,13 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
 #endif
 }
 
-static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                           sa_sint_t k,
+static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
                                            sa_sint_t * RESTRICT bucket_start,
-                                           sa_sint_t * RESTRICT bucket_end,
-                                           sa_sint_t threads) {
+                                           sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) {
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) \
-        num_threads(threads) if (threads > 1 && n >= 65536)
+    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
 #else
     UNUSED(threads);
     UNUSED(n);
@@ -8402,96 +7857,89 @@ static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
     for (c = 0; c < k; ++c) {
         if (bucket_end[c] > bucket_start[c]) {
             memset(&SA[bucket_start[c]], 0,
-                   ((size_t)bucket_end[c] - (size_t)bucket_start[c]) *
-                       sizeof(sa_sint_t));
+                   ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
         }
     }
 }
 
-static sa_sint_t libsais_induce_final_order_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt,
-    sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
+                                                   sa_sint_t * RESTRICT I,
+                                                   sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (!bwt) {
-        libsais_final_sorting_scan_left_to_right_8u_omp(
-            T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE],
+                                                        threads, thread_state);
         if (threads > 1 && n >= 65536) {
-            libsais_clear_lms_suffixes_omp(
-                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
-                &buckets[7 * ALPHABET_SIZE], threads);
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                                           &buckets[7 * ALPHABET_SIZE], threads);
         }
-        libsais_final_sorting_scan_right_to_left_8u_omp(
-            T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+        libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
+                                                        threads, thread_state);
         return 0;
     } else if (I != NULL) {
         libsais_final_bwt_aux_scan_left_to_right_8u_omp(
-            T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads,
-            thread_state);
+            T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
         if (threads > 1 && n >= 65536) {
-            libsais_clear_lms_suffixes_omp(
-                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
-                &buckets[7 * ALPHABET_SIZE], threads);
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                                           &buckets[7 * ALPHABET_SIZE], threads);
         }
         libsais_final_bwt_aux_scan_right_to_left_8u_omp(
-            T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads,
-            thread_state);
+            T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
         return 0;
     } else {
-        libsais_final_bwt_scan_left_to_right_8u_omp(
-            T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads,
+                                                    thread_state);
         if (threads > 1 && n >= 65536) {
-            libsais_clear_lms_suffixes_omp(
-                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
-                &buckets[7 * ALPHABET_SIZE], threads);
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                                           &buckets[7 * ALPHABET_SIZE], threads);
         }
-        return libsais_final_bwt_scan_right_to_left_8u_omp(
-            T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
+                                                           threads, thread_state);
     }
 }
 
-static void libsais_induce_final_order_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k],
-                                                     threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k],
-                                                     threads, thread_state);
+static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads,
+                                                     thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads,
+                                                     thread_state);
 }
 
-static void libsais_induce_final_order_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k],
-                                                     threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k],
-                                                     threads, thread_state);
+static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads,
+                                                     thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads,
+                                                     thread_state);
 }
 
-static void libsais_induce_final_order_32s_2k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k],
-                                                     threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k],
-                                                     threads, thread_state);
+static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads,
+                                                     thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads,
+                                                     thread_state);
 }
 
-static void libsais_induce_final_order_32s_1k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads,
-                                                     thread_state);
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
 
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_end_32s_1k(k, buckets);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads,
-                                                     thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
 }
 
 static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
@@ -8502,20 +7950,16 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     sa_sint_t i, j;
-    for (i = (sa_sint_t)omp_block_start,
-        j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size -
-            2 * (sa_sint_t)prefetch_distance - 3;
+    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start +
+                                             (sa_sint_t)omp_block_size -
+                                             2 * (sa_sint_t)prefetch_distance - 3;
          i < j; i += 4) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
-        libsais_prefetchw(
-            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
-        libsais_prefetchw(
-            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
-        libsais_prefetchw(
-            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
-        libsais_prefetchw(
-            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
 
         sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0];
         const sa_sint_t * Tq0 = &T[q0];
@@ -8578,9 +8022,11 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
     return f;
 }
 
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t m, fast_sint_t * pl,
+                                                                  fast_sint_t * pr,
+                                                                  fast_sint_t omp_block_start,
+                                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAl = &SA[0];
@@ -8628,8 +8074,7 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(
 
 #if defined(_OPENMP)
 
-static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
-                                               sa_sint_t m,
+static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m,
                                                fast_sint_t omp_block_start,
                                                fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
@@ -8638,9 +8083,8 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
 
     fast_sint_t i, j;
     sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
@@ -8664,8 +8108,8 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
 #endif
 
 static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t f = 0;
 
 #if defined(_OPENMP)
@@ -8684,20 +8128,18 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
 #endif
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : m - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
-                T, SA, m, 0, omp_block_start, omp_block_size);
+            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start,
+                                                                       omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.count =
-                    libsais_count_unique_suffixes(SA, m, omp_block_start,
-                                                  omp_block_size);
+                    libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -8709,13 +8151,11 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
                 }
 
                 if (omp_thread_num == omp_num_threads - 1) {
-                    f = (sa_sint_t)(count +
-                                    thread_state[omp_thread_num].state.count);
+                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                 }
 
                 libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
-                    T, SA, m, (sa_sint_t)count, omp_block_start,
-                    omp_block_size);
+                    T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
             }
         }
 #endif
@@ -8725,12 +8165,10 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
 }
 
 static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
-    sa_sint_t f, sa_sint_t threads,
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && \
-                                                  n >= 131072 && m < fs)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072 && m < fs)
 #endif
     {
 #if defined(_OPENMP)
@@ -8743,32 +8181,28 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride =
-            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1
-                ? omp_block_stride
-                : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : ((fast_sint_t)n >> 1) - omp_block_start;
 
         if (omp_num_threads == 1) {
             fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
-            libsais_compact_unique_and_nonunique_lms_suffixes_32s(
-                SA, m, &l, &r, omp_block_start, omp_block_size);
+            libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start,
+                                                                  omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.position =
-                    (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start +
-                    omp_block_size;
+                    (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
                 thread_state[omp_thread_num].state.count =
                     (fast_sint_t)m + omp_block_start + omp_block_size;
 
                 libsais_compact_unique_and_nonunique_lms_suffixes_32s(
                     SA, m, &thread_state[omp_thread_num].state.position,
-                    &thread_state[omp_thread_num].state.count, omp_block_start,
-                    omp_block_size);
+                    &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -8778,29 +8212,24 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
                 fast_sint_t t, position;
 
                 for (position = m, t = omp_num_threads - 1; t >= 0; --t) {
-                    fast_sint_t omp_block_end = t < omp_num_threads - 1
-                                                    ? omp_block_stride * (t + 1)
-                                                    : ((fast_sint_t)n >> 1);
-                    fast_sint_t count =
-                        ((fast_sint_t)m + ((fast_sint_t)n >> 1) +
-                         omp_block_end - thread_state[t].state.position);
+                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
+                                                                        : ((fast_sint_t)n >> 1);
+                    fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end -
+                                         thread_state[t].state.position);
 
                     if (count > 0) {
                         position -= count;
-                        memcpy(&SA[position],
-                               &SA[thread_state[t].state.position],
+                        memcpy(&SA[position], &SA[thread_state[t].state.position],
                                (size_t)count * sizeof(sa_sint_t));
                     }
                 }
 
-                for (position = (fast_sint_t)n + (fast_sint_t)fs,
-                    t = omp_num_threads - 1;
-                     t >= 0; --t) {
-                    fast_sint_t omp_block_end = t < omp_num_threads - 1
-                                                    ? omp_block_stride * (t + 1)
-                                                    : ((fast_sint_t)n >> 1);
-                    fast_sint_t count = ((fast_sint_t)m + omp_block_end -
-                                         thread_state[t].state.count);
+                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0;
+                     --t) {
+                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
+                                                                        : ((fast_sint_t)n >> 1);
+                    fast_sint_t count =
+                        ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
 
                     if (count > 0) {
                         position -= count;
@@ -8818,24 +8247,23 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
 }
 
 static sa_sint_t libsais_compact_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
-        T, SA, m, threads, thread_state);
-    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
-        SA, n, m, fs, f, threads, thread_state);
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t f =
+        libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads,
+                                                              thread_state);
 
     return f;
 }
 
-static void libsais_merge_unique_lms_suffixes_32s(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t n, sa_sint_t m, fast_sint_t l,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    const sa_sint_t * RESTRICT SAnm =
-        &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
 
     sa_sint_t i, j;
     fast_sint_t tmp = *SAnm++;
@@ -8885,18 +8313,17 @@ static void libsais_merge_unique_lms_suffixes_32s(
     }
 }
 
-static void libsais_merge_nonunique_lms_suffixes_32s(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                     sa_sint_t m, fast_sint_t l,
+                                                     fast_sint_t omp_block_start,
+                                                     fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    const sa_sint_t * RESTRICT SAnm =
-        &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
 
     fast_sint_t i, j;
     sa_sint_t tmp = *SAnm++;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
-         i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
         if (SA[i + 0] == 0) {
@@ -8926,8 +8353,8 @@ static void libsais_merge_nonunique_lms_suffixes_32s(
 }
 
 static void libsais_merge_unique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
@@ -8944,20 +8371,17 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_merge_unique_lms_suffixes_32s(
-                T, SA, n, m, 0, omp_block_start, omp_block_size);
+            libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(T, omp_block_start,
-                                                           omp_block_size);
+                    libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -8968,8 +8392,8 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
                     count += thread_state[t].state.count;
                 }
 
-                libsais_merge_unique_lms_suffixes_32s(
-                    T, SA, n, m, count, omp_block_start, omp_block_size);
+                libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start,
+                                                      omp_block_size);
             }
         }
 #endif
@@ -8977,8 +8401,8 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
 }
 
 static void libsais_merge_nonunique_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
@@ -8995,20 +8419,17 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
 #endif
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : m - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_merge_nonunique_lms_suffixes_32s(
-                SA, n, m, f, omp_block_start, omp_block_size);
+            libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
         else {
             {
                 thread_state[omp_thread_num].state.count =
-                    libsais_count_zero_marked_suffixes(SA, omp_block_start,
-                                                       omp_block_size);
+                    libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -9019,8 +8440,8 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
                     count += thread_state[t].state.count;
                 }
 
-                libsais_merge_nonunique_lms_suffixes_32s(
-                    SA, n, m, count, omp_block_start, omp_block_size);
+                libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start,
+                                                         omp_block_size);
             }
         }
 #endif
@@ -9028,112 +8449,96 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
 }
 
 static void libsais_merge_compacted_lms_suffixes_32s_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t f, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads,
-                                              thread_state);
-    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads,
-                                                 thread_state);
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
 }
 
 static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
-            T, SA, n, k, buckets, threads, thread_state);
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads,
+                                                                   thread_state);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
 
-        memcpy(&SA[n - m - 1 + f], &SA[0],
-               ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
         memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
 
-        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
-                                                     thread_state);
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
     } else {
-        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0,
-                                                     n);
+        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
     }
 }
 
 static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+    sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
         libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
 
-        memcpy(&SA[n - m - 1 + f], &SA[0],
-               ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
         memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
 
-        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
-                                                     thread_state);
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
     } else {
         libsais_gather_lms_suffixes_32s(T, SA, n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
     }
 }
 
-static sa_sint_t libsais_main_32s(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                  sa_sint_t k, sa_sint_t fs, sa_sint_t threads,
+                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
     if (k > 0 && fs / k >= 6) {
         sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 6
-                ? (sa_sint_t *)libsais_align_up(
-                      &SA[n + fs - 6 * k - alignment],
-                      (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment],
+                                                (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 6 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(
-            T, SA, n, k, buckets, threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets,
+                                                                       threads, thread_state);
         if (m > 1) {
             memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
 
             sa_sint_t first_lms_suffix = SA[n - m];
             sa_sint_t left_suffixes_count =
-                libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
-                    T, k, buckets, first_lms_suffix);
+                libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets,
+                                                                              first_lms_suffix);
 
-            libsais_radix_sort_lms_suffixes_32s_6k_omp(
-                T, SA, n, m, &buckets[4 * k], threads, thread_state);
-            libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k],
-                                                      threads);
+            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads,
+                                                       thread_state);
+            libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
 
             if (threads > 1 && n >= 65536) {
-                memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
-                       (size_t)m * sizeof(sa_sint_t));
+                memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
             }
 
-            libsais_initialize_buckets_for_partial_sorting_32s_6k(
-                T, k, buckets, first_lms_suffix, left_suffixes_count);
-            libsais_induce_partial_order_32s_6k_omp(
-                T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
-                threads, thread_state);
+            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix,
+                                                                  left_suffixes_count);
+            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix,
+                                                    left_suffixes_count, threads, thread_state);
 
-            sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
-                    SA, n, m, threads, thread_state);
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+                SA, n, m, threads, thread_state);
             if (names < m) {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
-                    T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f =
+                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
-                                     fs + n - 2 * m + f, threads,
-                                     thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+                                     threads, thread_state) != 0) {
                     return -2;
                 }
 
@@ -9145,15 +8550,13 @@ static sa_sint_t libsais_main_32s(
 
             libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
             libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
-            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
-                                              thread_state);
+            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
         } else {
             SA[0] = SA[n - 1];
 
             libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
             libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
-            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads,
-                                              thread_state);
+            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
         }
 
         return 0;
@@ -9161,37 +8564,31 @@ static sa_sint_t libsais_main_32s(
         sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 4
-                ? (sa_sint_t *)libsais_align_up(
-                      &SA[n + fs - 4 * k - alignment],
-                      (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment],
+                                                (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 4 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
-            T, SA, n, k, buckets, threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
+                                                                       threads, thread_state);
         if (m > 1) {
-            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
-                T, k, buckets, SA[n - m]);
+            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets,
+                                                                            SA[n - m]);
 
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
-                                                       threads, thread_state);
-            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1],
-                                                      threads);
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
+                                                       thread_state);
+            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
 
-            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1,
-                                                       buckets);
-            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets,
-                                                    threads, thread_state);
+            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
 
-            sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
-                    SA, n, m, threads, thread_state);
+            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+                SA, n, m, threads, thread_state);
             if (names < m) {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
-                    T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f =
+                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
-                                     fs + n - 2 * m + f, threads,
-                                     thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+                                     threads, thread_state) != 0) {
                     return -2;
                 }
 
@@ -9206,44 +8603,37 @@ static sa_sint_t libsais_main_32s(
 
         libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
         libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
-        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
-                                          thread_state);
+        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
 
         return 0;
     } else if (k > 0 && fs / k >= 2) {
         sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
             (fs - alignment) / k >= 2
-                ? (sa_sint_t *)libsais_align_up(
-                      &SA[n + fs - 2 * k - alignment],
-                      (size_t)alignment * sizeof(sa_sint_t))
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment],
+                                                (size_t)alignment * sizeof(sa_sint_t))
                 : &SA[n + fs - 2 * k];
 
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
-            T, SA, n, k, buckets, threads, thread_state);
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
+                                                                       threads, thread_state);
         if (m > 1) {
-            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
-                T, k, buckets, SA[n - m]);
+            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
 
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
-                                                       threads, thread_state);
-            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1,
-                                                       buckets);
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
+                                                       thread_state);
+            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
 
             libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
-            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets,
-                                                    threads, thread_state);
+            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
 
             sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
-                    T, SA, n, m, threads);
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
             if (names < m) {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
-                    T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f =
+                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
-                                     fs + n - 2 * m + f, threads,
-                                     thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+                                     threads, thread_state) != 0) {
                     return -2;
                 }
 
@@ -9260,20 +8650,19 @@ static sa_sint_t libsais_main_32s(
         libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
 
         libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
-        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads,
-                                          thread_state);
+        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
 
         return 0;
     } else {
-        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned(
-                                          (size_t)k * sizeof(sa_sint_t), 4096)
-                                    : (sa_sint_t *)NULL;
+        sa_sint_t * buffer =
+            fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096)
+                   : (sa_sint_t *)NULL;
 
         sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
         sa_sint_t * RESTRICT buckets =
-            fs - alignment >= k ? (sa_sint_t *)libsais_align_up(
-                                      &SA[n + fs - k - alignment],
-                                      (size_t)alignment * sizeof(sa_sint_t))
+            fs - alignment >= k
+                ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment],
+                                                (size_t)alignment * sizeof(sa_sint_t))
             : fs >= k ? &SA[n + fs - k]
                       : buffer;
 
@@ -9288,33 +8677,30 @@ static sa_sint_t libsais_main_32s(
 
         sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
         if (m > 1) {
-            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets,
-                                                    threads, thread_state);
+            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
 
             sa_sint_t names =
-                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
-                    T, SA, n, m, threads);
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
             if (names < m) {
                 if (buffer != NULL) {
                     libsais_free_aligned(buffer);
                     buckets = NULL;
                 }
 
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
-                    T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f =
+                    libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
-                                     fs + n - 2 * m + f, threads,
-                                     thread_state) != 0) {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+                                     threads, thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
-                    T, SA, n, m, fs, f, threads, thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads,
+                                                                      thread_state);
 
                 if (buckets == NULL) {
-                    buckets = buffer = (sa_sint_t *)libsais_alloc_aligned(
-                        (size_t)k * sizeof(sa_sint_t), 4096);
+                    buckets = buffer =
+                        (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
                 }
                 if (buckets == NULL) {
                     return -2;
@@ -9326,8 +8712,7 @@ static sa_sint_t libsais_main_32s(
             libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
         }
 
-        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads,
-                                          thread_state);
+        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
         libsais_free_aligned(buffer);
 
         return 0;
@@ -9335,45 +8720,39 @@ static sa_sint_t libsais_main_32s(
 }
 
 static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
-                                 sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
-                                 sa_sint_t r, sa_sint_t * RESTRICT I,
-                                 sa_sint_t fs, sa_sint_t * freq,
-                                 sa_sint_t threads,
-                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+                                 sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r,
+                                 sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq,
+                                 sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
-    sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(
-        T, SA, n, buckets, threads, thread_state);
+    sa_sint_t m =
+        libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
 
     libsais_initialize_buckets_start_and_end_8u(buckets, freq);
 
     if (m > 0) {
         sa_sint_t first_lms_suffix = SA[n - m];
         sa_sint_t left_suffixes_count =
-            libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
-                T, buckets, first_lms_suffix);
+            libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
 
         if (threads > 1 && n >= 65536) {
             memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
         }
-        libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads,
-                                               thread_state);
+        libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
         if (threads > 1 && n >= 65536) {
-            memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
-                   (size_t)m * sizeof(sa_sint_t));
+            memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
         }
 
-        libsais_initialize_buckets_for_partial_sorting_8u(
-            T, buckets, first_lms_suffix, left_suffixes_count);
+        libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix,
+                                                          left_suffixes_count);
         libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix,
-                                            left_suffixes_count, threads,
-                                            thread_state);
+                                            left_suffixes_count, threads, thread_state);
 
-        sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(
-            SA, n, m, fs, threads, thread_state);
+        sa_sint_t names =
+            libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
         if (names < m) {
-            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m,
-                                 threads, thread_state) != 0) {
+            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads,
+                                 thread_state) != 0) {
                 return -2;
             }
 
@@ -9386,23 +8765,20 @@ static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
         memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
     }
 
-    return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets,
-                                             threads, thread_state);
+    return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
 }
 
-static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n,
-                              sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
-                              sa_sint_t fs, sa_sint_t * freq,
-                              sa_sint_t threads) {
+static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
+                              sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
     LIBSAIS_THREAD_STATE * RESTRICT thread_state =
         threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
-    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
-        8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    sa_sint_t * RESTRICT buckets =
+        (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
 
-    sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
-                          ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs,
-                                            freq, threads, thread_state)
-                          : -2;
+    sa_sint_t index =
+        buckets != NULL && (thread_state != NULL || threads == 1)
+            ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
+            : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_thread_state(thread_state);
@@ -9410,34 +8786,30 @@ static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n,
     return index;
 }
 
-static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n,
-                            sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
+static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs,
+                            sa_sint_t threads) {
     LIBSAIS_THREAD_STATE * RESTRICT thread_state =
         threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
-    sa_sint_t index =
-        thread_state != NULL || threads == 1
-            ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
-            : -2;
+    sa_sint_t index = thread_state != NULL || threads == 1
+                          ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
+                          : -2;
 
     libsais_free_thread_state(thread_state);
 
     return index;
 }
 
-static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T,
-                                  sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt,
-                                  sa_sint_t r, sa_sint_t * I, sa_sint_t fs,
-                                  sa_sint_t * freq) {
-    return ctx != NULL && (ctx->buckets != NULL &&
-                           (ctx->thread_state != NULL || ctx->threads == 1))
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA,
+                                  sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
+                                  sa_sint_t fs, sa_sint_t * freq) {
+    return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
                ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq,
                                  (sa_sint_t)ctx->threads, ctx->thread_state)
                : -2;
 }
 
-static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
-                                sa_sint_t n) {
+static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
@@ -9461,8 +8833,8 @@ static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
 
 #if defined(_OPENMP)
 
-static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
-                                    sa_sint_t n, sa_sint_t threads) {
+static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n,
+                                    sa_sint_t threads) {
     #if defined(_OPENMP)
         #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
     #endif
@@ -9470,8 +8842,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
     #if defined(_OPENMP)
         fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride =
-            ((fast_sint_t)n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
                                          ? omp_block_stride
@@ -9483,8 +8854,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
         fast_sint_t omp_block_size = (fast_sint_t)n;
     #endif
 
-        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start,
-                            (sa_sint_t)omp_block_size);
+        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
     }
 }
 
@@ -9492,9 +8862,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
 
 void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); }
 
-void libsais_free_ctx(void * ctx) {
-    libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx);
-}
+void libsais_free_ctx(void * ctx) { libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); }
 
 s32 libsais(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
     if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
@@ -9528,8 +8896,7 @@ s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs) {
     return libsais_main_int(T, SA, n, k, fs, 1);
 }
 
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
-                s32 * freq) {
+s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
     if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
         return -1;
     } else if (n < 2) {
@@ -9545,8 +8912,7 @@ s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
         return 0;
     }
 
-    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL,
-                            fs, freq);
+    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
 }
 
 s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
@@ -9577,10 +8943,9 @@ s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
     return index;
 }
 
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                    s32 r, s32 * I) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
-        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+        ((r & (r - 1)) != 0) || (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9607,10 +8972,8 @@ s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
     return 0;
 }
 
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                    s32 fs, s32 * freq) {
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        (fs < 0)) {
+s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9625,20 +8988,17 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
         return n;
     }
 
-    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1,
-                                       0, NULL, fs, freq);
+    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
     if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
 
 #if defined(_OPENMP)
-        libsais_bwt_copy_8u_omp(
-            U + 1, A, index - 1,
-            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-        libsais_bwt_copy_8u_omp(
-            U + index, A + index, n - index,
-            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+        libsais_bwt_copy_8u_omp(U + 1, A, index - 1,
+                                (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+        libsais_bwt_copy_8u_omp(U + index, A + index, n - index,
+                                (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
 #else
         libsais_bwt_copy_8u(U + 1, A, index - 1);
         libsais_bwt_copy_8u(U + index, A + index, n - index);
@@ -9648,10 +9008,10 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
     return index;
 }
 
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                        s32 fs, s32 * freq, s32 r, s32 * I) {
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
+                        s32 r, s32 * I) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9667,16 +9027,14 @@ s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
         return 0;
     }
 
-    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs,
-                         freq) != 0) {
+    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
         return -2;
     }
 
     U[0] = T[n - 1];
 
 #if defined(_OPENMP)
-    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1,
-                            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
     libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0],
                             (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
 #else
@@ -9698,8 +9056,7 @@ void * libsais_create_ctx_omp(s32 threads) {
     return (void *)libsais_create_ctx_main(threads);
 }
 
-s32 libsais_omp(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq,
-                s32 threads) {
+s32 libsais_omp(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq, s32 threads) {
     if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
         return -1;
     } else if (n < 2) {
@@ -9735,10 +9092,8 @@ s32 libsais_int_omp(s32 * T, s32 * SA, s32 n, s32 k, s32 fs, s32 threads) {
     return libsais_main_int(T, SA, n, k, fs, threads);
 }
 
-s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                    s32 threads) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
-        (threads < 0)) {
+s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 threads) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9767,10 +9122,10 @@ s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
     return index;
 }
 
-s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs,
-                        s32 * freq, s32 r, s32 * I, s32 threads) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
-        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
+s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I,
+                        s32 threads) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+        ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9801,26 +9156,21 @@ s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs,
 
 #endif
 
-static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(
-    sa_sint_t threads) {
+static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) {
     LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
-        (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(
-            sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
-    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
-        ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(
-        (1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
+        (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+    sa_uint_t * RESTRICT bucket2 =
+        (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    u16 * RESTRICT fastbits =
+        (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
     sa_uint_t * RESTRICT buckets =
-        threads > 1
-            ? (sa_uint_t *)libsais_alloc_aligned(
-                  (size_t)threads *
-                      (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
-                      sizeof(sa_uint_t),
-                  4096)
-            : NULL;
+        threads > 1 ? (sa_uint_t *)libsais_alloc_aligned(
+                          (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+                              sizeof(sa_uint_t),
+                          4096)
+                    : NULL;
 
-    if (ctx != NULL && bucket2 != NULL && fastbits != NULL &&
-        (buckets != NULL || threads == 1)) {
+    if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) {
         ctx->bucket2 = bucket2;
         ctx->fastbits = fastbits;
         ctx->buckets = buckets;
@@ -9846,8 +9196,7 @@ static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
     }
 }
 
-static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T,
-                                            fast_sint_t n,
+static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n,
                                             sa_uint_t * RESTRICT count) {
     const fast_sint_t prefetch_distance = 256;
 
@@ -10107,9 +9456,10 @@ static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
     }
 }
 
-static void libsais_unbwt_compute_bigram_histogram_single(
-    const u8 * RESTRICT T, sa_uint_t * RESTRICT bucket1,
-    sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
+static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T,
+                                                          sa_uint_t * RESTRICT bucket1,
+                                                          sa_uint_t * RESTRICT bucket2,
+                                                          fast_uint_t index) {
     fast_uint_t sum, c;
     for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
         fast_uint_t prev = sum;
@@ -10123,8 +9473,7 @@ static void libsais_unbwt_compute_bigram_histogram_single(
                 if (sum < hi) {
                     hi = sum;
                 }
-                libsais_unbwt_compute_histogram(
-                    &T[prev], (fast_sint_t)(hi - prev), bucket2_p);
+                libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
             }
 
             {
@@ -10132,8 +9481,7 @@ static void libsais_unbwt_compute_bigram_histogram_single(
                 if (prev > lo) {
                     lo = prev;
                 }
-                libsais_unbwt_compute_histogram(
-                    &T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
+                libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
             }
         }
     }
@@ -10141,10 +9489,8 @@ static void libsais_unbwt_compute_bigram_histogram_single(
     libsais_unbwt_transpose_bucket2(bucket2);
 }
 
-static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2,
-                                             u16 * RESTRICT fastbits,
-                                             fast_uint_t lastc,
-                                             fast_uint_t shift) {
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                             fast_uint_t lastc, fast_uint_t shift) {
     fast_uint_t v, w, sum, c, d;
     for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
         if (c == lastc) {
@@ -10164,10 +9510,10 @@ static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2,
     }
 }
 
-static void libsais_unbwt_calculate_biPSI(
-    const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1,
-    sa_uint_t * RESTRICT bucket2, fast_uint_t index,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
+static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P,
+                                          sa_uint_t * RESTRICT bucket1,
+                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index,
+                                          fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
     {
         fast_sint_t i = omp_block_start, j = (fast_sint_t)index;
         if (omp_block_end < j) {
@@ -10180,9 +9526,7 @@ static void libsais_unbwt_calculate_biPSI(
 
             if (t != 0) {
                 fast_uint_t w =
-                    (((fast_uint_t)
-                          T[p + (fast_uint_t)(t >>
-                                              ((sizeof(fast_sint_t) * 8) - 1))])
+                    (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
                      << 8) +
                     c;
                 P[bucket2[w]++] = (sa_uint_t)i;
@@ -10202,9 +9546,7 @@ static void libsais_unbwt_calculate_biPSI(
 
             if (t != 0) {
                 fast_uint_t w =
-                    (((fast_uint_t)
-                          T[p + (fast_uint_t)(t >>
-                                              ((sizeof(fast_sint_t) * 8) - 1))])
+                    (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
                      << 8) +
                     c;
                 P[bucket2[w]++] = (sa_uint_t)i;
@@ -10213,12 +9555,9 @@ static void libsais_unbwt_calculate_biPSI(
     }
 }
 
-static void libsais_unbwt_init_single(const u8 * RESTRICT T,
-                                      sa_uint_t * RESTRICT P, sa_sint_t n,
-                                      const sa_sint_t * freq,
-                                      const sa_uint_t * RESTRICT I,
-                                      sa_uint_t * RESTRICT bucket2,
-                                      u16 * RESTRICT fastbits) {
+static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                      const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+                                      sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits) {
     sa_uint_t bucket1[ALPHABET_SIZE];
 
     fast_uint_t index = I[0];
@@ -10246,8 +9585,7 @@ static void libsais_unbwt_init_single(const u8 * RESTRICT T,
 
 static void libsais_unbwt_compute_bigram_histogram_parallel(
     const u8 * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1,
-    sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+    sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     fast_sint_t i;
     for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
         fast_uint_t c = T[i];
@@ -10256,21 +9594,17 @@ static void libsais_unbwt_compute_bigram_histogram_parallel(
 
         if (t != 0) {
             fast_uint_t w =
-                (((fast_uint_t)
-                      T[p +
-                        (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
-                 << 8) +
+                (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) +
                 c;
             bucket2[w]++;
         }
     }
 }
 
-static void libsais_unbwt_init_parallel(
-    const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
-    const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
-    sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-    sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
+static void libsais_unbwt_init_parallel(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                        const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+                                        sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                        sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_uint_t bucket1[ALPHABET_SIZE];
 
     fast_uint_t index = I[0];
@@ -10292,20 +9626,17 @@ static void libsais_unbwt_init_parallel(
             libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
         } else {
             sa_uint_t * RESTRICT bucket1_local =
-                buckets + omp_thread_num *
-                              (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+                buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
             sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;
 
             fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
             fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-            fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                             ? omp_block_stride
-                                             : n - omp_block_start;
+            fast_sint_t omp_block_size =
+                omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
             {
                 memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
-                libsais_unbwt_compute_histogram(T + omp_block_start,
-                                                omp_block_size, bucket1_local);
+                libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
             }
 
     #pragma omp barrier
@@ -10317,8 +9648,7 @@ static void libsais_unbwt_init_parallel(
 
                     fast_sint_t t;
                     for (t = 0; t < omp_num_threads;
-                         ++t, bucket1_temp +=
-                              ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+                         ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
                         fast_sint_t c;
                         for (c = 0; c < ALPHABET_SIZE; c += 1) {
                             sa_uint_t A = bucket1[c], B = bucket1_temp[c];
@@ -10347,11 +9677,9 @@ static void libsais_unbwt_init_parallel(
                     bucket1_local[c] = A + B;
                 }
 
-                memset(bucket2_local, 0,
-                       ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+                memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
                 libsais_unbwt_compute_bigram_histogram_parallel(
-                    T, index, bucket1_local, bucket2_local, omp_block_start,
-                    omp_block_size);
+                    T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
             }
 
     #pragma omp barrier
@@ -10359,8 +9687,7 @@ static void libsais_unbwt_init_parallel(
             {
                 fast_sint_t omp_bucket2_stride =
                     ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
-                fast_sint_t omp_bucket2_start =
-                    omp_thread_num * omp_bucket2_stride;
+                fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride;
                 fast_sint_t omp_bucket2_size =
                     omp_thread_num < omp_num_threads - 1
                         ? omp_bucket2_stride
@@ -10370,11 +9697,9 @@ static void libsais_unbwt_init_parallel(
 
                 fast_sint_t t;
                 for (t = 0; t < omp_num_threads;
-                     ++t, bucket2_temp +=
-                          ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+                     ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
                     fast_sint_t c;
-                    for (c = omp_bucket2_start;
-                         c < omp_bucket2_start + omp_bucket2_size; c += 1) {
+                    for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) {
                         sa_uint_t A = bucket2[c], B = bucket2_temp[c];
                         bucket2[c] = A + B;
                         bucket2_temp[c] = A;
@@ -10386,21 +9711,17 @@ static void libsais_unbwt_init_parallel(
 
     #pragma omp master
             {
-                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc,
-                                                 shift);
+                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
 
                 {
                     fast_sint_t t;
                     for (t = omp_num_threads - 1; t >= 1; --t) {
                         sa_uint_t * RESTRICT dst_bucket1 =
-                            buckets + t * (ALPHABET_SIZE +
-                                           (ALPHABET_SIZE * ALPHABET_SIZE));
+                            buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
                         sa_uint_t * RESTRICT src_bucket1 =
-                            dst_bucket1 -
-                            (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+                            dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
 
-                        memcpy(dst_bucket1, src_bucket1,
-                               ALPHABET_SIZE * sizeof(sa_uint_t));
+                        memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
                     }
 
                     memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
@@ -10416,9 +9737,8 @@ static void libsais_unbwt_init_parallel(
                     bucket2_local[c] = A + B;
                 }
 
-                libsais_unbwt_calculate_biPSI(
-                    T, P, bucket1_local, bucket2_local, index, omp_block_start,
-                    omp_block_start + omp_block_size);
+                libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index,
+                                              omp_block_start, omp_block_start + omp_block_size);
             }
 
     #pragma omp barrier
@@ -10428,8 +9748,7 @@ static void libsais_unbwt_init_parallel(
                 memcpy(
                     bucket2,
                     buckets + ALPHABET_SIZE +
-                        (omp_num_threads - 1) *
-                            (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
+                        (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
                     ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
             }
         }
@@ -10439,9 +9758,8 @@ static void libsais_unbwt_init_parallel(
 #endif
 
 static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t * i0, fast_uint_t k) {
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
 
     fast_uint_t i, p0 = *i0;
@@ -10461,9 +9779,8 @@ static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t r, fast_uint_t * i0,
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
                                    fast_uint_t * i1, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
@@ -10494,11 +9811,9 @@ static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2,
-                                   fast_uint_t k) {
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10538,11 +9853,10 @@ static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2,
-                                   fast_uint_t * i3, fast_uint_t k) {
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10592,12 +9906,10 @@ static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2,
-                                   fast_uint_t * i3, fast_uint_t * i4,
-                                   fast_uint_t k) {
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t * i4, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10657,12 +9969,10 @@ static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 }
 
 static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                   sa_uint_t * RESTRICT bucket2,
-                                   u16 * RESTRICT fastbits, fast_uint_t shift,
-                                   fast_uint_t r, fast_uint_t * i0,
-                                   fast_uint_t * i1, fast_uint_t * i2,
-                                   fast_uint_t * i3, fast_uint_t * i4,
-                                   fast_uint_t * i5, fast_uint_t k) {
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10731,11 +10041,12 @@ static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     *i5 = p5;
 }
 
-static void libsais_unbwt_decode_7(
-    u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
-    u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-    fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
-    fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
+static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
+                                   fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10744,8 +10055,7 @@ static void libsais_unbwt_decode_7(
     u16 * RESTRICT U5 = (u16 *)(void *)(((u8 *)U4) + r);
     u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
 
-    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
-                   p6 = *i6;
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
 
     for (i = 0; i != k; ++i) {
         u16 c0 = fastbits[p0 >> shift];
@@ -10815,11 +10125,12 @@ static void libsais_unbwt_decode_7(
     *i6 = p6;
 }
 
-static void libsais_unbwt_decode_8(
-    u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
-    u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
-    fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
-    fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) {
+static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
+                                   fast_uint_t * i7, fast_uint_t k) {
     u16 * RESTRICT U0 = (u16 *)(void *)U;
     u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
     u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10829,8 +10140,7 @@ static void libsais_unbwt_decode_8(
     u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
     u16 * RESTRICT U7 = (u16 *)(void *)(((u8 *)U6) + r);
 
-    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
-                   p6 = *i6, p7 = *i7;
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
 
     for (i = 0; i != k; ++i) {
         u16 c0 = fastbits[p0 >> shift];
@@ -10909,10 +10219,8 @@ static void libsais_unbwt_decode_8(
     *i7 = p7;
 }
 
-static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
-                                 sa_sint_t n, sa_sint_t r,
-                                 const sa_uint_t * RESTRICT I,
-                                 sa_uint_t * RESTRICT bucket2,
+static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r,
+                                 const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
                                  u16 * RESTRICT fastbits, fast_sint_t blocks,
                                  fast_uint_t reminder) {
     fast_uint_t shift = 0;
@@ -10922,11 +10230,10 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
     fast_uint_t offset = 0;
 
     while (blocks > 8) {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
-                    i5 = I[5], i6 = I[6], i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
-                               &i6, &i7, (fast_uint_t)r >> 1);
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
+                    i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
         I += 8;
         blocks -= 8;
         offset += 8 * (fast_uint_t)r;
@@ -10934,86 +10241,69 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
 
     if (blocks == 1) {
         fast_uint_t i0 = I[0];
-        libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0,
-                               reminder >> 1);
+        libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
     } else if (blocks == 2) {
         fast_uint_t i0 = I[0], i1 = I[1];
-        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, reminder >> 1);
-        libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, &i0,
+        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               reminder >> 1);
+        libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 3) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
-        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
-        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1,
-                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, reminder >> 1);
+        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 4) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
-        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3,
-                               reminder >> 1);
-        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, reminder >> 1);
+        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 5) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
-        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
-                               reminder >> 1);
-        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
-                               &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
+        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, reminder >> 1);
+        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 6) {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
-                    i5 = I[5];
-        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
-                               reminder >> 1);
-        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
-                               &i3, &i4,
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
+        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, reminder >> 1);
+        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else if (blocks == 7) {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
-                    i5 = I[5], i6 = I[6];
-        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift,
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
+        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, &i6, reminder >> 1);
+        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
                                (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
-                               &i6, reminder >> 1);
-        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
-                               &i3, &i4, &i5,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     } else {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
-                    i5 = I[5], i6 = I[6], i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
-                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
-                               &i6, &i7, reminder >> 1);
-        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2,
-                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
-                               &i3, &i4, &i5, &i6,
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
+                    i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1);
+        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6,
                                ((fast_uint_t)r >> 1) - (reminder >> 1));
     }
 }
 
-static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U,
-                                     sa_uint_t * RESTRICT P, sa_sint_t n,
-                                     sa_sint_t r, const sa_uint_t * RESTRICT I,
-                                     sa_uint_t * RESTRICT bucket2,
-                                     u16 * RESTRICT fastbits,
+static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+                                     sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I,
+                                     sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
                                      sa_sint_t threads) {
     fast_uint_t lastc = T[0];
     fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
-    fast_uint_t reminder =
-        (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
+    fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
 
 #if defined(_OPENMP)
     fast_sint_t max_threads = blocks < threads ? blocks : threads;
-    #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && \
-                                                      n >= 65536)
+    #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
@@ -11028,31 +10318,27 @@ static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U,
 
         fast_sint_t omp_block_stride = blocks / omp_num_threads;
         fast_sint_t omp_block_reminder = blocks % omp_num_threads;
-        fast_sint_t omp_block_size =
-            omp_block_stride + (omp_thread_num < omp_block_reminder);
+        fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
         fast_sint_t omp_block_start =
             omp_block_stride * omp_thread_num +
-            (omp_thread_num < omp_block_reminder ? omp_thread_num
-                                                 : omp_block_reminder);
+            (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
 
-        libsais_unbwt_decode(
-            U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
-            fastbits, omp_block_size,
-            omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
+        libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
+                             fastbits, omp_block_size,
+                             omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
     }
 
     U[n - 1] = (u8)lastc;
 }
 
-static sa_sint_t libsais_unbwt_core(
-    const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
-    const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I,
-    sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-    sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+                                    sa_sint_t n, const sa_sint_t * freq, sa_sint_t r,
+                                    const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
+                                    u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
+                                    sa_sint_t threads) {
 #if defined(_OPENMP)
     if (threads > 1 && n >= 262144) {
-        libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits,
-                                    buckets, threads);
+        libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
     } else
 #else
     UNUSED(buckets);
@@ -11065,33 +10351,30 @@ static sa_sint_t libsais_unbwt_core(
     return 0;
 }
 
-static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P,
-                                    sa_sint_t n, const sa_sint_t * freq,
-                                    sa_sint_t r, const sa_uint_t * I,
+static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n,
+                                    const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I,
                                     sa_sint_t threads) {
     fast_uint_t shift = 0;
     while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
         shift++;
     }
 
-    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
-        ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(
-        ((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
+    sa_uint_t * RESTRICT bucket2 =
+        (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    u16 * RESTRICT fastbits =
+        (u16 *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
     sa_uint_t * RESTRICT buckets =
         threads > 1 && n >= 262144
             ? (sa_uint_t *)libsais_alloc_aligned(
-                  (size_t)threads *
-                      (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+                  (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
                       sizeof(sa_uint_t),
                   4096)
             : NULL;
 
-    sa_sint_t index = bucket2 != NULL && fastbits != NULL &&
-                              (buckets != NULL || threads == 1 || n < 262144)
-                          ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2,
-                                               fastbits, buckets, threads)
-                          : -2;
+    sa_sint_t index =
+        bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+            ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+            : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_aligned(fastbits);
@@ -11100,38 +10383,33 @@ static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P,
     return index;
 }
 
-static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx,
-                                        const u8 * T, u8 * U, sa_uint_t * P,
-                                        sa_sint_t n, const sa_sint_t * freq,
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U,
+                                        sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
                                         sa_sint_t r, const sa_uint_t * I) {
     return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL &&
                    (ctx->buckets != NULL || ctx->threads == 1)
-               ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2,
-                                    ctx->fastbits, ctx->buckets,
-                                    (sa_sint_t)ctx->threads)
+               ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits,
+                                    ctx->buckets, (sa_sint_t)ctx->threads)
                : -2;
 }
 
-void * libsais_unbwt_create_ctx(void) {
-    return (void *)libsais_unbwt_create_ctx_main(1);
-}
+void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); }
 
 void libsais_unbwt_free_ctx(void * ctx) {
     libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
 }
 
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                  s32 i) {
+s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
     return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
 }
 
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
-                      const s32 * freq, s32 i) {
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+                      s32 i) {
     return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
 }
 
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                      s32 r, const s32 * I) {
+s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+                      const s32 * I) {
     if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
         ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
         return -1;
@@ -11152,12 +10430,11 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
         }
     }
 
-    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
-                              (const sa_uint_t *)I, 1);
+    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
 }
 
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
-                          s32 n, const s32 * freq, s32 r, const s32 * I) {
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+                          s32 r, const s32 * I) {
     if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
         ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
         return -1;
@@ -11178,9 +10455,8 @@ s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
         }
     }
 
-    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U,
-                                  (sa_uint_t *)A, n, freq, r,
-                                  (const sa_uint_t *)I);
+    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq,
+                                  r, (const sa_uint_t *)I);
 }
 
 #if defined(_OPENMP)
@@ -11194,16 +10470,14 @@ void * libsais_unbwt_create_ctx_omp(s32 threads) {
     return (void *)libsais_unbwt_create_ctx_main(threads);
 }
 
-s32 libsais_unbwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
-                      s32 i, s32 threads) {
+s32 libsais_unbwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i, s32 threads) {
     return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
 }
 
-s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n,
-                          const s32 * freq, s32 r, const s32 * I, s32 threads) {
+s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+                          const s32 * I, s32 threads) {
     if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) ||
-        (threads < 0)) {
+        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) {
         return -1;
     } else if (n <= 1) {
         if (I[0] != n) {
@@ -11223,23 +10497,20 @@ s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n,
     }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
-    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
-                              (const sa_uint_t *)I, threads);
+    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
 }
 
 #endif
 
-static void libsais_compute_phi(const sa_sint_t * RESTRICT SA,
-                                sa_sint_t * RESTRICT PLCP, sa_sint_t n,
-                                fast_sint_t omp_block_start,
+static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
+                                sa_sint_t n, fast_sint_t omp_block_start,
                                 fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
     sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
 
@@ -11263,9 +10534,8 @@ static void libsais_compute_phi(const sa_sint_t * RESTRICT SA,
     }
 }
 
-static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA,
-                                    sa_sint_t * RESTRICT PLCP, sa_sint_t n,
-                                    sa_sint_t threads) {
+static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
+                                    sa_sint_t n, sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
@@ -11281,24 +10551,20 @@ static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA,
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_compute_plcp(const u8 * RESTRICT T,
-                                 sa_sint_t * RESTRICT PLCP, fast_sint_t n,
-                                 fast_sint_t omp_block_start,
-                                 fast_sint_t omp_block_size) {
+static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n,
+                                 fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l = 0;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance;
-         i < j; i += 1) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j;
+         i += 1) {
         libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]);
 
         fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
@@ -11321,8 +10587,7 @@ static void libsais_compute_plcp(const u8 * RESTRICT T,
     }
 }
 
-static void libsais_compute_plcp_omp(const u8 * RESTRICT T,
-                                     sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
                                      sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
@@ -11339,25 +10604,21 @@ static void libsais_compute_plcp_omp(const u8 * RESTRICT T,
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP,
-                                const sa_sint_t * RESTRICT SA,
-                                sa_sint_t * RESTRICT LCP,
-                                fast_sint_t omp_block_start,
+static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+                                sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start,
                                 fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start,
-        j = omp_block_start + omp_block_size - prefetch_distance - 3;
-         i < j; i += 4) {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
 
@@ -11376,10 +10637,8 @@ static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP,
     }
 }
 
-static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP,
-                                    const sa_sint_t * RESTRICT SA,
-                                    sa_sint_t * RESTRICT LCP, sa_sint_t n,
-                                    sa_sint_t threads) {
+static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+                                    sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) {
 #if defined(_OPENMP)
     #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
@@ -11395,9 +10654,8 @@ static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP,
 #endif
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : n - omp_block_start;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
     }
@@ -11436,10 +10694,8 @@ s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n) {
 
 #if defined(_OPENMP)
 
-s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n,
-                     s32 threads) {
-    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) ||
-        (threads < 0)) {
+s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n, s32 threads) {
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) {
         return -1;
     } else if (n <= 1) {
         if (n == 1) {
@@ -11456,10 +10712,8 @@ s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n,
     return 0;
 }
 
-s32 libsais_lcp_omp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n,
-                    s32 threads) {
-    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) ||
-        (threads < 0)) {
+s32 libsais_lcp_omp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n, s32 threads) {
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) {
         return -1;
     } else if (n <= 1) {
         if (n == 1) {
diff --git a/src/lzp.c b/src/lzp.c
new file mode 100644
index 0000000..e49428e
--- /dev/null
+++ b/src/lzp.c
@@ -0,0 +1,239 @@
+
+// Lempel Ziv Prediction code.
+// TODO: Move the LUT allocation out of the block coding routines to save some clock cycles.
+
+#include <memory.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+#define MATCH 0xf2
+
+static inline s32 num_blocks(s32 n) {  // Sub-block count that lzp_compress uses for n input bytes.
+    if (n >= MiB(16)) return 8;
+    if (n >= MiB(4)) return 4;
+    if (n >= KiB(256)) return 2;
+    return 1;
+}
+
+// Encodes [in, in_end) into [out, out_end). `lut` maps a hash of the previous four bytes to the
+// last input offset seen in that context; 0 means "empty", so callers pass a zeroed table with
+// mask + 1 entries. A predicted run of at least m_len bytes is emitted as MATCH plus a length
+// code. Returns the encoded size, or -1 once the output gets within 8 bytes of out_end.
+static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out,
+                            u8 * out_end, s32 * restrict lut, s32 mask, s32 m_len) {
+    const u8 *ins = in, *outs = out;
+    const u8 * out_eob = out_end - 8;  // Slack for the short unchecked writes below.
+    const u8 * heur = in;              // Furthest end of a rejected (too short) match so far.
+
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;  // The first context is copied verbatim.
+    u32 ctx = in[-1] | (in[-2] << 8) | (in[-3] << 16) | ((u32)in[-4] << 24);
+
+    while (in < in_end - m_len - 32 && out < out_eob) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+        if (val > 0) {
+            const u8 * restrict ref = ins + val;
+            if (memcmp(in + m_len - 4, ref + m_len - 4, sizeof(u32)) == 0 &&
+                memcmp(in, ref, sizeof(u32)) == 0) {
+                // Skip candidates that differ where an earlier, too-short match ended.
+                if (heur > in && memcmp(heur, ref + (heur - in), sizeof(u32)) != 0) goto not_found;
+
+                s32 len = 4;
+                for (; in + len < in_end - m_len - 32; len += sizeof(u32)) {
+                    if (memcmp(in + len, ref + len, sizeof(u32)) != 0) break;
+                }
+
+                if (len < m_len) {
+                    if (heur < in + len) heur = in + len;
+                    goto not_found;
+                }
+
+                // The scan above moves in 4-byte steps; extend by up to three single bytes.
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+
+                in += len;
+                ctx = in[-1] | (in[-2] << 8) | (in[-3] << 16) | ((u32)in[-4] << 24);
+
+                // Length code: len - m_len as a run of 254s closed by a byte below 254.
+                *out++ = MATCH;
+                len -= m_len;
+                while (len >= 254) {
+                    len -= 254;
+                    *out++ = 254;
+                    if (out >= out_eob) break;
+                }
+                *out++ = len;
+            } else {
+            not_found:;
+                u8 next = *out++ = *in++;
+                ctx = ctx << 8 | next;
+                if (next == MATCH) *out++ = 255;  // 255 after MATCH marks a literal MATCH byte.
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+
+    // Tail: too close to in_end to search for matches, so only (escaped) literals are emitted.
+    ctx = in[-1] | (in[-2] << 8) | (in[-3] << 16) | ((u32)in[-4] << 24);
+    while (in < in_end && out < out_eob) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+        u8 next = *out++ = *in++;
+        ctx = ctx << 8 | next;
+        if (next == MATCH && val > 0) *out++ = 255;
+    }
+    return out >= out_eob ? -1 : (s32)(out - outs);
+}
+
+// Decodes the stream [in, in_end) produced by lzp_encode_block into `out` and returns the number
+// of bytes written, or -1 on allocation failure or truncated input. The capacity of `out` is
+// not known here, so the caller has to supply a buffer large enough for the decoded block.
+static s32 lzp_decode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out, s32 hash,
+                            s32 m_len) {
+    if (in_end - in < 4) return -1;
+
+    s32 * restrict lut = calloc((size_t)1 << hash, sizeof(s32));
+    if (!lut) return -1;
+
+    u32 mask = (s32)(1 << hash) - 1;
+    const u8 * outs = out;
+
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;
+    u32 ctx = out[-1] | (out[-2] << 8) | (out[-3] << 16) | ((u32)out[-4] << 24);
+
+    while (in < in_end) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
+        s32 val = lut[idx];
+        lut[idx] = (s32)(out - outs);
+        if (*in == MATCH && val > 0) {
+            if (++in == in_end) goto truncated;  // MATCH is always followed by a code byte.
+            if (*in != 255) {
+                s32 len = m_len;
+                do {  // Length code: a run of 254s closed by a byte below 254.
+                    if (in == in_end) goto truncated;
+                    len += *in;
+                } while (*in++ == 254);
+
+                // Copy byte by byte: the source may overlap the bytes being produced.
+                for (const u8 * ref = outs + val; len > 0; --len) *out++ = *ref++;
+                ctx = out[-1] | (out[-2] << 8) | (out[-3] << 16) | ((u32)out[-4] << 24);
+            } else {
+                in++;  // 255 escapes a literal MATCH byte.
+                ctx = (ctx << 8) | (*out++ = MATCH);
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+    free(lut);
+    return (s32)(out - outs);
+truncated:
+    free(lut);
+    return -1;
+}
+
+// Encodes insz bytes at `in` into at most outsz bytes at `out` using a freshly allocated, zeroed
+// hash table of 1 << hash entries. Returns what lzp_encode_block returns, or -1 when the input
+// is too short (insz - m_len < 32) or the table cannot be allocated.
+static s32 lzp_encode_fresh(const u8 * in, s32 insz, u8 * out, s32 outsz, s32 hash, s32 m_len) {
+    if (insz - m_len < 32) return -1;
+
+    s32 * lut = calloc(1 << hash, sizeof(s32));
+    if (!lut) return -1;
+
+    s32 r = lzp_encode_block(in, in + insz, out, out + outsz, lut, (s32)(1 << hash) - 1, m_len);
+    free(lut);
+    return r;
+}
+
+// Compresses n bytes from `in` into `out` and returns the number of bytes written, or -1 on
+// failure. out[0] receives the block count chosen by num_blocks(n):
+//  - one block: the encoded stream follows directly; an encoder failure is reported as -1;
+//  - several blocks: a table of (raw size, stored size) s32 pairs comes first, then the block
+//    payloads. A block the encoder rejects is stored verbatim (stored size == raw size), and
+//    -1 is returned only if even that copy does not fit.
+s32 lzp_compress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
+    const s32 nblk = num_blocks(n);
+
+    if (nblk == 1) {
+        s32 r = lzp_encode_fresh(in, n, out + 1, n - 2, hash, m_len);
+        if (r < 0) return r;
+
+        out[0] = 1;
+        return r + 1;
+    }
+
+    s32 out_ptr = 1 + 8 * nblk;  // Payloads start right after the size table.
+    out[0] = nblk;
+
+    for (s32 b_id = 0; b_id < nblk; ++b_id) {
+        // Every block takes n / nblk bytes; the last one also takes the remainder.
+        const s32 ins = b_id * (n / nblk);
+        const s32 insz = (b_id == nblk - 1) ? n - ins : n / nblk;
+
+        // The encoder may use at most insz bytes and never more than what is left of n.
+        const s32 room = n - out_ptr;
+        const s32 outsz = insz > room ? room : insz;
+
+        s32 r = lzp_encode_fresh(in + ins, insz, out + out_ptr, outsz, hash, m_len);
+        if (r < 0) {
+            // Fall back to a verbatim copy, provided it still fits.
+            if (out_ptr + insz >= n) return -1;
+            memcpy(out + out_ptr, in + ins, insz);
+            r = insz;
+        }
+
+        // Record this block's (raw size, stored size) table entry.
+        u8 * entry = out + 1 + 8 * b_id;
+        memcpy(entry + 0, &insz, sizeof(s32));
+        memcpy(entry + 4, &r, sizeof(s32));
+
+        out_ptr += r;
+    }
+
+    return out_ptr;
+}
+
+// Inverse of lzp_compress for the n bytes at `in`. in[0] is the block count; with more than one
+// block it is followed by a table of (decoded size, stored size) s32 pairs and then the block
+// payloads, where equal sizes mark a block stored verbatim. Returns the decoded size, or -1 for
+// malformed input or a failed block. The capacity of `out` is not known here, so the caller must
+// supply a buffer large enough for the decoded data.
+s32 lzp_decompress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
+    if (n < 1) return -1;
+
+    s32 nblk = in[0];
+    if (nblk == 1) return lzp_decode_block(in + 1, in + n, out, hash, m_len);
+
+    s32 in_ptr = 1 + 8 * nblk, out_ptr = 0;
+    if (in_ptr > n) return -1;  // The size table itself is cut short.
+
+    for (s32 b_id = 0; b_id < nblk; ++b_id) {
+        // Table entries sit at odd offsets, so read them with memcpy, not through an s32 *.
+        s32 insz, outsz;
+        memcpy(&outsz, in + 1 + 8 * b_id + 0, sizeof(s32));
+        memcpy(&insz, in + 1 + 8 * b_id + 4, sizeof(s32));
+        if (insz < 0 || outsz < 0 || insz > n - in_ptr) return -1;
+
+        s32 dec = insz;
+        if (insz != outsz)
+            dec = lzp_decode_block(in + in_ptr, in + in_ptr + insz, out + out_ptr, hash, m_len);
+        else
+            memcpy(out + out_ptr, in + in_ptr, insz);
+
+        // A failed block, or one whose size disagrees with the table, invalidates the stream.
+        if (dec != outsz) return -1;
+
+        in_ptr += insz;
+        out_ptr += outsz;
+    }
+
+    return out_ptr;
+}
diff --git a/src/main.c b/src/main.c
index cce43cc..2af0d42 100644
--- a/src/main.c
+++ b/src/main.c
@@ -58,8 +58,7 @@ int main(int argc, char * argv[]) {
     }
 
     if (mode == 0) {
-        fprintf(stderr, "Usage: %s [-e/-d/-t] [-b block_size] input output\n",
-                argv[0]);
+        fprintf(stderr, "Usage: %s [-e/-d/-t] [-b block_size] input output\n", argv[0]);
         fprintf(stderr,
                 "If input or output are not specified, they default to stdin "
                 "and stdout.\n");
@@ -121,8 +120,7 @@ int main(int argc, char * argv[]) {
         }
     }
 
-    struct block_encoder_state * block_encoder_state =
-        new_block_encoder_state(block_size);
+    struct block_encoder_state * block_encoder_state = new_block_encoder_state(block_size);
 
     if (block_encoder_state == NULL) {
         fprintf(stderr, "Failed to create a block encoder state.\n");
@@ -131,11 +129,9 @@ int main(int argc, char * argv[]) {
 
     if (mode == 1)
         while (commit_read(block_encoder_state,
-                           read(input_des, get_buffer(block_encoder_state),
-                                block_size)) > 0) {
+                           read(input_des, get_buffer(block_encoder_state), block_size)) > 0) {
             if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n",
-                        str_last_error(block_encoder_state));
+                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
                 return 1;
             }
             struct encoding_result r = encode_block(block_encoder_state);
@@ -150,8 +146,7 @@ int main(int argc, char * argv[]) {
         s32 read_size;
         while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
             if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n",
-                        str_last_error(block_encoder_state));
+                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
                 return 1;
             }
             struct encoding_result r = decode_block(block_encoder_state);
@@ -166,22 +161,19 @@ int main(int argc, char * argv[]) {
         s32 read_size;
         while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
             if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to read data: %s\n",
-                        str_last_error(block_encoder_state));
+                fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
                 return 1;
             }
             decode_block(block_encoder_state);
             if (get_last_error(block_encoder_state) != BZ3_OK) {
-                fprintf(stderr, "Failed to decode data: %s\n",
-                        str_last_error(block_encoder_state));
+                fprintf(stderr, "Failed to decode data: %s\n", str_last_error(block_encoder_state));
                 return 1;
             }
         }
     }
 
     if (get_last_error(block_encoder_state) != BZ3_OK) {
-        fprintf(stderr, "Failed to read data: %s\n",
-                str_last_error(block_encoder_state));
+        fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
         return 1;
     }
 
diff --git a/src/mtf.c b/src/mtf.c
index e18f646..a2f6cf6 100644
--- a/src/mtf.c
+++ b/src/mtf.c
@@ -55,8 +55,7 @@ void mtf_decode(struct mtf_state * mtf, u8 * src, u8 * dst, u32 count) {
 
         mtf->prev[c] = mtf->curr[c] = i;
 
-        for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--)
-            mtf->ranks[r] = mtf->ranks[r - 1];
+        for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--) mtf->ranks[r] = mtf->ranks[r - 1];
 
         mtf->ranks[r] = c;
     }
diff --git a/src/srt.c b/src/srt.c
index c5c1e38..d292abe 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -32,7 +32,8 @@ static s32 preprocess(const u32 * freqs, u8 * symbols) {
         for (u32 i = h; i < nb_symbols; i++) {
             const s32 t = symbols[i];
             s32 b = i - h;
-            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
+            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] ||
+                                (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
                 symbols[b + h] = symbols[b];
                 b -= h;
             }
diff --git a/src/txt.c b/src/txt.c
index 7968733..316d04f 100644
--- a/src/txt.c
+++ b/src/txt.c
@@ -4,7 +4,7 @@
 #include <math.h>
 
 #ifdef STANDALONE
-#include <stdio.h>
+    #include <stdio.h>
 #endif
 
 int is_text(const u8 * data, s32 len) {
@@ -13,9 +13,8 @@ int is_text(const u8 * data, s32 len) {
 #endif
 
     s32 histogram[256] = { 0 };
-    for(s32 i = 0; i < len; i++)
-        histogram[data[i]]++;
-    
+    for (s32 i = 0; i < len; i++) histogram[data[i]]++;
+
     // Text criterions:
     // 1. Shannon entropy is between 4.5 and 5.2.
     // 2. Majority of the document must be uppercase/lowercase numbers.
@@ -24,9 +23,9 @@ int is_text(const u8 * data, s32 len) {
 
     // Step 1
     double entropy = 0;
-    for(s32 i = 0; i < 256; i++) {
+    for (s32 i = 0; i < 256; i++) {
         double p = (double)histogram[i] / len;
-        if(p == 0) continue;
+        if (p == 0) continue;
         entropy += p * log2(p);
     }
     entropy = -entropy;
@@ -35,34 +34,32 @@ int is_text(const u8 * data, s32 len) {
     printf("Shannon entropy: %lf\n", entropy);
 #endif
 
-    if(entropy > 5.4 || entropy < 4.5)
-        return 0;
-    
+    if (entropy > 5.4 || entropy < 4.5) return 0;
+
     // Step 2
     s32 letters = 0;
     s32 whitespace = 0;
-    for(s32 i = 0; i < 256; i++) {
-        if(i >= 'A' && i <= 'Z')
+    for (s32 i = 0; i < 256; i++) {
+        if (i >= 'A' && i <= 'Z')
             letters += histogram[i];
-        else if(i >= 'a' && i <= 'z')
+        else if (i >= 'a' && i <= 'z')
             letters += histogram[i];
-        else if(i >= '0' && i <= '9')
+        else if (i >= '0' && i <= '9')
             letters += histogram[i];
-        else if(i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
+        else if (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
             whitespace += histogram[i];
     }
 
 #ifdef STANDALONE
-    printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace, (double)(letters+whitespace) / len);
+    printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace,
+           (double)(letters + whitespace) / len);
 #endif
 
-    if((double)(letters+whitespace) / len < 0.6)
-        return 0;
+    if ((double)(letters + whitespace) / len < 0.6) return 0;
 
     // Step 3
     double letters_ratio = (double)letters / whitespace;
-    if(letters_ratio < 2 || letters_ratio > 9)
-        return 0;
+    if (letters_ratio < 2 || letters_ratio > 9) return 0;
 
 #ifdef STANDALONE
     printf("Letter to whitespace ratio: %lf\n", letters_ratio);
tab: 248 wrap: offon