:: commit f2ce4019f546e3ac262e814511fd73fd12aca09c

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-02 10:01

parents: 52b79300fa

update clang format

diff --git a/.clang-format b/.clang-format
index 52efccd..ef36a0a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,8 +2,10 @@
 BasedOnStyle: Google
 IndentPPDirectives: BeforeHash
 IndentWidth: '4'
+DerivePointerAlignment: false
 PointerAlignment: Middle
 TabWidth: '4'
 UseTab: Never
+Cpp11BracedListStyle: false
 
 ...
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 926b26e..478f1ca 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,5 +1,6 @@
 {
     "files.associations": {
-        "srt.h": "c"
+        "srt.h": "c",
+        "rle.h": "c"
     }
 }
\ No newline at end of file
diff --git a/include/cm.h b/include/cm.h
index ecd80f7..ff99d46 100644
--- a/include/cm.h
+++ b/include/cm.h
@@ -14,10 +14,10 @@ typedef struct {
     int64_t input_ptr, output_ptr, input_max;
 } state;
 
-void flush(state *s);
-void init(state *s);
+void flush(state * s);
+void init(state * s);
 void begin(state * s);
-void encode_byte(state *s, uint8_t c);
-uint8_t decode_byte(state *s);
+void encode_byte(state * s, uint8_t c);
+uint8_t decode_byte(state * s);
 
 #endif
diff --git a/include/crc32.h b/include/crc32.h
index c77ff91..f67eb06 100644
--- a/include/crc32.h
+++ b/include/crc32.h
@@ -23,6 +23,6 @@
 #include <inttypes.h>
 #include <stddef.h>
 
-uint32_t crc32sum(uint32_t crc, uint8_t *buf, size_t size);
+uint32_t crc32sum(uint32_t crc, uint8_t * buf, size_t size);
 
 #endif
diff --git a/include/libsais.h b/include/libsais.h
index f0f6018..57688a9 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -28,323 +28,401 @@ Please see the file LICENSE for full copyright information.
 extern "C" {
 #endif
 
-    #include <stdint.h>
+#include <stdint.h>
 
-    /**
-    * Creates the libsais context that allows reusing allocated memory with each libsais operation. 
-    * In multi-threaded environments, use one context per thread for parallel executions.
-    * @return the libsais context, NULL otherwise.
-    */
-    void * libsais_create_ctx(void);
+/**
+ * Creates the libsais context that allows reusing allocated memory with each
+ * libsais operation. In multi-threaded environments, use one context per thread
+ * for parallel executions.
+ * @return the libsais context, NULL otherwise.
+ */
+void * libsais_create_ctx(void);
 
 #if defined(_OPENMP)
-    /**
-    * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP. 
-    * In multi-threaded environments, use one context per thread for parallel executions.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return the libsais context, NULL otherwise.
-    */
-    void * libsais_create_ctx_omp(int32_t threads);
+/**
+ * Creates the libsais context that allows reusing allocated memory with each
+ * parallel libsais operation using OpenMP. In multi-threaded environments, use
+ * one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return the libsais context, NULL otherwise.
+ */
+void * libsais_create_ctx_omp(int32_t threads);
 #endif
 
-    /**
-    * Destroys the libsass context and free previusly allocated memory.
-    * @param ctx The libsais context (can be NULL).
-    */
-    void libsais_free_ctx(void * ctx);
-
-    /**
-    * Constructs the suffix array of a given string.
-    * @param T [0..n-1] The input string.
-    * @param SA [0..n-1+fs] The output array of suffixes.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
-
-    /**
-    * Constructs the suffix array of a given integer array.
-    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
-    * @param T [0..n-1] The input integer array.
-    * @param SA [0..n-1+fs] The output array of suffixes.
-    * @param n The length of the integer array.
-    * @param k The alphabet size of the input integer array.
-    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs);
-
-    /**
-    * Constructs the suffix array of a given string using libsais context.
-    * @param ctx The libsais context.
-    * @param T [0..n-1] The input string.
-    * @param SA [0..n-1+fs] The output array of suffixes.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq);
+/**
+ * Destroys the libsais context and frees previously allocated memory.
+ * @param ctx The libsais context (can be NULL).
+ */
+void libsais_free_ctx(void * ctx);
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (0 should be
+ * enough for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs,
+                int32_t * freq);
+
+/**
+ * Constructs the suffix array of a given integer array.
+ * Note, during construction input array will be modified, but restored at the
+ * end if no errors occurred.
+ * @param T [0..n-1] The input integer array.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the integer array.
+ * @param k The alphabet size of the input integer array.
+ * @param fs Extra space available at the end of SA array (can be 0, but 4k or
+ * better 6k is recommended for optimal performance).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k,
+                    int32_t fs);
+
+/**
+ * Constructs the suffix array of a given string using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (0 should be
+ * enough for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA,
+                    int32_t n, int32_t fs, int32_t * freq);
 
 #if defined(_OPENMP)
-    /**
-    * Constructs the suffix array of a given string in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param SA [0..n-1+fs] The output array of suffixes.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of SA array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
-
-    /**
-    * Constructs the suffix array of a given integer array in parallel using OpenMP.
-    * Note, during construction input array will be modified, but restored at the end if no errors occurred.
-    * @param T [0..n-1] The input integer array.
-    * @param SA [0..n-1+fs] The output array of suffixes.
-    * @param n The length of the integer array.
-    * @param k The alphabet size of the input integer array.
-    * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance).
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads);
+/**
+ * Constructs the suffix array of a given string in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of SA array (0 should be
+ * enough for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs,
+                    int32_t * freq, int32_t threads);
+
+/**
+ * Constructs the suffix array of a given integer array in parallel using
+ * OpenMP. Note, during construction input array will be modified, but restored
+ * at the end if no errors occurred.
+ * @param T [0..n-1] The input integer array.
+ * @param SA [0..n-1+fs] The output array of suffixes.
+ * @param n The length of the integer array.
+ * @param k The alphabet size of the input integer array.
+ * @param fs Extra space available at the end of SA array (can be 0, but 4k or
+ * better 6k is recommended for optimal performance).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k,
+                        int32_t fs, int32_t threads);
 #endif
 
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @return The primary index if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
-
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The output auxiliary indexes.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
-
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string using libsais context.
-    * @param ctx The libsais context.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @return The primary index if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq);
-
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes using libsais context.
-    * @param ctx The libsais context.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The output auxiliary indexes.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                    int32_t fs, int32_t * freq);
+
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string
+ * with auxiliary indexes.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                        int32_t fs, int32_t * freq, int32_t r, int32_t * I);
+
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string
+ * using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                        int32_t * A, int32_t n, int32_t fs, int32_t * freq);
+
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string
+ * with auxiliary indexes using libsais context.
+ * @param ctx The libsais context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                            int32_t * A, int32_t n, int32_t fs, int32_t * freq,
+                            int32_t r, int32_t * I);
 
 #if defined(_OPENMP)
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return The primary index if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads);
-
-    /**
-    * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n-1+fs] The temporary array.
-    * @param n The length of the given string.
-    * @param fs The extra space available at the end of A array (0 should be enough for most cases).
-    * @param freq [0..255] The output symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The output auxiliary indexes.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads);
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string in
+ * parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                        int32_t fs, int32_t * freq, int32_t threads);
+
+/**
+ * Constructs the burrows-wheeler transformed string (BWT) of a given string
+ * with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n-1+fs] The temporary array.
+ * @param n The length of the given string.
+ * @param fs The extra space available at the end of A array (0 should be enough
+ * for most cases).
+ * @param freq [0..255] The output symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The output auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                            int32_t n, int32_t fs, int32_t * freq, int32_t r,
+                            int32_t * I, int32_t threads);
 #endif
 
-    /**
-    * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation. 
-    * In multi-threaded environments, use one context per thread for parallel executions.
-    * @return the libsais context, NULL otherwise.
-    */
-    void * libsais_unbwt_create_ctx(void);
+/**
+ * Creates the libsais reverse BWT context that allows reusing allocated memory
+ * with each libsais_unbwt_* operation. In multi-threaded environments, use one
+ * context per thread for parallel executions.
+ * @return the libsais context, NULL otherwise.
+ */
+void * libsais_unbwt_create_ctx(void);
 
 #if defined(_OPENMP)
-    /**
-    * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP. 
-    * In multi-threaded environments, use one context per thread for parallel executions.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return the libsais context, NULL otherwise.
-    */
-    void * libsais_unbwt_create_ctx_omp(int32_t threads);
+/**
+ * Creates the libsais reverse BWT context that allows reusing allocated memory
+ * with each parallel libsais_unbwt_* operation using OpenMP. In multi-threaded
+ * environments, use one context per thread for parallel executions.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return the libsais context, NULL otherwise.
+ */
+void * libsais_unbwt_create_ctx_omp(int32_t threads);
 #endif
 
-    /**
-    * Destroys the libsass reverse BWT context and free previusly allocated memory.
-    * @param ctx The libsais context (can be NULL).
-    */
-    void libsais_unbwt_free_ctx(void * ctx);
-
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param i The primary index.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
-
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index using libsais reverse BWT context.
-    * @param ctx The libsais reverse BWT context.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param i The primary index.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i);
-
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The input auxiliary indexes.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
-
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes using libsais reverse BWT context.
-    * @param ctx The libsais reverse BWT context.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The input auxiliary indexes.
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I);
+/**
+ * Destroys the libsais reverse BWT context; frees previously allocated memory.
+ * @param ctx The libsais context (can be NULL).
+ */
+void libsais_unbwt_free_ctx(void * ctx);
+
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with primary index.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                      const int32_t * freq, int32_t i);
+
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with primary index using libsais reverse BWT context.
+ * @param ctx The libsais reverse BWT context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                          int32_t * A, int32_t n, const int32_t * freq,
+                          int32_t i);
+
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with auxiliary indexes.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A,
+                          int32_t n, const int32_t * freq, int32_t r,
+                          const int32_t * I);
+
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with auxiliary indexes using libsais reverse BWT context.
+ * @param ctx The libsais reverse BWT context.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                              int32_t * A, int32_t n, const int32_t * freq,
+                              int32_t r, const int32_t * I);
 
 #if defined(_OPENMP)
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param i The primary index.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads);
-
-    /**
-    * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param U [0..n-1] The output string (can be T).
-    * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size).
-    * @param n The length of the given string.
-    * @param freq [0..255] The input symbol frequency table (can be NULL).
-    * @param r The sampling rate for auxiliary indexes (must be power of 2).
-    * @param I [0..(n-1)/r] The input auxiliary indexes.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 or -2 otherwise.
-    */
-    int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads);
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with primary index in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param i The primary index.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                          int32_t n, const int32_t * freq, int32_t i,
+                          int32_t threads);
+
+/**
+ * Constructs the original string from a given burrows-wheeler transformed
+ * string (BWT) with auxiliary indexes in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param U [0..n-1] The output string (can be T).
+ * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
+ * size).
+ * @param n The length of the given string.
+ * @param freq [0..255] The input symbol frequency table (can be NULL).
+ * @param r The sampling rate for auxiliary indexes (must be power of 2).
+ * @param I [0..(n-1)/r] The input auxiliary indexes.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                              int32_t n, const int32_t * freq, int32_t r,
+                              const int32_t * I, int32_t threads);
 #endif
 
-    /**
-    * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array.
-    * @param T [0..n-1] The input string.
-    * @param SA [0..n-1] The input suffix array.
-    * @param PLCP [0..n-1] The output permuted longest common prefix array.
-    * @param n The length of the string and the suffix array.
-    * @return 0 if no error occurred, -1 otherwise.
-    */
-    int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n);
-
-    /**
-    * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array.
-    * @param PLCP [0..n-1] The input permuted longest common prefix array.
-    * @param SA [0..n-1] The input suffix array.
-    * @param LCP [0..n-1] The output longest common prefix array (can be SA).
-    * @param n The length of the permuted longest common prefix array and the suffix array.
-    * @return 0 if no error occurred, -1 otherwise.
-    */
-    int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n);
+/**
+ * Constructs the permuted longest common prefix array (PLCP) of a given string
+ * and a suffix array.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The input suffix array.
+ * @param PLCP [0..n-1] The output permuted longest common prefix array.
+ * @param n The length of the string and the suffix array.
+ * @return 0 if no error occurred, -1 otherwise.
+ */
+int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP,
+                     int32_t n);
+
+/**
+ * Constructs the longest common prefix array (LCP) of a given permuted longest
+ * common prefix array (PLCP) and a suffix array.
+ * @param PLCP [0..n-1] The input permuted longest common prefix array.
+ * @param SA [0..n-1] The input suffix array.
+ * @param LCP [0..n-1] The output longest common prefix array (can be SA).
+ * @param n The length of the permuted longest common prefix array and the
+ * suffix array.
+ * @return 0 if no error occurred, -1 otherwise.
+ */
+int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP,
+                    int32_t n);
 
 #if defined(_OPENMP)
-    /**
-    * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array in parallel using OpenMP.
-    * @param T [0..n-1] The input string.
-    * @param SA [0..n-1] The input suffix array.
-    * @param PLCP [0..n-1] The output permuted longest common prefix array.
-    * @param n The length of the string and the suffix array.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 otherwise.
-    */
-    int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads);
-
-    /**
-    * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP.
-    * @param PLCP [0..n-1] The input permuted longest common prefix array.
-    * @param SA [0..n-1] The input suffix array.
-    * @param LCP [0..n-1] The output longest common prefix array (can be SA).
-    * @param n The length of the permuted longest common prefix array and the suffix array.
-    * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default).
-    * @return 0 if no error occurred, -1 otherwise.
-    */
-    int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads);
+/**
+ * Constructs the permuted longest common prefix array (PLCP) of a given string
+ * and a suffix array in parallel using OpenMP.
+ * @param T [0..n-1] The input string.
+ * @param SA [0..n-1] The input suffix array.
+ * @param PLCP [0..n-1] The output permuted longest common prefix array.
+ * @param n The length of the string and the suffix array.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 otherwise.
+ */
+int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP,
+                         int32_t n, int32_t threads);
+
+/**
+ * Constructs the longest common prefix array (LCP) of a given permuted longest
+ * common prefix array (PLCP) and a suffix array in parallel using OpenMP.
+ * @param PLCP [0..n-1] The input permuted longest common prefix array.
+ * @param SA [0..n-1] The input suffix array.
+ * @param LCP [0..n-1] The output longest common prefix array (can be SA).
+ * @param n The length of the permuted longest common prefix array and the
+ * suffix array.
+ * @param threads The number of OpenMP threads to use (can be 0 for OpenMP
+ * default).
+ * @return 0 if no error occurred, -1 otherwise.
+ */
+int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP,
+                        int32_t n, int32_t threads);
 #endif
 
 #ifdef __cplusplus
diff --git a/include/mtf.h b/include/mtf.h
index aa25485..f775f2a 100644
--- a/include/mtf.h
+++ b/include/mtf.h
@@ -27,7 +27,9 @@ struct mtf_state {
     uint32_t prev[256], curr[256], symbols[256], ranks[256];
 };
 
-void mtf_encode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count);
-void mtf_decode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count);
+void mtf_encode(struct mtf_state * mtf, uint8_t * src, uint8_t * dst,
+                uint32_t count);
+void mtf_decode(struct mtf_state * mtf, uint8_t * src, uint8_t * dst,
+                uint32_t count);
 
 #endif
diff --git a/include/rle.h b/include/rle.h
index f6800a2..da5f579 100644
--- a/include/rle.h
+++ b/include/rle.h
@@ -2,7 +2,10 @@
 #ifndef _RLE_H
 #define _RLE_H
 
-int mrlec(unsigned char *in, int inlen, unsigned char *out);
-int mrled(unsigned char *in, unsigned char *out, int outlen);
+#include <stddef.h>
+#include <stdint.h>
+
+int32_t mrlec(uint8_t * in, int32_t inlen, uint8_t * out);
+int32_t mrled(uint8_t * in, uint8_t * out, int32_t outlen);
 
 #endif
diff --git a/include/srt.h b/include/srt.h
index 63ce264..709c38a 100644
--- a/include/srt.h
+++ b/include/srt.h
@@ -32,7 +32,9 @@ struct srt_state {
     uint32_t bucket_ends[256];
 };
 
-uint32_t srt_encode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count);
-uint32_t srt_decode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count);
+uint32_t srt_encode(struct srt_state * mtf, uint8_t * src, uint8_t * dst,
+                    uint32_t count);
+uint32_t srt_decode(struct srt_state * mtf, uint8_t * src, uint8_t * dst,
+                    uint32_t count);
 
 #endif
diff --git a/src/cm.c b/src/cm.c
index ccfc994..0da95bf 100644
--- a/src/cm.c
+++ b/src/cm.c
@@ -1,16 +1,16 @@
 
 #include "cm.h"
 
-static void write_out(state *s, uint8_t c) {
+static void write_out(state * s, uint8_t c) {
     s->out_queue[s->output_ptr++] = c;
 }
 
-static uint8_t read_in(state *s) {
+static uint8_t read_in(state * s) {
     if (s->input_ptr < s->input_max) return s->in_queue[s->input_ptr++];
     return -1;
 }
 
-static void encodebit0(state *s, uint32_t p) {
+static void encodebit0(state * s, uint32_t p) {
     s->low += (((uint64_t)(s->high - s->low) * p) >> 18) + 1;
     while ((s->low ^ s->high) < (1 << 24)) {
         write_out(s, s->low >> 24);
@@ -19,7 +19,7 @@ static void encodebit0(state *s, uint32_t p) {
     }
 }
 
-static void encodebit1(state *s, uint32_t p) {
+static void encodebit1(state * s, uint32_t p) {
     s->high = s->low + (((uint64_t)(s->high - s->low) * p) >> 18);
     while ((s->low ^ s->high) < (1 << 24)) {
         write_out(s, s->low >> 24);
@@ -28,7 +28,7 @@ static void encodebit1(state *s, uint32_t p) {
     }
 }
 
-static uint8_t decodebit(state *s, uint32_t p) {
+static uint8_t decodebit(state * s, uint32_t p) {
     const uint32_t mid = s->low + (((uint64_t)(s->high - s->low) * p) >> 18);
     const uint8_t bit = s->code <= mid;
     if (bit)
@@ -43,14 +43,18 @@ static uint8_t decodebit(state *s, uint32_t p) {
     return bit;
 }
 
-void flush(state *s) {
-    write_out(s, s->low >> 24); s->low <<= 8;
-    write_out(s, s->low >> 24); s->low <<= 8;
-    write_out(s, s->low >> 24); s->low <<= 8;
-    write_out(s, s->low >> 24); s->low <<= 8;
+void flush(state * s) {
+    write_out(s, s->low >> 24);
+    s->low <<= 8;
+    write_out(s, s->low >> 24);
+    s->low <<= 8;
+    write_out(s, s->low >> 24);
+    s->low <<= 8;
+    write_out(s, s->low >> 24);
+    s->low <<= 8;
 }
 
-void init(state *s) {
+void init(state * s) {
     s->code = (s->code << 8) + read_in(s);
     s->code = (s->code << 8) + read_in(s);
     s->code = (s->code << 8) + read_in(s);
@@ -74,7 +78,7 @@ void begin(state * s) {
             for (int k = 0; k < 17; k++) s->C2[i][j][k] = (k << 12) - (k == 16);
 }
 
-void encode_byte(state *s, uint8_t c) {
+void encode_byte(state * s, uint8_t c) {
     if (s->c1 == s->c2)
         ++s->run;
     else
@@ -118,7 +122,7 @@ void encode_byte(state *s, uint8_t c) {
     s->c1 = ctx & 255;
 }
 
-uint8_t decode_byte(state *s) {
+uint8_t decode_byte(state * s) {
     if (s->c1 == s->c2)
         ++s->run;
     else
diff --git a/src/crc32.c b/src/crc32.c
index f9ac88a..94bff14 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -74,7 +74,7 @@ static const uint32_t crc32Table[256] = {
     0xAD7D5351L
 };
 
-uint32_t crc32sum(uint32_t crc, uint8_t *buf, size_t size) {
+uint32_t crc32sum(uint32_t crc, uint8_t * buf, size_t size) {
     while (size--) crc = crc32Table[(crc ^ *(buf++)) & 0xff] ^ (crc >> 8);
     return crc;
 }
diff --git a/src/libsais.c b/src/libsais.c
index 44cdc19..03dd7a4 100644
--- a/src/libsais.c
+++ b/src/libsais.c
@@ -23,74 +23,69 @@ Please see the file LICENSE for full copyright information.
 
 #include "libsais.h"
 
+#include <limits.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include <limits.h>
 
 #if defined(_OPENMP)
     #include <omp.h>
 #else
-    #define UNUSED(_x)                  (void)(_x)
+    #define UNUSED(_x) (void)(_x)
 #endif
 
-typedef int32_t                         sa_sint_t;
-typedef uint32_t                        sa_uint_t;
-typedef ptrdiff_t                       fast_sint_t;
-typedef size_t                          fast_uint_t;
+typedef int32_t sa_sint_t;
+typedef uint32_t sa_uint_t;
+typedef ptrdiff_t fast_sint_t;
+typedef size_t fast_uint_t;
 
-#define SAINT_BIT                       (32)
-#define SAINT_MAX                       INT32_MAX
-#define SAINT_MIN                       INT32_MIN
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
 
-#define ALPHABET_SIZE                   (1 << CHAR_BIT)
-#define UNBWT_FASTBITS                  (17)
+#define ALPHABET_SIZE (1 << CHAR_BIT)
+#define UNBWT_FASTBITS (17)
 
-#define SUFFIX_GROUP_BIT                (SAINT_BIT - 1)
-#define SUFFIX_GROUP_MARKER             (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
 
-#define BUCKETS_INDEX2(_c, _s)          (((_c) << 1) + (_s))
-#define BUCKETS_INDEX4(_c, _s)          (((_c) << 2) + (_s))
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
 
-#define LIBSAIS_PER_THREAD_CACHE_SIZE   (24576)
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
 
-typedef struct LIBSAIS_THREAD_CACHE
-{
-        sa_sint_t                       symbol;
-        sa_sint_t                       index;
+typedef struct LIBSAIS_THREAD_CACHE {
+    sa_sint_t symbol;
+    sa_sint_t index;
 } LIBSAIS_THREAD_CACHE;
 
-typedef union LIBSAIS_THREAD_STATE
-{
-    struct
-    {
-        fast_sint_t                     position;
-        fast_sint_t                     count;
+typedef union LIBSAIS_THREAD_STATE {
+    struct {
+        fast_sint_t position;
+        fast_sint_t count;
 
-        fast_sint_t                     m;
-        fast_sint_t                     last_lms_suffix;
+        fast_sint_t m;
+        fast_sint_t last_lms_suffix;
 
-        sa_sint_t *                     buckets;
-        LIBSAIS_THREAD_CACHE *          cache;
+        sa_sint_t * buckets;
+        LIBSAIS_THREAD_CACHE * cache;
     } state;
 
     uint8_t padding[64];
 } LIBSAIS_THREAD_STATE;
 
-typedef struct LIBSAIS_CONTEXT
-{
-    sa_sint_t *                         buckets;
-    LIBSAIS_THREAD_STATE *              thread_state;
-    fast_sint_t                         threads;
+typedef struct LIBSAIS_CONTEXT {
+    sa_sint_t * buckets;
+    LIBSAIS_THREAD_STATE * thread_state;
+    fast_sint_t threads;
 } LIBSAIS_CONTEXT;
 
-typedef struct LIBSAIS_UNBWT_CONTEXT
-{
-    sa_uint_t *                         bucket2;
-    uint16_t *                          fastbits;
-    sa_uint_t *                         buckets;
-    fast_sint_t                         threads;
+typedef struct LIBSAIS_UNBWT_CONTEXT {
+    sa_uint_t * bucket2;
+    uint16_t * fastbits;
+    sa_uint_t * buckets;
+    fast_sint_t threads;
 } LIBSAIS_UNBWT_CONTEXT;
 
 #if defined(__GNUC__) || defined(__clang__)
@@ -105,7 +100,8 @@ typedef struct LIBSAIS_UNBWT_CONTEXT
     #if __has_builtin(__builtin_prefetch)
         #define HAS_BUILTIN_PREFECTCH
     #endif
-#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
+#elif defined(__GNUC__) && \
+    (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
     #define HAS_BUILTIN_PREFECTCH
 #endif
 
@@ -113,22 +109,26 @@ typedef struct LIBSAIS_UNBWT_CONTEXT
     #if __has_builtin(__builtin_bswap16)
         #define HAS_BUILTIN_BSWAP16
     #endif
-#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
+#elif defined(__GNUC__) && \
+    (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
     #define HAS_BUILTIN_BSWAP16
 #endif
 
 #if defined(HAS_BUILTIN_PREFECTCH)
-    #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
-    #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
-#elif defined (_M_IX86) || defined (_M_AMD64)
+    #define libsais_prefetch(address) \
+        __builtin_prefetch((const void *)(address), 0, 0)
+    #define libsais_prefetchw(address) \
+        __builtin_prefetch((const void *)(address), 1, 0)
+#elif defined(_M_IX86) || defined(_M_AMD64)
     #include <intrin.h>
-    #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+    #define libsais_prefetch(address) \
+        _mm_prefetch((const void *)(address), _MM_HINT_NTA)
     #define libsais_prefetchw(address) _m_prefetchw((const void *)(address))
-#elif defined (_M_ARM)
+#elif defined(_M_ARM)
     #include <intrin.h>
     #define libsais_prefetch(address) __prefetch((const void *)(address))
     #define libsais_prefetchw(address) __prefetchw((const void *)(address))
-#elif defined (_M_ARM64)
+#elif defined(_M_ARM64)
     #include <intrin.h>
     #define libsais_prefetch(address) __prefetch2((const void *)(address), 1)
     #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17)
@@ -137,17 +137,25 @@ typedef struct LIBSAIS_UNBWT_CONTEXT
 #endif
 
 #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
-    #if defined(_LITTLE_ENDIAN) \
-            || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \
-            || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \
-            || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \
-            || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    #if defined(_LITTLE_ENDIAN) ||                                      \
+        (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) &&               \
+         BYTE_ORDER == LITTLE_ENDIAN) ||                                \
+        (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) &&             \
+         _BYTE_ORDER == _LITTLE_ENDIAN) ||                              \
+        (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) &&           \
+         __BYTE_ORDER == __LITTLE_ENDIAN) ||                            \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+         __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
         #define __LITTLE_ENDIAN__
-    #elif defined(_BIG_ENDIAN) \
-            || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \
-            || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \
-            || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \
-            || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    #elif defined(_BIG_ENDIAN) ||                                    \
+        (defined(BYTE_ORDER) && defined(BIG_ENDIAN) &&               \
+         BYTE_ORDER == BIG_ENDIAN) ||                                \
+        (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) &&             \
+         _BYTE_ORDER == _BIG_ENDIAN) ||                              \
+        (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) &&           \
+         __BYTE_ORDER == __BIG_ENDIAN) ||                            \
+        (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+         __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
         #define __BIG_ENDIAN__
     #elif defined(_WIN32)
         #define __LITTLE_ENDIAN__
@@ -168,18 +176,19 @@ typedef struct LIBSAIS_UNBWT_CONTEXT
     #error Your compiler, configuration or platform is not supported.
 #endif
 
-static void * libsais_align_up(const void * address, size_t alignment)
-{
-    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
+static void * libsais_align_up(const void * address, size_t alignment) {
+    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) &
+                    (-((ptrdiff_t)alignment)));
 }
 
-static void * libsais_alloc_aligned(size_t size, size_t alignment)
-{
+static void * libsais_alloc_aligned(size_t size, size_t alignment) {
     void * address = malloc(size + sizeof(short) + alignment - 1);
-    if (address != NULL)
-    {
-        void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
-        ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
+    if (address != NULL) {
+        void * aligned_address = libsais_align_up(
+            (void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))),
+            alignment);
+        ((short *)aligned_address)[-1] =
+            (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
 
         return aligned_address;
     }
@@ -187,27 +196,33 @@ static void * libsais_alloc_aligned(size_t size, size_t alignment)
     return NULL;
 }
 
-static void libsais_free_aligned(void * aligned_address)
-{
-    if (aligned_address != NULL)
-    {
-        free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
+static void libsais_free_aligned(void * aligned_address) {
+    if (aligned_address != NULL) {
+        free((void *)((ptrdiff_t)aligned_address -
+                      ((short *)aligned_address)[-1]));
     }
 }
 
-static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads)
-{
-    LIBSAIS_THREAD_STATE *  RESTRICT thread_state    = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
-    sa_sint_t *             RESTRICT thread_buckets  = (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
-    LIBSAIS_THREAD_CACHE *  RESTRICT thread_cache    = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+        (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
+            (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+    sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned(
+        (size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    LIBSAIS_THREAD_CACHE * RESTRICT thread_cache =
+        (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
+            (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE *
+                sizeof(LIBSAIS_THREAD_CACHE),
+            4096);
 
-    if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL)
-    {
+    if (thread_state != NULL && thread_buckets != NULL &&
+        thread_cache != NULL) {
         fast_sint_t t;
-        for (t = 0; t < threads; ++t)
-        { 
-            thread_state[t].state.buckets   = thread_buckets;   thread_buckets  += 4 * ALPHABET_SIZE;
-            thread_state[t].state.cache     = thread_cache;     thread_cache    += LIBSAIS_PER_THREAD_CACHE_SIZE;
+        for (t = 0; t < threads; ++t) {
+            thread_state[t].state.buckets = thread_buckets;
+            thread_buckets += 4 * ALPHABET_SIZE;
+            thread_state[t].state.cache = thread_cache;
+            thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
         }
 
         return thread_state;
@@ -219,24 +234,24 @@ static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads)
     return NULL;
 }
 
-static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state)
-{
-    if (thread_state != NULL)
-    {
+static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) {
+    if (thread_state != NULL) {
         libsais_free_aligned(thread_state[0].state.cache);
         libsais_free_aligned(thread_state[0].state.buckets);
         libsais_free_aligned(thread_state);
     }
 }
 
-static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads)
-{
-    LIBSAIS_CONTEXT *       RESTRICT ctx            = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
-    sa_sint_t *             RESTRICT buckets        = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
-    LIBSAIS_THREAD_STATE *  RESTRICT thread_state   = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) {
+    LIBSAIS_CONTEXT * RESTRICT ctx =
+        (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
+        8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
-    if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1))
-    {
+    if (ctx != NULL && buckets != NULL &&
+        (thread_state != NULL || threads == 1)) {
         ctx->buckets = buckets;
         ctx->threads = threads;
         ctx->thread_state = thread_state;
@@ -250,10 +265,8 @@ static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads)
     return NULL;
 }
 
-static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx)
-{
-    if (ctx != NULL)
-    {
+static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
+    if (ctx != NULL) {
         libsais_free_thread_state(ctx->thread_state);
         libsais_free_aligned(ctx->buckets);
         libsais_free_aligned(ctx);
@@ -262,31 +275,42 @@ static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx)
 
 #if defined(_OPENMP)
 
-static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_negative_marked_suffixes(
+    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     sa_sint_t count = 0;
 
-    fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); }
+    fast_sint_t i;
+    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
+        count += (SA[i] < 0);
+    }
 
     return count;
 }
 
-static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_zero_marked_suffixes(
+    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     sa_sint_t count = 0;
 
-    fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); }
+    fast_sint_t i;
+    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
+        count += (SA[i] == 0);
+    }
 
     return count;
 }
 
-static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
+                                          LIBSAIS_THREAD_CACHE * RESTRICT cache,
+                                          fast_sint_t omp_block_start,
+                                          fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&cache[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
@@ -300,88 +324,122 @@ static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREA
         SA[cache[i + 3].symbol] = cache[i + 3].index;
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         SA[cache[i].symbol] = cache[i].index;
     }
 }
 
-static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_compact_and_place_cached_suffixes(
+    sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
-    {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
+        l = omp_block_start;
+         i < j; i += 4) {
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        cache[l] = cache[i + 0]; l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 1]; l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 2]; l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 3]; l += cache[l].symbol >= 0;
+        cache[l] = cache[i + 0];
+        l += cache[l].symbol >= 0;
+        cache[l] = cache[i + 1];
+        l += cache[l].symbol >= 0;
+        cache[l] = cache[i + 2];
+        l += cache[l].symbol >= 0;
+        cache[l] = cache[i + 3];
+        l += cache[l].symbol >= 0;
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        cache[l] = cache[i]; l += cache[l].symbol >= 0;
+    for (j += 3; i < j; i += 1) {
+        cache[l] = cache[i];
+        l += cache[l].symbol >= 0;
     }
 
-    libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
+    libsais_place_cached_suffixes(SA, cache, omp_block_start,
+                                  l - omp_block_start);
 }
 
-static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
     sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] =
+            bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
     sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
     sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
+                      bucket04[s] + bucket05[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
     sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
     sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
     sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
+                      bucket04[s] + bucket05[s] + bucket06[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
@@ -389,11 +447,16 @@ static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_
     sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
     sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
     sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; }
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
+                      bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s];
+    }
 }
 
-static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
-{
+static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00,
+                                            fast_sint_t bucket_size,
+                                            fast_sint_t bucket_stride) {
     sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
     sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
     sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
@@ -402,154 +465,242 @@ static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_
     sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
     sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
     sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
-    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; }
-}
-
-static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets)
-{
-    while (num_buckets >= 9)
-    {
-        libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8;
-    }
-
-    switch (num_buckets)
-    {
-        case 1: break;
-        case 2: libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break;
-        case 3: libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break;
-        case 4: libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break;
-        case 5: libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break;
-        case 6: libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break;
-        case 7: libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break;
-        case 8: libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break;
-    }
-}
-
-#endif
-
-static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-    if (omp_block_size > 0)
-    {
+    fast_sint_t s;
+    for (s = 0; s < bucket_size; s += 1) {
+        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
+                      bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] +
+                      bucket08[s];
+    }
+}
+
+static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets,
+                                          fast_sint_t bucket_size,
+                                          fast_sint_t bucket_stride,
+                                          fast_sint_t num_buckets) {
+    while (num_buckets >= 9) {
+        libsais_accumulate_counts_s32_9(
+            buckets - (num_buckets - 9) * bucket_stride, bucket_size,
+            bucket_stride);
+        num_buckets -= 8;
+    }
+
+    switch (num_buckets) {
+        case 1:
+            break;
+        case 2:
+            libsais_accumulate_counts_s32_2(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 3:
+            libsais_accumulate_counts_s32_3(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 4:
+            libsais_accumulate_counts_s32_4(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 5:
+            libsais_accumulate_counts_s32_5(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 6:
+            libsais_accumulate_counts_s32_6(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 7:
+            libsais_accumulate_counts_s32_7(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+        case 8:
+            libsais_accumulate_counts_s32_8(buckets, bucket_size,
+                                            bucket_stride);
+            break;
+    }
+}
+
+#endif
+
+static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T,
+                                           sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                           fast_sint_t m,
+                                           fast_sint_t omp_block_start,
+                                           fast_sint_t omp_block_size) {
+    if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 128;
 
-        fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+        fast_sint_t i, j = omp_block_start + omp_block_size,
+                       c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
 
-        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        while (j < n && (c1 = T[j]) == c0) {
+            ++j;
+        }
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4)
-        {
+        for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3;
+             i >= j; i -= 4) {
             libsais_prefetch(&T[i - prefetch_distance]);
 
-            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
-            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
-            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
-            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
-        }
-
-        for (j -= 3; i >= j; i -= 1)
-        {
-            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            c1 = T[i - 0];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
+            c0 = T[i - 1];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 0);
+            m -= ((s & 3) == 1);
+            c1 = T[i - 2];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 1);
+            m -= ((s & 3) == 1);
+            c0 = T[i - 3];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 2);
+            m -= ((s & 3) == 1);
+        }
+
+        for (j -= 3; i >= j; i -= 1) {
+            c1 = c0;
+            c0 = T[i];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
         }
 
         SA[m] = (sa_sint_t)(i + 1);
     }
 }
 
-static void libsais_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_gather_lms_suffixes_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
+                                                  omp_get_dynamic() == 0)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1,
+                                           omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
-            fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; }
+        else {
+            fast_sint_t t, m = 0;
+            for (t = omp_num_threads - 1; t > omp_thread_num; --t) {
+                m += thread_state[t].state.m;
+            }
 
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size);
+            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m,
+                                           omp_block_start, omp_block_size);
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            if (thread_state[omp_thread_num].state.m > 0)
-            {
-                SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
+            if (thread_state[omp_thread_num].state.m > 0) {
+                SA[(fast_sint_t)n - 1 - m] =
+                    (sa_sint_t)thread_state[omp_thread_num]
+                        .state.last_lms_suffix;
             }
         }
 #endif
     }
 }
 
-static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
-{
+static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
+                                                 sa_sint_t * RESTRICT SA,
+                                                 sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t             i   = n - 2;
-    sa_sint_t             m   = n - 1;
-    fast_uint_t           s   = 1;
-    fast_sint_t           c0  = T[n - 1];
-    fast_sint_t           c1  = 0;
+    sa_sint_t i = n - 2;
+    sa_sint_t m = n - 1;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
 
-    for (; i >= 3; i -= 4)
-    {
+    for (; i >= 3; i -= 4) {
         libsais_prefetch(&T[i - prefetch_distance]);
 
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1);
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1);
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1);
-    }
-
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1);
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = i + 1;
+        m -= ((s & 3) == 1);
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 0;
+        m -= ((s & 3) == 1);
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 1;
+        m -= ((s & 3) == 1);
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 2;
+        m -= ((s & 3) == 1);
+    }
+
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i + 1;
+        m -= ((s & 3) == 1);
     }
 
     return n - 1 - m;
 }
 
-static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
-{
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t             i   = n - 2;
-    sa_sint_t             m   = n - 1;
-    fast_uint_t           s   = 1;
-    fast_sint_t           c0  = T[n - 1];
-    fast_sint_t           c1  = 0;
+    sa_sint_t i = n - 2;
+    sa_sint_t m = n - 1;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
 
-    for (; i >= 3; i -= 4)
-    {
+    for (; i >= 3; i -= 4) {
         libsais_prefetch(&T[i - prefetch_distance]);
 
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 0; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = i - 1; m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i - 2; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
-    }
-
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = i + 1; m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = i + 1;
+        m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 0;
+        m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 1;
+        m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i - 2;
+        m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+    }
+
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        SA[m] = i + 1;
+        m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
     }
 
     return n - 1 - m;
@@ -557,42 +708,51 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RES
 
 #if defined(_OPENMP)
 
-static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t             i   = n - 2;
-    fast_uint_t           s   = 1;
-    fast_sint_t           c0  = T[n - 1];
-    fast_sint_t           c1  = 0;
+    sa_sint_t i = n - 2;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
 
-    for (; i >= prefetch_distance + 3; i -= 4)
-    {
+    for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
-
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
 
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
     }
 
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
     }
 
@@ -601,42 +761,51 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_s
 
 #endif
 
-static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
+                                              sa_sint_t n, sa_sint_t k,
+                                              sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t             i   = n - 2;
-    fast_uint_t           s   = 1;
-    fast_sint_t           c0  = T[n - 1];
-    fast_sint_t           c1  = 0;
+    sa_sint_t i = n - 2;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
 
-    for (; i >= prefetch_distance + 3; i -= 4)
-    {
+    for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
-
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+        libsais_prefetchw(
+            &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
 
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
     }
 
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
         buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
     }
 
@@ -645,153 +814,206 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_s
 
 #if defined(_OPENMP)
 
-static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_count_compacted_lms_suffixes_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t             i   = n - 2;
-    fast_uint_t           s   = 1;
-    fast_sint_t           c0  = T[n - 1];
-    fast_sint_t           c1  = 0;
+    sa_sint_t i = n - 2;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
 
-    for (; i >= prefetch_distance + 3; i -= 4)
-    {
+    for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
-
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+            T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+            T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+            T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+        libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+            T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c0 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
 
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        c0 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
     }
 
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        c1 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
     }
 
-    c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
+    c0 &= SAINT_MAX;
+    buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
 }
 
 #endif
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
 
-    if (omp_block_size > 0)
-    {
+    if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 128;
 
         fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
 
-        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        while (j < n && (c1 = T[j]) == c0) {
+            ++j;
+        }
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4)
-        {
+        for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) {
             libsais_prefetch(&T[i - prefetch_distance]);
 
-            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            c1 = T[i - 0];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            c0 = T[i - 1];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 0);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
 
-            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            c1 = T[i - 2];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            c0 = T[i - 3];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 2);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
         }
 
-        for (j -= 3; i >= j; i -= 1)
-        {
-            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        for (j -= 3; i >= j; i -= 1) {
+            c1 = c0;
+            c0 = T[i];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
         }
 
-        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        c1 = (i >= 0) ? T[i] : -1;
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = (sa_sint_t)(i + 1);
+        m -= ((s & 3) == 1);
         buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
     }
 
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
+                                                  omp_get_dynamic() == 0)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_8u(
+                T, SA, n, buckets, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.position =
+                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.m =
+                    libsais_count_and_gather_lms_suffixes_8u(
+                        T, SA, n, thread_state[omp_thread_num].state.buckets,
+                        omp_block_start, omp_block_size);
 
-                if (thread_state[omp_thread_num].state.m > 0)
-                {
-                    thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
+                if (thread_state[omp_thread_num].state.m > 0) {
+                    thread_state[omp_thread_num].state.last_lms_suffix =
+                        SA[thread_state[omp_thread_num].state.position - 1];
                 }
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
+                for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.m;
 
-                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
-                    {
-                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
+                    if (t != omp_num_threads - 1 &&
+                        thread_state[t].state.m > 0) {
+                        memcpy(&SA[n - m],
+                               &SA[thread_state[t].state.position -
+                                   thread_state[t].state.m],
+                               (size_t)thread_state[t].state.m *
+                                   sizeof(sa_sint_t));
                     }
 
                     {
-                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                        fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
+                        sa_sint_t * RESTRICT temp_bucket =
+                            thread_state[t].state.buckets;
+                        fast_sint_t s;
+                        for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) {
+                            sa_sint_t A = buckets[s], B = temp_bucket[s];
+                            buckets[s] = A + B;
+                            temp_bucket[s] = A;
+                        }
                     }
                 }
             }
@@ -802,154 +1024,235 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RE
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
 
-    if (omp_block_size > 0)
-    {
+    if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 32;
 
         fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
 
-        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        while (j < n && (c1 = T[j]) == c0) {
+            ++j;
+        }
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-        {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
+             i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
-
-            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+            c1 = T[i - 0];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            c0 = T[i - 1];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 0);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
 
-            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            c1 = T[i - 2];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
 
-            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            c0 = T[i - 3];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 2);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
         }
 
-        for (j -= prefetch_distance + 3; i >= j; i -= 1)
-        {
-            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        for (j -= prefetch_distance + 3; i >= j; i -= 1) {
+            c1 = c0;
+            c0 = T[i];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
         }
 
-        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        c1 = (i >= 0) ? T[i] : -1;
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = (sa_sint_t)(i + 1);
+        m -= ((s & 3) == 1);
         buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
     }
 
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
 
-    if (omp_block_size > 0)
-    {
+    if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 32;
 
         fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
 
-        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        while (j < n && (c1 = T[j]) == c0) {
+            ++j;
+        }
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-        {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
+             i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
-
-            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+            libsais_prefetchw(
+                &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+            c1 = T[i - 0];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((s & 3) == 1);
+            c0 = T[i - 1];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 0);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
 
-            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((s & 3) == 1);
+            c1 = T[i - 2];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((s & 3) == 1);
+            c0 = T[i - 3];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 2);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
         }
 
-        for (j -= prefetch_distance + 3; i >= j; i -= 1)
-        {
-            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        for (j -= prefetch_distance + 3; i >= j; i -= 1) {
+            c1 = c0;
+            c0 = T[i];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((s & 3) == 1);
             buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
         }
 
-        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((s & 3) == 1);
+        c1 = (i >= 0) ? T[i] : -1;
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = (sa_sint_t)(i + 1);
+        m -= ((s & 3) == 1);
         buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
     }
 
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
 
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t m = omp_block_start + omp_block_size - 1;
 
-    if (omp_block_size > 0)
-    {
+    if (omp_block_size > 0) {
         const fast_sint_t prefetch_distance = 32;
 
         fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
 
-        while (j < n && (c1 = T[j]) == c0) { ++j; }
+        while (j < n && (c1 = T[j]) == c0) {
+            ++j;
+        }
 
         fast_uint_t s = c0 >= c1;
 
-        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-        {
+        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
+             i -= 4) {
             libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-            libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
-
-            c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
-            c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+                T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+                T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+                T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+            libsais_prefetchw(&buckets[BUCKETS_INDEX2(
+                T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+            c1 = T[i - 0];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+            c0 &= SAINT_MAX;
+            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-            c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 0); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
-            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+            c0 = T[i - 1];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 0);
+            m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX;
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
 
-            c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
-            c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+            c1 = T[i - 2];
+            s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 1);
+            m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+            c0 &= SAINT_MAX;
+            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
 
-            c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i - 2); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
-            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+            c0 = T[i - 3];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i - 2);
+            m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX;
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
         }
 
-        for (j -= prefetch_distance + 3; i >= j; i -= 1)
-        {
-            c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
-            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+        for (j -= prefetch_distance + 3; i >= j; i -= 1) {
+            c1 = c0;
+            c0 = T[i];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+            SA[m] = (sa_sint_t)(i + 1);
+            m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+            c1 &= SAINT_MAX;
+            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
         }
 
-        c1 = (i >= 0) ? T[i] : -1; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); SA[m] = (sa_sint_t)(i + 1); m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
-        c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+        c1 = (i >= 0) ? T[i] : -1;
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        SA[m] = (sa_sint_t)(i + 1);
+        m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+        c0 &= SAINT_MAX;
+        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
     }
 
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
@@ -957,234 +1260,285 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa
 
 #if defined(_OPENMP)
 
-static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
-{
-    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; }
-    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; }
+static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space,
+                                             fast_sint_t bucket_size,
+                                             fast_sint_t num_buckets) {
+    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024);
+    if (free_space / (num_buckets - 1) >= bucket_size_1024) {
+        return bucket_size_1024;
+    }
+    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16);
+    if (free_space / (num_buckets - 1) >= bucket_size_16) {
+        return bucket_size_16;
+    }
 
     return bucket_size;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
-#endif
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_32s_4k(
+                T, SA, n, k, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
-            fast_sint_t bucket_size       = 4 * (fast_sint_t)k;
-            fast_sint_t bucket_stride     = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+    #if defined(_OPENMP)
+        else {
+            fast_sint_t bucket_size = 4 * (fast_sint_t)k;
+            fast_sint_t bucket_stride = libsais_get_bucket_stride(
+                buckets - &SA[n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.position =
+                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_and_gather_lms_suffixes_32s_4k(
+                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            if (omp_thread_num == omp_num_threads - 1)
-            {
+            if (omp_thread_num == omp_num_threads - 1) {
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
+                for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.count;
 
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
-                    {
-                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    if (t != omp_num_threads - 1 &&
+                        thread_state[t].state.count > 0) {
+                        memcpy(&SA[n - m],
+                               &SA[thread_state[t].state.position -
+                                   thread_state[t].state.count],
+                               (size_t)thread_state[t].state.count *
+                                   sizeof(sa_sint_t));
                     }
                 }
-            }
-            else
-            {
-                omp_num_threads     = omp_num_threads - 1;
-                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start     = omp_thread_num * omp_block_stride;
-                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+            } else {
+                omp_num_threads = omp_num_threads - 1;
+                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start = omp_thread_num * omp_block_stride;
+                omp_block_size = omp_thread_num < omp_num_threads - 1
+                                     ? omp_block_stride
+                                     : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+                libsais_accumulate_counts_s32(buckets + omp_block_start,
+                                              omp_block_size, bucket_stride,
+                                              omp_num_threads + 1);
             }
         }
-#endif
+    #endif
     }
 
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
-#endif
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_32s_2k(
+                T, SA, n, k, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
-            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride     = libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
+    #if defined(_OPENMP)
+        else {
+            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+            fast_sint_t bucket_stride = libsais_get_bucket_stride(
+                buckets - &SA[n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.position =
+                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_and_gather_lms_suffixes_32s_2k(
+                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            if (omp_thread_num == omp_num_threads - 1)
-            {
+            if (omp_thread_num == omp_num_threads - 1) {
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
+                for (t = omp_num_threads - 1; t >= 0; --t) {
                     m += (sa_sint_t)thread_state[t].state.count;
 
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
-                    {
-                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    if (t != omp_num_threads - 1 &&
+                        thread_state[t].state.count > 0) {
+                        memcpy(&SA[n - m],
+                               &SA[thread_state[t].state.position -
+                                   thread_state[t].state.count],
+                               (size_t)thread_state[t].state.count *
+                                   sizeof(sa_sint_t));
                     }
                 }
-            }
-            else
-            {
-                omp_num_threads     = omp_num_threads - 1;
-                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start     = omp_thread_num * omp_block_stride;
-                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+            } else {
+                omp_num_threads = omp_num_threads - 1;
+                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start = omp_thread_num * omp_block_stride;
+                omp_block_size = omp_thread_num < omp_num_threads - 1
+                                     ? omp_block_stride
+                                     : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
+                libsais_accumulate_counts_s32(buckets + omp_block_start,
+                                              omp_block_size, bucket_stride,
+                                              omp_num_threads + 1);
             }
         }
-#endif
+    #endif
     }
 
     return m;
 }
 
-static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
-#endif
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
+                T, SA, n, k, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
-            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride     = libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
+    #if defined(_OPENMP)
+        else {
+            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
+            fast_sint_t bucket_stride = libsais_get_bucket_stride(
+                buckets - &SA[n + n], bucket_size, omp_num_threads);
 
             {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.position =
+                    omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
+                        T, SA + n, n, k,
+                        buckets - (omp_thread_num * bucket_stride),
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }
+                fast_sint_t t, m = 0;
+                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) {
+                    m += (sa_sint_t)thread_state[t].state.count;
+                }
 
-                if (thread_state[omp_thread_num].state.count > 0)
-                {
-                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
+                if (thread_state[omp_thread_num].state.count > 0) {
+                    memcpy(&SA[n - m],
+                           &SA[n + thread_state[omp_thread_num].state.position -
+                               thread_state[omp_thread_num].state.count],
+                           (size_t)thread_state[omp_thread_num].state.count *
+                               sizeof(sa_sint_t));
                 }
             }
 
             {
-                omp_block_stride    = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start     = omp_thread_num * omp_block_stride;
-                omp_block_size      = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;
+                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
+                omp_block_start = omp_thread_num * omp_block_stride;
+                omp_block_size = omp_thread_num < omp_num_threads - 1
+                                     ? omp_block_stride
+                                     : bucket_size - omp_block_start;
 
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
+                libsais_accumulate_counts_s32(buckets + omp_block_start,
+                                              omp_block_size, bucket_stride,
+                                              omp_num_threads);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k,
+                                                             buckets, 0, n);
         }
 #if defined(_OPENMP)
-        else if (omp_thread_num == 0)
-        {
+        else if (omp_thread_num == 0) {
             libsais_count_lms_suffixes_32s_4k(T, n, k, buckets);
-        }
-        else
-        {
+        } else {
             m = libsais_gather_lms_suffixes_32s(T, SA, n);
         }
 #endif
@@ -1193,33 +1547,31 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k,
+                                                             buckets, 0, n);
         }
 #if defined(_OPENMP)
-        else if (omp_thread_num == 0)
-        {
+        else if (omp_thread_num == 0) {
             libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
-        }
-        else
-        {
+        } else {
             m = libsais_gather_lms_suffixes_32s(T, SA, n);
         }
 #endif
@@ -1228,33 +1580,32 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static sa_sint_t
+libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_sint_t m = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        if (omp_num_threads == 1)
-        {
-            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
+                T, SA, n, k, buckets, 0, n);
         }
 #if defined(_OPENMP)
-        else if (omp_thread_num == 0)
-        {
+        else if (omp_thread_num == 0) {
             libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
-        }
-        else
-        {
+        } else {
             m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
         }
 #endif
@@ -1263,77 +1614,104 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
 
 #if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2)
-    {
-        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
-        m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    sa_sint_t max_threads =
+        (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
+    if (max_threads > threads) {
+        max_threads = threads;
     }
-    else
+    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
+        if (max_threads > n / 16 / k) {
+            max_threads = n / 16 / k;
+        }
+        m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
+            thread_state);
+    } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
+        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
+            T, SA, n, k, buckets, threads);
     }
 
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m;
 
 #if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2)
-    {
-        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
-        m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    sa_sint_t max_threads =
+        (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
+    if (max_threads > threads) {
+        max_threads = threads;
     }
-    else
+    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
+        if (max_threads > n / 8 / k) {
+            max_threads = n / 8 / k;
+        }
+        m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
+            thread_state);
+    } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
+            T, SA, n, k, buckets, threads);
     }
 
     return m;
 }
 
-static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2)
-    {
-        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
+    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) /
+                                        ((2 * (fast_sint_t)k + 15) & (-16)));
+    if (max_threads > threads) {
+        max_threads = threads;
     }
-    else
+    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
+        if (max_threads > n / 8 / k) {
+            max_threads = n / 8 / k;
+        }
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
+            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
+            thread_state);
+    } else
 #else
     UNUSED(thread_state);
 #endif
     {
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
+            T, SA, n, k, buckets, threads);
     }
 }
 
-static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T,
+                                       sa_sint_t n, sa_sint_t k,
+                                       sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
     memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
 
     fast_sint_t i, j;
-    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
-    {
+    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) {
         libsais_prefetch(&T[i + prefetch_distance]);
 
         buckets[T[i + 0]]++;
@@ -1346,109 +1724,139 @@ static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n
         buckets[T[i + 7]]++;
     }
 
-    for (j += 7; i < j; i += 1)
-    {
+    for (j += 7; i < j; i += 1) {
         buckets[T[i]]++;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
-{
+static void libsais_initialize_buckets_start_and_end_8u(
+    sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
     sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT bucket_end   = &buckets[7 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
-    if (freq != NULL)
-    {
-        fast_sint_t i, j; sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
-        {
+    if (freq != NULL) {
+        fast_sint_t i, j;
+        sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = 0;
+             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+             i += BUCKETS_INDEX4(1, 0), j += 1) {
             bucket_start[j] = sum;
-            sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
+            sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] +
+                              buckets[i + BUCKETS_INDEX4(0, 1)] +
+                              buckets[i + BUCKETS_INDEX4(0, 2)] +
+                              buckets[i + BUCKETS_INDEX4(0, 3)]);
             bucket_end[j] = sum;
         }
-    }
-    else
-    {
-        fast_sint_t i, j; sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
-        {
+    } else {
+        fast_sint_t i, j;
+        sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = 0;
+             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+             i += BUCKETS_INDEX4(1, 0), j += 1) {
             bucket_start[j] = sum;
-            sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+            sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
+                   buckets[i + BUCKETS_INDEX4(0, 1)] +
+                   buckets[i + BUCKETS_INDEX4(0, 2)] +
+                   buckets[i + BUCKETS_INDEX4(0, 3)];
             bucket_end[j] = sum;
         }
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_initialize_buckets_start_and_end_32s_6k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
-    sa_sint_t * RESTRICT bucket_end   = &buckets[5 * k];
+    sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
-    fast_sint_t i, j; sa_sint_t sum = 0;
-    for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
-    {
+    fast_sint_t i, j;
+    sa_sint_t sum = 0;
+    for (i = BUCKETS_INDEX4(0, 0), j = 0;
+         i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX4(1, 0), j += 1) {
         bucket_start[j] = sum;
-        sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+        sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
+               buckets[i + BUCKETS_INDEX4(0, 1)] +
+               buckets[i + BUCKETS_INDEX4(0, 2)] +
+               buckets[i + BUCKETS_INDEX4(0, 3)];
         bucket_end[j] = sum;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_initialize_buckets_start_and_end_32s_4k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
-    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * k];
+    sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
-    fast_sint_t i, j; sa_sint_t sum = 0;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
-    { 
+    fast_sint_t i, j;
+    sa_sint_t sum = 0;
+    for (i = BUCKETS_INDEX2(0, 0), j = 0;
+         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0), j += 1) {
         bucket_start[j] = sum;
-        sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum += buckets[i + BUCKETS_INDEX2(0, 0)] +
+               buckets[i + BUCKETS_INDEX2(0, 1)];
         bucket_end[j] = sum;
     }
 }
 
-static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
-    fast_sint_t i; sa_sint_t sum0 = 0;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
-    { 
-        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
+static void libsais_initialize_buckets_end_32s_2k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    fast_sint_t i;
+    sa_sint_t sum0 = 0;
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0)) {
+        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
+                buckets[i + BUCKETS_INDEX2(0, 1)];
+        buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
     }
 }
 
-static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_initialize_buckets_start_and_end_32s_2k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     fast_sint_t i, j;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
-    {
+    for (i = BUCKETS_INDEX2(0, 0), j = 0;
+         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0), j += 1) {
         buckets[j] = buckets[i];
     }
 
-    buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
+    buckets[k] = 0;
+    memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
 }
 
-static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
-    fast_sint_t i; sa_sint_t sum = 0;
-    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; }
+static void libsais_initialize_buckets_start_32s_1k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    fast_sint_t i;
+    sa_sint_t sum = 0;
+    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
+        sa_sint_t tmp = buckets[i];
+        buckets[i] = sum;
+        sum += tmp;
+    }
 }
 
-static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
-    fast_sint_t i; sa_sint_t sum = 0;
-    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; }
+static void libsais_initialize_buckets_end_32s_1k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    fast_sint_t i;
+    sa_sint_t sum = 0;
+    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
+        sum += buckets[i];
+        buckets[i] = sum;
+    }
 }
 
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
-{
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix) {
     {
-        fast_uint_t     s = 0;
-        fast_sint_t     c0 = T[first_lms_suffix];
-        fast_sint_t     c1 = 0;
+        fast_uint_t s = 0;
+        fast_sint_t c0 = T[first_lms_suffix];
+        fast_sint_t c1 = 0;
 
-        for (; --first_lms_suffix >= 0; )
-        {
-            c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        for (; --first_lms_suffix >= 0;) {
+            c1 = c0;
+            c0 = T[first_lms_suffix];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
         }
 
@@ -1458,42 +1866,52 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const
     {
         sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
 
-        fast_sint_t i, j; sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
-        { 
-            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+        fast_sint_t i, j;
+        sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
+             i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+             i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
+            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum;
+            sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
+                   buckets[i + BUCKETS_INDEX4(0, 3)];
+            temp_bucket[j] = sum;
         }
 
         return sum;
     }
 }
 
-static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
-{
+static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix) {
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
 
-    fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
-    { 
-        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+    fast_sint_t i;
+    sa_sint_t sum0 = 0, sum1 = 0;
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0)) {
+        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
+                buckets[i + BUCKETS_INDEX2(0, 1)];
         sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
-        
+
         buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
         buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
     }
 }
 
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
-{
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix) {
     {
-        fast_uint_t     s = 0;
-        fast_sint_t     c0 = T[first_lms_suffix];
-        fast_sint_t     c1 = 0;
+        fast_uint_t s = 0;
+        fast_sint_t c0 = T[first_lms_suffix];
+        fast_sint_t c1 = 0;
 
-        for (; --first_lms_suffix >= 0; )
-        {
-            c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        for (; --first_lms_suffix >= 0;) {
+            c1 = c0;
+            c0 = T[first_lms_suffix];
+            s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
             buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
         }
 
@@ -1503,44 +1921,55 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(c
     {
         sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
-        fast_sint_t i, j; sa_sint_t sum = 0;
-        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
-        { 
-            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
+        fast_sint_t i, j;
+        sa_sint_t sum = 0;
+        for (i = BUCKETS_INDEX4(0, 0), j = 0;
+             i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+             i += BUCKETS_INDEX4(1, 0), j += 1) {
+            sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
+                   buckets[i + BUCKETS_INDEX4(0, 3)];
+            temp_bucket[j] = sum;
         }
 
         return sum;
     }
 }
 
-static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
-{
+static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix) {
     sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
-    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * k];
+    sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
     buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
 
-    fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
-    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
-    { 
+    fast_sint_t i, j;
+    sa_sint_t sum0 = 0, sum1 = 0;
+    for (i = BUCKETS_INDEX2(0, 0), j = 0;
+         i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0), j += 1) {
         bucket_start[j] = sum1;
 
         sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
-        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
+        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] +
+                buckets[i + BUCKETS_INDEX2(0, 1)];
         buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
 
         bucket_end[j] = sum1;
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 3;
+         i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
@@ -1548,75 +1977,94 @@ static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_si
         libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
         libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
 
-        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
-        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
-        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
-        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+        sa_sint_t p0 = SA[i - 0];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+        sa_sint_t p1 = SA[i - 1];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+        sa_sint_t p2 = SA[i - 2];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+        sa_sint_t p3 = SA[i - 3];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
     }
 
-    for (j -= prefetch_distance + 3; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+    for (j -= prefetch_distance + 3; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_radix_sort_lms_suffixes_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
+                                                  m >= 65536 &&                \
+                                                  omp_get_dynamic() == 0)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        if (omp_num_threads == 1)
-        {
-            libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+        if (omp_num_threads == 1) {
+            libsais_radix_sort_lms_suffixes_8u(
+                T, SA, &buckets[4 * ALPHABET_SIZE],
+                (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
                 sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
+                sa_sint_t * RESTRICT dst_bucket =
+                    thread_state[omp_thread_num].state.buckets;
 
                 fast_sint_t i, j;
-                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
-                {
+                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1);
+                     i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0);
+                     i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) {
                     dst_bucket[i] = src_bucket[i] - dst_bucket[j];
                 }
             }
 
             {
-                fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
-                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;
+                fast_sint_t t,
+                    omp_block_start = 0,
+                    omp_block_size = thread_state[omp_thread_num].state.m;
+                for (t = omp_num_threads - 1; t >= omp_thread_num; --t)
+                    omp_block_start += thread_state[t].state.m;
 
-                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
-                {
-                    omp_block_start -= 1; omp_block_size -= 1;
+                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) {
+                    omp_block_start -= 1;
+                    omp_block_size -= 1;
                 }
 
-                libsais_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
+                libsais_radix_sort_lms_suffixes_8u(
+                    T, SA, thread_state[omp_thread_num].state.buckets,
+                    (fast_sint_t)n - omp_block_start, omp_block_size);
             }
         }
 #endif
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 3;
+         i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-        
+
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
@@ -1627,58 +2075,76 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T,
         libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
         libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
 
-        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
-        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
-        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
-        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
+        sa_sint_t p0 = SA[i - 0];
+        SA[--induction_bucket[T[p0]]] = p0;
+        sa_sint_t p1 = SA[i - 1];
+        SA[--induction_bucket[T[p1]]] = p1;
+        sa_sint_t p2 = SA[i - 2];
+        SA[--induction_bucket[T[p2]]] = p2;
+        sa_sint_t p3 = SA[i - 3];
+        SA[--induction_bucket[T[p3]]] = p3;
     }
 
-    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
+    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[--induction_bucket[T[p]]] = p;
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 3;
+         i >= j; i -= 4) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-        
+
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
 
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            T[SA[i - prefetch_distance - 0]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            T[SA[i - prefetch_distance - 1]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            T[SA[i - prefetch_distance - 2]], 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            T[SA[i - prefetch_distance - 3]], 0)]);
 
-        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
-        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
-        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
-        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+        sa_sint_t p0 = SA[i - 0];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
+        sa_sint_t p1 = SA[i - 1];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+        sa_sint_t p2 = SA[i - 2];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+        sa_sint_t p3 = SA[i - 3];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
     }
 
-    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
     }
 }
 
 #if defined(_OPENMP)
 
-static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_32s_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0]]);
@@ -1694,25 +2160,31 @@ static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * R
         cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         cache[i].symbol = T[cache[i].index = SA[i]];
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
+    sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 3;
+         i >= j; i -= 4) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+        libsais_prefetchw(
+            &induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+        libsais_prefetchw(
+            &induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+        libsais_prefetchw(
+            &induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+        libsais_prefetchw(
+            &induction_bucket[cache[i - prefetch_distance - 3].symbol]);
 
         cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
         cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
@@ -1720,153 +2192,193 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRI
         cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
     }
 
-    for (j -= prefetch_distance + 3; i >= j; i -= 1)
-    {
+    for (j -= prefetch_distance + 3; i >= j; i -= 1) {
         cache[i].symbol = --induction_bucket[cache[i].symbol];
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
+    sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 3;
+         i >= j; i -= 4) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
-        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
-
-        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
-        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
-        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
-        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
-    }
-
-    for (j -= prefetch_distance + 3; i >= j; i -= 1)
-    {
-        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
-    }
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            cache[i - prefetch_distance - 0].symbol, 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            cache[i - prefetch_distance - 1].symbol, 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            cache[i - prefetch_distance - 2].symbol, 0)]);
+        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
+            cache[i - prefetch_distance - 3].symbol, 0)]);
+
+        cache[i - 0].symbol =
+            --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
+        cache[i - 1].symbol =
+            --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+        cache[i - 2].symbol =
+            --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+        cache[i - 3].symbol =
+            --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+    }
+
+    for (j -= prefetch_distance + 3; i >= j; i -= 1) {
+        cache[i].symbol =
+            --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+    }
+}
+
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_radix_sort_lms_suffixes_32s_6k(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_radix_sort_lms_suffixes_32s_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+                libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
+                    induction_bucket, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start,
+                                              omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_radix_sort_lms_suffixes_32s_2k(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_radix_sort_lms_suffixes_32s_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
+                libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
+                    induction_bucket, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start,
+                                              omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || m < 65536)
-    {
-        libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || m < 65536) {
+        libsais_radix_sort_lms_suffixes_32s_6k(
+            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+            (fast_sint_t)m - 1);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+        for (block_start = 0; block_start < (fast_sint_t)m - 1;
+             block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end >= m) {
+                block_end = (fast_sint_t)m - 1;
+            }
 
-            libsais_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+            libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
+                T, SA, induction_bucket, thread_state[0].state.cache,
+                (fast_sint_t)n - block_end, block_end - block_start, threads);
         }
     }
 #else
@@ -1874,21 +2386,29 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRIC
 #endif
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || m < 65536)
-    {
-        libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || m < 65536) {
+        libsais_radix_sort_lms_suffixes_32s_2k(
+            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+            (fast_sint_t)m - 1);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }
+        for (block_start = 0; block_start < (fast_sint_t)m - 1;
+             block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end >= m) {
+                block_end = (fast_sint_t)m - 1;
+            }
 
-            libsais_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
+            libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
+                T, SA, induction_bucket, thread_state[0].state.cache,
+                (fast_sint_t)n - block_end, block_end - block_start, threads);
         }
     }
 #else
@@ -1896,19 +2416,19 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRIC
 #endif
 }
 
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
-{
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t             i = n - 2;
-    sa_sint_t             m = 0;
-    fast_uint_t           s = 1;
-    fast_sint_t           c0 = T[n - 1];
-    fast_sint_t           c1 = 0;
-    fast_sint_t           c2 = 0;
+    sa_sint_t i = n - 2;
+    sa_sint_t m = 0;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
+    fast_sint_t c2 = 0;
 
-    for (; i >= prefetch_distance + 3; i -= 4)
-    {
+    for (; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
         libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
@@ -1916,40 +2436,61 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRI
         libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
         libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
 
-        c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); 
-        if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; }
-        
-        c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
-        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; }
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c0]] = i + 1;
+            m++;
+        }
+
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i - 0;
+            m++;
+        }
 
-        c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); 
-        if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; }
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c0]] = i - 1;
+            m++;
+        }
 
-        c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
-        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i - 2;
+            m++;
+        }
     }
 
-    for (; i >= 0; i -= 1)
-    {
-        c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); 
-        if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i + 1;
+            m++;
+        }
     }
 
-    if (m > 1)
-    {
+    if (m > 1) {
         SA[buckets[c2]] = 0;
     }
 
     return m;
 }
 
-static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_set_markers_32s_6k(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
@@ -1963,25 +2504,31 @@ static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_si
         SA[induction_bucket[i + 3]] |= SAINT_MIN;
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         SA[induction_bucket[i]] |= SAINT_MIN;
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_radix_sort_set_markers_32s_4k(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
-        libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
-
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
-        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
+        libsais_prefetch(
+            &induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
+            i + prefetch_distance + 0, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
+            i + prefetch_distance + 1, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
+            i + prefetch_distance + 2, 0)]]);
+        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
+            i + prefetch_distance + 3, 0)]]);
 
         SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
         SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
@@ -1989,70 +2536,83 @@ static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_si
         SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
-{
+static void libsais_radix_sort_set_markers_32s_6k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
+    sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : (fast_sint_t)k - 1 - omp_block_start;
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = (fast_sint_t)k - 1;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)k - 1;
 #endif
 
-        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
+        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket,
+                                              omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
-{
+static void libsais_radix_sort_set_markers_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
+    sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : (fast_sint_t)k - 1 - omp_block_start;
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = (fast_sint_t)k - 1;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)k - 1;
 #endif
 
-        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
+        libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket,
+                                              omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
-{
+static void libsais_initialize_buckets_for_partial_sorting_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
 
     buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
 
-    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
-    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
-    { 
+    fast_sint_t i, j;
+    sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
+    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
+         i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+         i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
 
-        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
+        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] +
+                buckets[i + BUCKETS_INDEX4(0, 2)];
         sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
 
         buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
@@ -2060,13 +2620,17 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RE
     }
 }
 
-static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
-{
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
-    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
-    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
-    {
+    fast_sint_t i, j;
+    sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
+    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0),
+        j = BUCKETS_INDEX2(0, 0);
+         i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0);
+         i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
         sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
         sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
@@ -2077,14 +2641,16 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_
         buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
         buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
 
-        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+        sum0 += SS + SL;
+        sum1 += LS;
+        sum2 += LS + LL;
 
         temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
         temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
     }
 
-    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
-    { 
+    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
         sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
         sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
         sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
@@ -2095,23 +2661,28 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_
         buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
         buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
 
-        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;
+        sum0 += SS + SL;
+        sum1 += LS;
+        sum2 += LS + LL;
 
         temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
         temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2119,17 +2690,33 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * R
         libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
-        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
-
-        sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
-        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
-        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+        sa_sint_t p0 = SA[i + 0];
+        d += (p0 < 0);
+        p0 &= SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+        SA[induction_bucket[v0]++] =
+            (p0 - 1) |
+            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        distinct_names[v0] = d;
+
+        sa_sint_t p1 = SA[i + 1];
+        d += (p1 < 0);
+        p1 &= SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+        SA[induction_bucket[v1]++] =
+            (p1 - 1) |
+            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        distinct_names[v1] = d;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+        SA[induction_bucket[v]++] =
+            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        distinct_names[v] = d;
     }
 
     return d;
@@ -2137,18 +2724,23 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * R
 
 #if defined(_OPENMP)
 
-static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
-{
+static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+    LIBSAIS_THREAD_STATE * RESTRICT state) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
-    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    fast_sint_t i, j, count = 0;
+    sa_sint_t d = 1;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2156,105 +2748,164 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const ui
         libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
-        sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
-    }
-
-    state[0].state.position   = (fast_sint_t)d - 1;
-    state[0].state.count      = count;
-}
-
-static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
-{
+        sa_sint_t p0 = cache[count].index = SA[i + 0];
+        d += (p0 < 0);
+        p0 &= SAINT_MAX;
+        sa_sint_t v0 = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+        induction_bucket[v0]++;
+        distinct_names[v0] = d;
+        sa_sint_t p1 = cache[count].index = SA[i + 1];
+        d += (p1 < 0);
+        p1 &= SAINT_MAX;
+        sa_sint_t v1 = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+        induction_bucket[v1]++;
+        distinct_names[v1] = d;
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = cache[count].index = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+        induction_bucket[v]++;
+        distinct_names[v] = d;
+    }
+
+    state[0].state.position = (fast_sint_t)d - 1;
+    state[0].state.count = count;
+}
+
+static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 1; i < j; i += 2)
-    {
+    for (i = 0, j = count - 1; i < j; i += 2) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
-        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
-        SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+        sa_sint_t p0 = cache[i + 0].index;
+        d += (p0 < 0);
+        sa_sint_t v0 = cache[i + 0].symbol;
+        SA[induction_bucket[v0]++] =
+            (p0 - 1) |
+            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        distinct_names[v0] = d;
 
-        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
-        SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+        sa_sint_t p1 = cache[i + 1].index;
+        d += (p1 < 0);
+        sa_sint_t v1 = cache[i + 1].symbol;
+        SA[induction_bucket[v1]++] =
+            (p1 - 1) |
+            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        distinct_names[v1] = d;
     }
 
-    for (j += 1; i < j; i += 1)
-    {
-        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
-        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    for (j += 1; i < j; i += 1) {
+        sa_sint_t p = cache[i].index;
+        d += (p < 0);
+        sa_sint_t v = cache[i].symbol;
+        SA[induction_bucket[v]++] =
+            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        distinct_names[v] = d;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_left_to_right_8u(
+                T, SA, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+                libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
+                    T, SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache, omp_block_start,
+                    omp_block_size, &thread_state[omp_thread_num]);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT induction_bucket =
+                    &buckets[4 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names =
+                    &buckets[2 * ALPHABET_SIZE];
 
                 fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t)
-                {
-                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
-                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
-
-                    fast_sint_t c; 
-                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; }
+                for (t = 0; t < omp_num_threads; ++t) {
+                    sa_sint_t * RESTRICT temp_induction_bucket =
+                        &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names =
+                        &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+                    fast_sint_t c;
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c],
+                                  B = temp_induction_bucket[c];
+                        induction_bucket[c] = A + B;
+                        temp_induction_bucket[c] = A;
+                    }
 
-                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
-                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = distinct_names[c],
+                                  B = temp_distinct_names[c], D = B + d;
+                        distinct_names[c] = B > 0 ? D : A;
+                        temp_distinct_names[c] = A;
+                    }
+                    d += 1 + (sa_sint_t)thread_state[t].state.position;
+                    thread_state[t].state.position =
+                        (fast_sint_t)d - thread_state[t].state.position;
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+                libsais_partial_sorting_scan_left_to_right_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count,
+                    (sa_sint_t)thread_state[omp_thread_num].state.position);
             }
         }
-#endif
+    #endif
     }
 
     return d;
@@ -2262,45 +2913,57 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const u
 
 #endif
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
-    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] =
+        (n - 1) | SAINT_MIN;
     distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
-    if (threads == 1 || left_suffixes_count < 65536)
-    {
-        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
+    if (threads == 1 || left_suffixes_count < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0,
+                                                          left_suffixes_count);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = 0; block_start < left_suffixes_count; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = 0; block_start < left_suffixes_count;) {
+            if (SA[block_start] == 0) {
                 block_start++;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}
-                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
-                fast_sint_t block_size    = block_end - block_start;
-
-                if (block_size < 32)
-                {
-                    for (; block_start < block_end; block_start += 1)
-                    {
-                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
-                        SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start +
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end > left_suffixes_count) {
+                    block_max_end = left_suffixes_count;
                 }
-                else
-                {
-                    d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, buckets, d, block_start, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start + 1;
+                while (block_end < block_max_end && SA[block_end] != 0) {
+                    block_end++;
+                }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32) {
+                    for (; block_start < block_end; block_start += 1) {
+                        sa_sint_t p = SA[block_start];
+                        d += (p < 0);
+                        p &= SAINT_MAX;
+                        sa_sint_t v =
+                            BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+                        SA[induction_bucket[v]++] =
+                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
+                                       << (SAINT_BIT - 1));
+                        distinct_names[v] = d;
+                    }
+                } else {
+                    d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(
+                        T, SA, buckets, d, block_start, block_size, threads,
+                        thread_state);
                     block_start = block_end;
                 }
             }
@@ -2313,13 +2976,16 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2327,103 +2993,201 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint
         libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
-        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);
-
-        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
-        SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
-
-        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
-        SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
-    }
-
-    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
-        SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
+        libsais_prefetchw(&buckets[v0]);
+        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
+        libsais_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i + 0];
+        d += (p2 < 0);
+        p2 &= SAINT_MAX;
+        sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
+        SA[buckets[v2]++] =
+            (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+        buckets[2 + v2] = d;
+
+        sa_sint_t p3 = SA[i + 1];
+        d += (p3 < 0);
+        p3 &= SAINT_MAX;
+        sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
+        SA[buckets[v3]++] =
+            (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+        buckets[2 + v3] = d;
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+        SA[buckets[v]++] =
+            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        buckets[2 + v] = d;
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
-        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
-
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
-        if (p0 > 0)
-        {
-            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
-            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
-        }
-
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
-        if (p1 > 0)
-        {
-            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
-            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
-        }
-    }
-
-    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
-        if (p > 0)
-        {
-            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
-            SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+        if (s2 > 0) {
+            const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts2]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
+        }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+        if (s3 > 0) {
+            const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts3]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
+        }
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            SA[i + 0] = 0;
+            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
+            p0 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+            SA[induction_bucket[T[p0 - 1]]++] =
+                (p0 - 1) |
+                ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v0] != d)
+                 << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v0] = d;
+        }
+
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            SA[i + 1] = 0;
+            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+            p1 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+            SA[induction_bucket[T[p1 - 1]]++] =
+                (p1 - 1) |
+                ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v1] != d)
+                 << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v1] = d;
+        }
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            SA[i] = 0;
+            d += (p >> (SUFFIX_GROUP_BIT - 1));
+            p &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+            SA[induction_bucket[T[p - 1]]++] =
+                (p - 1) |
+                ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v] = d;
         }
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
-        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
-
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
-    }
-
-    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
+        }
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            SA[i + 0] = 0;
+            SA[induction_bucket[T[p0 - 1]]++] =
+                (p0 - 1) |
+                ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            SA[i + 1] = 0;
+            SA[induction_bucket[T[p1 - 1]]++] =
+                (p1 - 1) |
+                ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
+        }
+    }
+
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            SA[i] = 0;
+            SA[induction_bucket[T[p - 1]]++] =
+                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+        }
     }
 }
 
 #if defined(_OPENMP)
 
-static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2433,344 +3197,582 @@ static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
-        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0];
+        sa_sint_t symbol0 = 0;
+        p0 &= SAINT_MAX;
+        if (p0 != 0) {
+            symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1];
+        sa_sint_t symbol1 = 0;
+        p1 &= SAINT_MAX;
+        if (p1 != 0) {
+            symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+        }
+        cache[i + 1].symbol = symbol1;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = cache[i].index = SA[i];
+        sa_sint_t symbol = 0;
+        p &= SAINT_MAX;
+        if (p != 0) {
+            symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        if (p0 > 0) {
+            cache[i + 0].index = p0;
+            p0 &= ~SUFFIX_GROUP_MARKER;
+            symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+            p0 = 0;
+        }
+        cache[i + 0].symbol = symbol0;
+        SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        if (p1 > 0) {
+            cache[i + 1].index = p1;
+            p1 &= ~SUFFIX_GROUP_MARKER;
+            symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+            p1 = 0;
+        }
+        cache[i + 1].symbol = symbol1;
+        SA[i + 1] = p1 & SAINT_MAX;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        if (p > 0) {
+            cache[i].index = p;
+            p &= ~SUFFIX_GROUP_MARKER;
+            symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+            p = 0;
+        }
+        cache[i].symbol = symbol;
+        SA[i] = p & SAINT_MAX;
     }
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        if (p0 > 0) {
+            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1])
+                                             << (SAINT_BIT - 1));
+            symbol0 = T[p0 - 1];
+            p0 = 0;
+        }
+        cache[i + 0].symbol = symbol0;
+        SA[i + 0] = p0 & SAINT_MAX;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        if (p1 > 0) {
+            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1])
+                                             << (SAINT_BIT - 1));
+            symbol1 = T[p1 - 1];
+            p1 = 0;
+        }
+        cache[i + 1].symbol = symbol1;
+        SA[i + 1] = p1 & SAINT_MAX;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        if (p > 0) {
+            cache[i].index =
+                (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+            symbol = T[p - 1];
+            p = 0;
+        }
+        cache[i].symbol = symbol;
+        SA[i] = p & SAINT_MAX;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
         libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);
 
-        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
-        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
-
-        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
-        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
-        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
+        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index;
+        d += (p0 < 0);
+        cache[i + 0].symbol = buckets[v0]++;
+        cache[i + 0].index =
+            (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+        buckets[2 + v0] = d;
+        if (cache[i + 0].symbol < omp_block_end) {
+            sa_sint_t s = cache[i + 0].symbol,
+                      q = (cache[s].index = cache[i + 0].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
+        }
+
+        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index;
+        d += (p1 < 0);
+        cache[i + 1].symbol = buckets[v1]++;
+        cache[i + 1].index =
+            (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+        buckets[2 + v1] = d;
+        if (cache[i + 1].symbol < omp_block_end) {
+            sa_sint_t s = cache[i + 1].symbol,
+                      q = (cache[s].index = cache[i + 1].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index;
+        d += (p < 0);
+        cache[i].symbol = buckets[v]++;
+        cache[i].index =
+            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        buckets[2 + v] = d;
+        if (cache[i].symbol < omp_block_end) {
+            sa_sint_t s = cache[i].symbol,
+                      q = (cache[s].index = cache[i].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
+        }
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); 
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
-        
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0 >> 1];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        const sa_sint_t * Ds0 = &distinct_names[s0];
+        libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1 >> 1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+        const sa_sint_t * Ds1 = &distinct_names[s1];
+        libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
+
         sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0)
-        {
-            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
-            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+        if (v0 >= 0) {
+            sa_sint_t p0 = cache[i + 0].index;
+            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
+            cache[i + 0].symbol = induction_bucket[v0 >> 1]++;
+            cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
+                                 ((sa_sint_t)(distinct_names[v0] != d)
+                                  << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v0] = d;
+            if (cache[i + 0].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
+                if (np > 0) {
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    np = 0;
+                }
+                cache[i + 0].index = np & SAINT_MAX;
+            }
         }
 
         sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0)
-        {
-            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
-            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+        if (v1 >= 0) {
+            sa_sint_t p1 = cache[i + 1].index;
+            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+            cache[i + 1].symbol = induction_bucket[v1 >> 1]++;
+            cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
+                                 ((sa_sint_t)(distinct_names[v1] != d)
+                                  << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v1] = d;
+            if (cache[i + 1].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
+                if (np > 0) {
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    np = 0;
+                }
+                cache[i + 1].index = np & SAINT_MAX;
+            }
         }
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
-            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
-            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; }
+        if (v >= 0) {
+            sa_sint_t p = cache[i].index;
+            d += (p >> (SUFFIX_GROUP_BIT - 1));
+            cache[i].symbol = induction_bucket[v >> 1]++;
+            cache[i].index =
+                (p - 1) | (v << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v] = d;
+            if (cache[i].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                if (np > 0) {
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+                    np = 0;
+                }
+                cache[i].index = np & SAINT_MAX;
+            }
         }
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-        
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
         sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0)
-        {
+        if (v0 >= 0) {
             cache[i + 0].symbol = induction_bucket[v0]++;
-            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
+            if (cache[i + 0].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
+                if (np > 0) {
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                    np = 0;
+                }
+                cache[i + 0].index = np & SAINT_MAX;
+            }
         }
 
         sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0)
-        {
+        if (v1 >= 0) {
             cache[i + 1].symbol = induction_bucket[v1]++;
-            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
+            if (cache[i + 1].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
+                if (np > 0) {
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                    np = 0;
+                }
+                cache[i + 1].index = np & SAINT_MAX;
+            }
         }
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
+        if (v >= 0) {
             cache[i].symbol = induction_bucket[v]++;
-            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
+            if (cache[i].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                if (np > 0) {
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                    np = 0;
+                }
+                cache[i].index = np & SAINT_MAX;
+            }
         }
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_left_to_right_32s_6k(
+                T, SA, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+                d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
+                    T, buckets, d, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start,
+                                              omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_left_to_right_32s_4k(
+                T, SA, k, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+                d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
+                    T, k, buckets, d, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_partial_sorting_scan_left_to_right_32s_1k(
+                T, SA, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+                libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
+                    T, buckets, cache - block_start, block_start, block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] =
+        (n - 1) | SAINT_MIN;
     buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
-    if (threads == 1 || left_suffixes_count < 65536)
-    {
-        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
+    if (threads == 1 || left_suffixes_count < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_32s_6k(
+            T, SA, buckets, d, 0, left_suffixes_count);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }
+        for (block_start = 0; block_start < left_suffixes_count;
+             block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end > left_suffixes_count) {
+                block_end = left_suffixes_count;
+            }
 
-            d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+            d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
+                T, SA, buckets, d, thread_state[0].state.cache, block_start,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -2780,27 +3782,35 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
-    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
+    SA[induction_bucket[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) |
+        SUFFIX_GROUP_MARKER;
     distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
 
-    if (threads == 1 || n < 65536)
-    {
-        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
+    if (threads == 1 || n < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets,
+                                                              d, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+        for (block_start = 0; block_start < n; block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end > n) {
+                block_end = n;
+            }
 
-            d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+            d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
+                T, SA, k, buckets, d, thread_state[0].state.cache, block_start,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -2810,23 +3820,29 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
-    if (threads == 1 || n < 65536)
-    {
-       libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+    if (threads == 1 || n < 65536) {
+        libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+        for (block_start = 0; block_start < n; block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end > n) {
+                block_end = n;
+            }
 
-            libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+            libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
+                T, SA, buckets, thread_state[0].state.cache, block_start,
+                block_end - block_start, threads);
         }
     }
 #else
@@ -2834,8 +3850,9 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_
 #endif
 }
 
-static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static void libsais_partial_sorting_shift_markers_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets,
+    sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
@@ -2843,106 +3860,155 @@ static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel for schedule(static, 1) \
+        num_threads(threads) if (threads > 1 && n >= 65536)
 #else
-    UNUSED(threads); UNUSED(n);
+    UNUSED(threads);
+    UNUSED(n);
 #endif
-    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
-    {
-        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
-        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
-        {
+    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0);
+         c -= BUCKETS_INDEX2(1, 0)) {
+        fast_sint_t i, j;
+        sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)temp_bucket[c] - 1,
+            j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3;
+             i >= j; i -= 4) {
             libsais_prefetchw(&SA[i - prefetch_distance]);
 
-            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
-            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
-            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
-            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
+            s = s ^ q0;
+            SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+            s = s ^ q1;
+            SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+            s = s ^ q2;
+            SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+            s = s ^ q3;
+            SA[i - 3] = p3 ^ q3;
         }
 
-        for (j -= 3; i >= j; i -= 1)
-        {
-            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        for (j -= 3; i >= j; i -= 1) {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+            s = s ^ q;
+            SA[i] = p ^ q;
         }
     }
 }
 
-static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets,
+    sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
-    
+
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
+    #pragma omp parallel for schedule(static, 1) \
+        num_threads(threads) if (threads > 1 && k >= 65536)
 #else
     UNUSED(threads);
 #endif
-    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
-    {
-        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
-        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
-        {
+    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) {
+        fast_sint_t i, j;
+        sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1,
+            j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3;
+             i >= j; i -= 4) {
             libsais_prefetchw(&SA[i - prefetch_distance]);
 
-            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
-            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
-            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
-            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
+            s = s ^ q0;
+            SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+            s = s ^ q1;
+            SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+            s = s ^ q2;
+            SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+            s = s ^ q3;
+            SA[i - 3] = p3 ^ q3;
         }
 
-        for (j -= 3; i >= j; i -= 1)
-        {
-            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
+        for (j -= 3; i >= j; i -= 1) {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+            s = s ^ q;
+            SA[i] = p ^ q;
         }
     }
 }
 
-static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
-{
+static void libsais_partial_sorting_shift_markers_32s_4k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
-    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
-    {
+    fast_sint_t i;
+    sa_sint_t s = SUFFIX_GROUP_MARKER;
+    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
         libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
-        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
-        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
-        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
-    }
-
-    for (; i >= 0; i -= 1)
-    {
-        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
-    }
-}
-
-static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
-{
+        sa_sint_t p0 = SA[i - 0],
+                  q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
+                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q0;
+        SA[i - 0] = p0 ^ q0;
+        sa_sint_t p1 = SA[i - 1],
+                  q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
+                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q1;
+        SA[i - 1] = p1 ^ q1;
+        sa_sint_t p2 = SA[i - 2],
+                  q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
+                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q2;
+        SA[i - 2] = p2 ^ q2;
+        sa_sint_t p3 = SA[i - 3],
+                  q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
+                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q3;
+        SA[i - 3] = p3 ^ q3;
+    }
+
+    for (; i >= 0; i -= 1) {
+        sa_sint_t p = SA[i],
+                  q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
+                      ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q;
+        SA[i] = p ^ q;
+    }
+}
+
+static void libsais_partial_sorting_shift_buckets_32s_6k(
+    sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
     fast_sint_t i;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
-    {
-        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
-        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0)) {
+        buckets[2 * i + BUCKETS_INDEX4(0, 0)] =
+            temp_bucket[i + BUCKETS_INDEX2(0, 0)];
+        buckets[2 * i + BUCKETS_INDEX4(0, 1)] =
+            temp_bucket[i + BUCKETS_INDEX2(0, 1)];
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
@@ -2950,17 +4016,33 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * R
         libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+        sa_sint_t p0 = SA[i - 0];
+        d += (p0 < 0);
+        p0 &= SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        SA[--induction_bucket[v0]] =
+            (p0 - 1) |
+            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        distinct_names[v0] = d;
 
-        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+        sa_sint_t p1 = SA[i - 1];
+        d += (p1 < 0);
+        p1 &= SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        SA[--induction_bucket[v1]] =
+            (p1 - 1) |
+            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        distinct_names[v1] = d;
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--induction_bucket[v]] =
+            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        distinct_names[v] = d;
     }
 
     return d;
@@ -2968,18 +4050,23 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * R
 
 #if defined(_OPENMP)
 
-static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
-{
+static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+    LIBSAIS_THREAD_STATE * RESTRICT state) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
-    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    fast_sint_t i, j, count = 0;
+    sa_sint_t d = 1;
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
@@ -2987,105 +4074,164 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const ui
         libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
-        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
-    }
-
-    state[0].state.position   = (fast_sint_t)d - 1;
-    state[0].state.count      = count;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
-{
+        sa_sint_t p0 = cache[count].index = SA[i - 0];
+        d += (p0 < 0);
+        p0 &= SAINT_MAX;
+        sa_sint_t v0 = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        induction_bucket[v0]++;
+        distinct_names[v0] = d;
+        sa_sint_t p1 = cache[count].index = SA[i - 1];
+        d += (p1 < 0);
+        p1 &= SAINT_MAX;
+        sa_sint_t v1 = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        induction_bucket[v1]++;
+        distinct_names[v1] = d;
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = cache[count].index = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = cache[count++].symbol =
+            BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        induction_bucket[v]++;
+        distinct_names[v] = d;
+    }
+
+    state[0].state.position = (fast_sint_t)d - 1;
+    state[0].state.count = count;
+}
+
+static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 1; i < j; i += 2)
-    {
+    for (i = 0, j = count - 1; i < j; i += 2) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
-        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
-        SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
+        sa_sint_t p0 = cache[i + 0].index;
+        d += (p0 < 0);
+        sa_sint_t v0 = cache[i + 0].symbol;
+        SA[--induction_bucket[v0]] =
+            (p0 - 1) |
+            ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        distinct_names[v0] = d;
 
-        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
-        SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
+        sa_sint_t p1 = cache[i + 1].index;
+        d += (p1 < 0);
+        sa_sint_t v1 = cache[i + 1].symbol;
+        SA[--induction_bucket[v1]] =
+            (p1 - 1) |
+            ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        distinct_names[v1] = d;
     }
 
-    for (j += 1; i < j; i += 1)
-    {
-        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
-        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
+    for (j += 1; i < j; i += 1) {
+        sa_sint_t p = cache[i].index;
+        d += (p < 0);
+        sa_sint_t v = cache[i].symbol;
+        SA[--induction_bucket[v]] =
+            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        distinct_names[v] = d;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_right_to_left_8u(
+                T, SA, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
+                libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
+                    T, SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache, omp_block_start,
+                    omp_block_size, &thread_state[omp_thread_num]);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT induction_bucket =
+                    &buckets[0 * ALPHABET_SIZE];
+                sa_sint_t * RESTRICT distinct_names =
+                    &buckets[2 * ALPHABET_SIZE];
 
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
-                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
-                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
-
-                    fast_sint_t c; 
-                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }
+                for (t = omp_num_threads - 1; t >= 0; --t) {
+                    sa_sint_t * RESTRICT temp_induction_bucket =
+                        &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
+                    sa_sint_t * RESTRICT temp_distinct_names =
+                        &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
+
+                    fast_sint_t c;
+                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c],
+                                  B = temp_induction_bucket[c];
+                        induction_bucket[c] = A - B;
+                        temp_induction_bucket[c] = A;
+                    }
 
-                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
-                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
+                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = distinct_names[c],
+                                  B = temp_distinct_names[c], D = B + d;
+                        distinct_names[c] = B > 0 ? D : A;
+                        temp_distinct_names[c] = A;
+                    }
+                    d += 1 + (sa_sint_t)thread_state[t].state.position;
+                    thread_state[t].state.position =
+                        (fast_sint_t)d - thread_state[t].state.position;
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
+                libsais_partial_sorting_scan_right_to_left_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count,
+                    (sa_sint_t)thread_state[omp_thread_num].state.position);
             }
         }
-#endif
+    #endif
     }
 
     return d;
@@ -3093,45 +4239,57 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const u
 
 #endif
 
-static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    fast_sint_t scan_start    = (fast_sint_t)left_suffixes_count + 1;
-    fast_sint_t scan_end      = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+static void libsais_partial_sorting_scan_right_to_left_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
-    if (threads == 1 || (scan_end - scan_start) < 65536)
-    {
-        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
+    if (threads == 1 || (scan_end - scan_start) < 65536) {
+        libsais_partial_sorting_scan_right_to_left_8u(
+            T, SA, buckets, d, scan_start, scan_end - scan_start);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-        sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];
+        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
         fast_sint_t block_start;
-        for (block_start = scan_end - 1; block_start >= scan_start; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = scan_end - 1; block_start >= scan_start;) {
+            if (SA[block_start] == 0) {
                 block_start--;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
-                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
-                fast_sint_t block_size    = block_start - block_end;
-
-                if (block_size < 32)
-                {
-                    for (; block_start > block_end; block_start -= 1)
-                    {
-                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-                        SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start -
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end < scan_start) {
+                    block_max_end = scan_start - 1;
                 }
-                else
-                {
-                    d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start - 1;
+                while (block_end > block_max_end && SA[block_end] != 0) {
+                    block_end--;
+                }
+                fast_sint_t block_size = block_start - block_end;
+
+                if (block_size < 32) {
+                    for (; block_start > block_end; block_start -= 1) {
+                        sa_sint_t p = SA[block_start];
+                        d += (p < 0);
+                        p &= SAINT_MAX;
+                        sa_sint_t v =
+                            BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+                        SA[--induction_bucket[v]] =
+                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
+                                       << (SAINT_BIT - 1));
+                        distinct_names[v] = d;
+                    }
+                } else {
+                    d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(
+                        T, SA, buckets, d, block_end + 1, block_size, threads,
+                        thread_state);
                     block_start = block_end;
                 }
             }
@@ -3142,13 +4300,16 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RE
 #endif
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetch(&SA[i - 3 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
@@ -3156,103 +4317,195 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
         libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
 
-        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
-        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);
-
-        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
-        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;
-
-        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
-        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
-    }
-
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
+        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
+        libsais_prefetchw(&buckets[v0]);
+        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
+        libsais_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i - 0];
+        d += (p2 < 0);
+        p2 &= SAINT_MAX;
+        sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
+        SA[--buckets[v2]] =
+            (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+        buckets[2 + v2] = d;
+
+        sa_sint_t p3 = SA[i - 1];
+        d += (p3 < 0);
+        p3 &= SAINT_MAX;
+        sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+        SA[--buckets[v3]] =
+            (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+        buckets[2 + v3] = d;
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--buckets[v]] =
+            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        buckets[2 + v] = d;
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts2]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts3]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
+        }
 
         sa_sint_t p0 = SA[i - 0];
-        if (p0 > 0)
-        {
-            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
+        if (p0 > 0) {
+            SA[i - 0] = 0;
+            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
+            p0 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+            SA[--induction_bucket[T[p0 - 1]]] =
+                (p0 - 1) |
+                ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v0] != d)
+                 << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v0] = d;
         }
 
         sa_sint_t p1 = SA[i - 1];
-        if (p1 > 0)
-        {
-            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
+        if (p1 > 0) {
+            SA[i - 1] = 0;
+            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+            p1 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+            SA[--induction_bucket[T[p1 - 1]]] =
+                (p1 - 1) |
+                ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v1] != d)
+                 << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v1] = d;
         }
     }
 
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
-    {
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t p = SA[i];
-        if (p > 0)
-        {
-            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-            SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
+        if (p > 0) {
+            SA[i] = 0;
+            d += (p >> (SUFFIX_GROUP_BIT - 1));
+            p &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+            SA[--induction_bucket[T[p - 1]]] =
+                (p - 1) |
+                ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v] = d;
         }
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
+        }
 
-        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p0 = SA[i - 0];
+        if (p0 > 0) {
+            SA[i - 0] = 0;
+            SA[--induction_bucket[T[p0 - 1]]] =
+                (p0 - 1) |
+                ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        if (p1 > 0) {
+            SA[i - 1] = 0;
+            SA[--induction_bucket[T[p1 - 1]]] =
+                (p1 - 1) |
+                ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
+        }
     }
 
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        if (p > 0) {
+            SA[i] = 0;
+            SA[--induction_bucket[T[p - 1]]] =
+                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+        }
     }
 }
 
 #if defined(_OPENMP)
 
-static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -3262,344 +4515,573 @@ static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
-        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
+        sa_sint_t p0 = cache[i + 0].index = SA[i + 0];
+        sa_sint_t symbol0 = 0;
+        p0 &= SAINT_MAX;
+        if (p0 != 0) {
+            symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t p1 = cache[i + 1].index = SA[i + 1];
+        sa_sint_t symbol1 = 0;
+        p1 &= SAINT_MAX;
+        if (p1 != 0) {
+            symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        }
+        cache[i + 1].symbol = symbol1;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = cache[i].index = SA[i];
+        sa_sint_t symbol = 0;
+        p &= SAINT_MAX;
+        if (p != 0) {
+            symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
-    }
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        if (p0 > 0) {
+            SA[i + 0] = 0;
+            cache[i + 0].index = p0;
+            p0 &= ~SUFFIX_GROUP_MARKER;
+            symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        if (p1 > 0) {
+            SA[i + 1] = 0;
+            cache[i + 1].index = p1;
+            p1 &= ~SUFFIX_GROUP_MARKER;
+            symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        }
+        cache[i + 1].symbol = symbol1;
+    }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        if (p > 0) {
+            SA[i] = 0;
+            cache[i].index = p;
+            p &= ~SUFFIX_GROUP_MARKER;
+            symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        if (p0 > 0) {
+            SA[i + 0] = 0;
+            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1])
+                                             << (SAINT_BIT - 1));
+            symbol0 = T[p0 - 1];
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        if (p1 > 0) {
+            SA[i + 1] = 0;
+            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1])
+                                             << (SAINT_BIT - 1));
+            symbol1 = T[p1 - 1];
+        }
+        cache[i + 1].symbol = symbol1;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        if (p > 0) {
+            SA[i] = 0;
+            cache[i].index =
+                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+            symbol = T[p - 1];
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
         libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
         libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);
 
-        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
-        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
-
-        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
-        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
-        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
+        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index;
+        d += (p0 < 0);
+        cache[i - 0].symbol = --buckets[v0];
+        cache[i - 0].index =
+            (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+        buckets[2 + v0] = d;
+        if (cache[i - 0].symbol >= omp_block_start) {
+            sa_sint_t s = cache[i - 0].symbol,
+                      q = (cache[s].index = cache[i - 0].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
+        }
+
+        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index;
+        d += (p1 < 0);
+        cache[i - 1].symbol = --buckets[v1];
+        cache[i - 1].index =
+            (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+        buckets[2 + v1] = d;
+        if (cache[i - 1].symbol >= omp_block_start) {
+            sa_sint_t s = cache[i - 1].symbol,
+                      q = (cache[s].index = cache[i - 1].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
+        }
+    }
+
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t v = cache[i].symbol, p = cache[i].index;
+        d += (p < 0);
+        cache[i].symbol = --buckets[v];
+        cache[i].index =
+            (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        buckets[2 + v] = d;
+        if (cache[i].symbol >= omp_block_start) {
+            sa_sint_t s = cache[i].symbol,
+                      q = (cache[s].index = cache[i].index) & SAINT_MAX;
+            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
+        }
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
-    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); 
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0 >> 1];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        const sa_sint_t * Ds0 = &distinct_names[s0];
+        libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1 >> 1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+        const sa_sint_t * Ds1 = &distinct_names[s1];
+        libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
 
         sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0)
-        {
-            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
-            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        if (v0 >= 0) {
+            sa_sint_t p0 = cache[i - 0].index;
+            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
+            cache[i - 0].symbol = --induction_bucket[v0 >> 1];
+            cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
+                                 ((sa_sint_t)(distinct_names[v0] != d)
+                                  << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v0] = d;
+            if (cache[i - 0].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
+                if (np > 0) {
+                    cache[i - 0].index = 0;
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                }
+            }
         }
 
         sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0)
-        {
-            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
-            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        if (v1 >= 0) {
+            sa_sint_t p1 = cache[i - 1].index;
+            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+            cache[i - 1].symbol = --induction_bucket[v1 >> 1];
+            cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
+                                 ((sa_sint_t)(distinct_names[v1] != d)
+                                  << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v1] = d;
+            if (cache[i - 1].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
+                if (np > 0) {
+                    cache[i - 1].index = 0;
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                }
+            }
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
-            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
-            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
+        if (v >= 0) {
+            sa_sint_t p = cache[i].index;
+            d += (p >> (SUFFIX_GROUP_BIT - 1));
+            cache[i].symbol = --induction_bucket[v >> 1];
+            cache[i].index =
+                (p - 1) | (v << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            distinct_names[v] = d;
+            if (cache[i].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                if (np > 0) {
+                    cache[i].index = 0;
+                    cache[ni].index = np;
+                    np &= ~SUFFIX_GROUP_MARKER;
+                    cache[ni].symbol =
+                        BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+                }
+            }
         }
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
 
         sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0)
-        {
+        if (v0 >= 0) {
             cache[i - 0].symbol = --induction_bucket[v0];
-            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+            if (cache[i - 0].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
+                if (np > 0) {
+                    cache[i - 0].index = 0;
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                }
+            }
         }
 
         sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0)
-        {
+        if (v1 >= 0) {
             cache[i - 1].symbol = --induction_bucket[v1];
-            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
+            if (cache[i - 1].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
+                if (np > 0) {
+                    cache[i - 1].index = 0;
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                }
+            }
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
+        if (v >= 0) {
             cache[i].symbol = --induction_bucket[v];
-            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
+            if (cache[i].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                if (np > 0) {
+                    cache[i].index = 0;
+                    cache[ni].index =
+                        (np - 1) |
+                        ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np - 1];
+                }
+            }
         }
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_right_to_left_32s_6k(
+                T, SA, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
+                d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
+                    T, buckets, d, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_place_cached_suffixes(SA, cache - block_start,
+                                              omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            d = libsais_partial_sorting_scan_right_to_left_32s_4k(
+                T, SA, k, buckets, d, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
+                d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
+                    T, k, buckets, d, cache - block_start, block_start,
+                    block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_partial_sorting_scan_right_to_left_32s_1k(
+                T, SA, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
+                libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
+                    T, buckets, cache - block_start, block_start, block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    fast_sint_t scan_start    = (fast_sint_t)left_suffixes_count + 1;
-    fast_sint_t scan_end      = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
-    if (threads == 1 || (scan_end - scan_start) < 65536)
-    {
-        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
+    if (threads == 1 || (scan_end - scan_start) < 65536) {
+        d = libsais_partial_sorting_scan_right_to_left_32s_6k(
+            T, SA, buckets, d, scan_start, scan_end - scan_start);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
-        {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }
+        for (block_start = scan_end - 1; block_start >= scan_start;
+             block_start = block_end) {
+            block_end = block_start -
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end < scan_start) {
+                block_end = scan_start - 1;
+            }
 
-            d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+            d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
+                T, SA, buckets, d, thread_state[0].state.cache, block_end + 1,
+                block_start - block_end, threads);
         }
     }
 #else
@@ -3609,21 +5091,28 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || n < 65536)
-    {
-        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets,
+                                                              d, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
-        {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
+             block_start = block_end) {
+            block_end = block_start -
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end < 0) {
+                block_end = -1;
+            }
 
-            d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+            d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
+                T, SA, k, buckets, d, thread_state[0].state.cache,
+                block_end + 1, block_start - block_end, threads);
         }
     }
 #else
@@ -3633,21 +5122,27 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_
     return d;
 }
 
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || n < 65536)
-    {
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
         libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
-        {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
+             block_start = block_end) {
+            block_end = block_start -
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end < 0) {
+                block_end = -1;
+            }
 
-            libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+            libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
+                T, SA, buckets, thread_state[0].state.cache, block_end + 1,
+                block_start - block_end, threads);
         }
     }
 #else
@@ -3655,93 +5150,122 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_
 #endif
 }
 
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(
+    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
-    {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
+        l = omp_block_start;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0);
-        sa_sint_t s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0);
-        sa_sint_t s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0);
-        sa_sint_t s3 = SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0);
+        sa_sint_t s0 = SA[i + 0];
+        SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1];
+        SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2];
+        SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3];
+        SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s3 < 0);
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        sa_sint_t s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0);
+    for (j += 3; i < j; i += 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s < 0);
     }
 
     return l;
 }
 
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(
+    sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
-    {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
+        l = omp_block_start;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
-        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
-        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
-        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
+        sa_sint_t s0 = SA[i + 0];
+        SA[l] = s0 & SAINT_MAX;
+        l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1];
+        SA[l] = s1 & SAINT_MAX;
+        l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2];
+        SA[l] = s2 & SAINT_MAX;
+        l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3];
+        SA[l] = s3 & SAINT_MAX;
+        l += (s3 < 0);
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
+    for (j += 3; i < j; i += 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = s & SAINT_MAX;
+        l += (s < 0);
     }
 
     return l;
 }
 
-static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_partial_sorting_gather_lms_suffixes_32s_4k(
+                SA, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
                 thread_state[omp_thread_num].state.position = omp_block_start;
-                thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
+                thread_state[omp_thread_num].state.count =
+                    libsais_partial_sorting_gather_lms_suffixes_32s_4k(
+                        SA, omp_block_start, omp_block_size) -
+                    omp_block_start;
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 fast_sint_t t, position = 0;
-                for (t = 0; t < omp_num_threads; ++t)
-                { 
-                    if (t > 0 && thread_state[t].state.count > 0)
-                    {
-                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                for (t = 0; t < omp_num_threads; ++t) {
+                    if (t > 0 && thread_state[t].state.count > 0) {
+                        memmove(&SA[position],
+                                &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count *
+                                    sizeof(sa_sint_t));
                     }
 
                     position += thread_state[t].state.count;
@@ -3752,47 +5276,54 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * R
     }
 }
 
-static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_partial_sorting_gather_lms_suffixes_32s_1k(
+                SA, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
                 thread_state[omp_thread_num].state.position = omp_block_start;
-                thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start;
+                thread_state[omp_thread_num].state.count =
+                    libsais_partial_sorting_gather_lms_suffixes_32s_1k(
+                        SA, omp_block_start, omp_block_size) -
+                    omp_block_start;
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 fast_sint_t t, position = 0;
-                for (t = 0; t < omp_num_threads; ++t)
-                { 
-                    if (t > 0 && thread_state[t].state.count > 0)
-                    {
-                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                for (t = 0; t < omp_num_threads; ++t) {
+                    if (t > 0 && thread_state[t].state.count > 0) {
+                        memmove(&SA[position],
+                                &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count *
+                                    sizeof(sa_sint_t));
                     }
 
                     position += thread_state[t].state.count;
@@ -3803,103 +5334,158 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * R
     }
 }
 
-static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+static void libsais_induce_partial_order_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
+    sa_sint_t left_suffixes_count, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(&buckets[2 * ALPHABET_SIZE], 0,
+           2 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
-    libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_8u_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
+        thread_state);
+}
+
+static void libsais_induce_partial_order_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
+    sa_sint_t left_suffixes_count, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
     libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
+        thread_state);
 }
 
-static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_induce_partial_order_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
+        T, SA, n, k, buckets, 0, threads, thread_state);
     libsais_partial_sorting_shift_markers_32s_4k(SA, n);
-    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
+        T, SA, n, k, buckets, d, threads, thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads,
+                                                           thread_state);
+}
+
+static void libsais_induce_partial_order_32s_2k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
+        T, SA, n, &buckets[1 * k], threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
+        T, SA, n, &buckets[0 * k], threads, thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
+                                                           thread_state);
+}
+
+static void libsais_induce_partial_order_32s_1k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
+        T, SA, n, buckets, threads, thread_state);
 
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_end_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
+        T, SA, n, buckets, threads, thread_state);
 
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
+                                                           thread_state);
 }
 
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t m, sa_sint_t name,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
-        sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
-        sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
-        sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
-        sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+        sa_sint_t p0 = SA[i + 0];
+        SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p0 < 0;
+        sa_sint_t p1 = SA[i + 1];
+        SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p1 < 0;
+        sa_sint_t p2 = SA[i + 2];
+        SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p2 < 0;
+        sa_sint_t p3 = SA[i + 3];
+        SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p3 < 0;
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p < 0;
     }
 
     return name;
 }
 
-static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static fast_sint_t libsais_gather_marked_suffixes_8u(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     l -= 1;
 
     fast_sint_t i, j;
-    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
-    {
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
+        j = (fast_sint_t)m + omp_block_start + 3;
+         i >= j; i -= 4) {
         libsais_prefetch(&SA[i - prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
-        sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
-        sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
-        sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
+        sa_sint_t s0 = SA[i - 0];
+        SA[l] = s0 & SAINT_MAX;
+        l -= s0 < 0;
+        sa_sint_t s1 = SA[i - 1];
+        SA[l] = s1 & SAINT_MAX;
+        l -= s1 < 0;
+        sa_sint_t s2 = SA[i - 2];
+        SA[l] = s2 & SAINT_MAX;
+        l -= s2 < 0;
+        sa_sint_t s3 = SA[i - 3];
+        SA[l] = s3 & SAINT_MAX;
+        l -= s3 < 0;
     }
 
-    for (j -= 3; i >= j; i -= 1)
-    {
-        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
+    for (j -= 3; i >= j; i -= 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = s & SAINT_MAX;
+        l -= s < 0;
     }
 
     l += 1;
@@ -3907,49 +5493,59 @@ static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa
     return l;
 }
 
-static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t name = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : m - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start,
+                                                    omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_negative_marked_suffixes(SA, omp_block_start,
+                                                           omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+                fast_sint_t t, count = 0;
+                for (t = 0; t < omp_thread_num; ++t) {
+                    count += thread_state[t].state.count;
+                }
 
-                if (omp_thread_num == omp_num_threads - 1)
-                {
-                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                if (omp_thread_num == omp_num_threads - 1) {
+                    name =
+                        (sa_sint_t)(count +
+                                    thread_state[omp_thread_num].state.count);
                 }
 
-                libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+                libsais_renumber_lms_suffixes_8u(
+                    SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
             }
         }
 #endif
@@ -3958,57 +5554,73 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s
     return name;
 }
 
-static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_gather_marked_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
-
-        if (omp_num_threads == 1)
-        {
-            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
-        }
-#if defined(_OPENMP)
-        else
-        {
-            {
-                if (omp_thread_num < omp_num_threads - 1)
-                {
-                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
-                    thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
-                }
-                else
-                {
-                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
-                    thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
+        UNUSED(threads);
+        UNUSED(thread_state);
+
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+#endif
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1
+                ? omp_block_stride
+                : ((fast_sint_t)n >> 1) - omp_block_start;
+
+        if (omp_num_threads == 1) {
+            libsais_gather_marked_suffixes_8u(SA, m,
+                                              (fast_sint_t)n + (fast_sint_t)fs,
+                                              omp_block_start, omp_block_size);
+        }
+#if defined(_OPENMP)
+        else {
+            {
+                if (omp_thread_num < omp_num_threads - 1) {
+                    thread_state[omp_thread_num].state.position =
+                        libsais_gather_marked_suffixes_8u(
+                            SA, m,
+                            (fast_sint_t)m + omp_block_start + omp_block_size,
+                            omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.count =
+                        (fast_sint_t)m + omp_block_start + omp_block_size -
+                        thread_state[omp_thread_num].state.position;
+                } else {
+                    thread_state[omp_thread_num].state.position =
+                        libsais_gather_marked_suffixes_8u(
+                            SA, m, (fast_sint_t)n + (fast_sint_t)fs,
+                            omp_block_start, omp_block_size);
+                    thread_state[omp_thread_num].state.count =
+                        (fast_sint_t)n + (fast_sint_t)fs -
+                        thread_state[omp_thread_num].state.position;
                 }
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;
-                    
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                { 
+
+                for (t = omp_num_threads - 1; t >= 0; --t) {
                     position -= thread_state[t].state.count;
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
-                    {
-                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
+                    if (t != omp_num_threads - 1 &&
+                        thread_state[t].state.count > 0) {
+                        memmove(&SA[position],
+                                &SA[thread_state[t].state.position],
+                                (size_t)thread_state[t].state.count *
+                                    sizeof(sa_sint_t));
                     }
                 }
             }
@@ -4017,83 +5629,119 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s
     }
 }
 
-static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
-    if (name < m)
-    {
-        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
-    }
-    else
-    {
-        fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; }
+    sa_sint_t name =
+        libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+    if (name < m) {
+        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads,
+                                                  thread_state);
+    } else {
+        fast_sint_t i;
+        for (i = 0; i < m; i += 1) {
+            SA[i] &= SAINT_MAX;
+        }
     }
 
     return name;
 }
 
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
-    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    fast_sint_t i, j;
+    sa_sint_t p0, p1, p2, p3 = 0;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
-        p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
-        p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
-        p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
-        p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
-        p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(
+            &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+        p0 = SA[i + 0];
+        SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
+        name += p0 < 0;
+        p1 = SA[i + 1];
+        SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN);
+        name += p1 < 0;
+        p2 = SA[i + 2];
+        SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN);
+        name += p2 < 0;
+        p3 = SA[i + 3];
+        SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+        name += p3 < 0;
+    }
+
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        p2 = p3;
+        p3 = SA[i];
+        SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+        name += p3 < 0;
     }
 
     return name;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
+                                                   sa_sint_t m,
+                                                   fast_sint_t omp_block_start,
+                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
-    {
+    fast_sint_t i, j;
+    sa_sint_t p0, p1, p2, p3 = 0;
+    for (i = (fast_sint_t)m + omp_block_start,
+        j = (fast_sint_t)m + omp_block_start + omp_block_size - 3;
+         i < j; i += 4) {
         libsais_prefetchw(&SA[i + prefetch_distance]);
 
-        p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
-        p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
-        p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
-        p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+        p0 = SA[i + 0];
+        SA[i + 0] = p0 & (p3 | SAINT_MAX);
+        p0 = (p0 == 0) ? p3 : p0;
+        p1 = SA[i + 1];
+        SA[i + 1] = p1 & (p0 | SAINT_MAX);
+        p1 = (p1 == 0) ? p0 : p1;
+        p2 = SA[i + 2];
+        SA[i + 2] = p2 & (p1 | SAINT_MAX);
+        p2 = (p2 == 0) ? p1 : p2;
+        p3 = SA[i + 3];
+        SA[i + 3] = p3 & (p2 | SAINT_MAX);
+        p3 = (p3 == 0) ? p2 : p3;
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
+    for (j += 3; i < j; i += 1) {
+        p2 = p3;
+        p3 = SA[i];
+        SA[i] = p3 & (p2 | SAINT_MAX);
+        p3 = (p3 == 0) ? p2 : p3;
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t m,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
+         i += 4) {
         libsais_prefetchw(&SAm[i + prefetch_distance]);
 
         SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
@@ -4102,55 +5750,64 @@ static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_si
         SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
     }
 
-    for (j += 3; i < j; i += 1)
-    {
+    for (j += 3; i < j; i += 1) {
         SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
     }
 }
 
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t name = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : m - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            name = libsais_renumber_distinct_lms_suffixes_32s_4k(
+                SA, m, 1, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_negative_marked_suffixes(SA, omp_block_start,
+                                                           omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+                fast_sint_t t, count = 1;
+                for (t = 0; t < omp_thread_num; ++t) {
+                    count += thread_state[t].state.count;
+                }
 
-                if (omp_thread_num == omp_num_threads - 1)
-                {
-                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                if (omp_thread_num == omp_num_threads - 1) {
+                    name =
+                        (sa_sint_t)(count +
+                                    thread_state[omp_thread_num].state.count);
                 }
 
-                libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+                libsais_renumber_distinct_lms_suffixes_32s_4k(
+                    SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
             }
         }
 #endif
@@ -4159,65 +5816,79 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * R
     return name - 1;
 }
 
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
-{
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t n, sa_sint_t m,
+                                                       sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1
+                ? omp_block_stride
+                : ((fast_sint_t)n >> 1) - omp_block_start;
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = (fast_sint_t)n >> 1;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
 #endif
-        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
+        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start,
+                                               omp_block_size);
     }
 }
 
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
-{
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA,
+                                                      sa_sint_t n, sa_sint_t m,
+                                                      sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1
+                ? omp_block_stride
+                : ((fast_sint_t)n >> 1) - omp_block_start;
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = (fast_sint_t)n >> 1;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
 #endif
-        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
+        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start,
+                                              omp_block_size);
     }
 }
 
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
-    if (name < m)
-    {
+    sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
+        SA, m, threads, thread_state);
+    if (name < m) {
         libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
     }
 
     return name;
 }
 
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
-{
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
@@ -4225,85 +5896,131 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_s
     {
         libsais_gather_lms_suffixes_32s(T, SA, n);
 
-        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+        memset(&SA[m], 0,
+               ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
 
         fast_sint_t i, j;
-        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4)
-        {
+        for (i = (fast_sint_t)n - (fast_sint_t)m,
+            j = (fast_sint_t)n - 1 - prefetch_distance - 3;
+             i < j; i += 4) {
             libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
-
-            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
-        }
-
-        for (j += prefetch_distance + 3; i < j; i += 1)
-        {
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+            SAm[((sa_uint_t)SA[i + 0]) >> 1] =
+                SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 1]) >> 1] =
+                SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 2]) >> 1] =
+                SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 3]) >> 1] =
+                SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+        }
+
+        for (j += prefetch_distance + 3; i < j; i += 1) {
             SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
         }
 
         SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
     }
 
-    {
-        libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
-    }
+    { libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); }
 
     sa_sint_t name = 1;
 
     {
-        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
-        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
-        {
+        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1];
+        sa_sint_t pdiff = SAINT_MIN;
+        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
             libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-            
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
 
-            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
-            if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; }
-            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
-
-            p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
-            if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
-            SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+            libsais_prefetchw(
+                &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
+
+            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
+            sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[p + l] != T[q + l]) {
+                        break;
+                    }
+                } while (++l < qlen);
+                qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
+            }
+            SAm[p >> 1] = name | (pdiff & qdiff);
+            name += (qdiff < 0);
+
+            p = SA[i + 1];
+            plen = SAm[p >> 1];
+            pdiff = SAINT_MIN;
+            if (qlen == plen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[q + l] != T[p + l]) {
+                        break;
+                    }
+                } while (++l < plen);
+                pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
+            }
+            SAm[q >> 1] = name | (qdiff & pdiff);
+            name += (pdiff < 0);
         }
 
-        for (j += prefetch_distance + 1; i < j; i += 1)
-        {
-            fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
-            if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; }
-            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);
+        for (j += prefetch_distance + 1; i < j; i += 1) {
+            fast_sint_t q = SA[i], qlen = SAm[q >> 1];
+            sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[p + l] != T[q + l]) {
+                        break;
+                    }
+                } while (++l < plen);
+                qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
+            }
+            SAm[p >> 1] = name | (pdiff & qdiff);
+            name += (qdiff < 0);
 
-            p = q; plen = qlen; pdiff = qdiff;
+            p = q;
+            plen = qlen;
+            pdiff = qdiff;
         }
 
-        SAm[p >> 1] = name | pdiff; name++;
+        SAm[p >> 1] = name | pdiff;
+        name++;
     }
 
-    if (name <= m)
-    {
+    if (name <= m) {
         libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
     }
 
     return name - 1;
 }
 
-static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
+                                             sa_sint_t n, sa_sint_t m,
+                                             fast_sint_t omp_block_start,
+                                             fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     const sa_sint_t * RESTRICT SAnm = &SA[n - m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
@@ -4317,100 +6034,105 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t
         SA[i + 3] = SAnm[SA[i + 3]];
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         SA[i] = SAnm[SA[i]];
     }
 }
 
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
-{
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
+                                                 sa_sint_t n, sa_sint_t m,
+                                                 sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : m - omp_block_start;
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = m;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = m;
 #endif
 
-        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
+        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start,
+                                         omp_block_size);
     }
 }
 
-static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_interval_8u(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
     fast_sint_t c, j = n;
-    for (c = ALPHABET_SIZE - 2; c >= 0; --c)
-    {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0)
-        {
+    for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
+        fast_sint_t l =
+            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0) {
             fast_sint_t i = bucket_end[c];
-            if (j - i > 0)
-            {
+            if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                    (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_interval_32s_4k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c)
-    {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0)
-        {
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
+        fast_sint_t l =
+            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+            (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0) {
             fast_sint_t i = bucket_end[c];
-            if (j - i > 0)
-            {
+            if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                    (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_interval_32s_2k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
-    if (k > 1)
-    {
+    if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
-        {
-            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
-            if (l > 0)
-            {
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
+             c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
+                            (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+            if (l > 0) {
                 fast_sint_t i = buckets[c];
-                if (j - i > 0)
-                {
+                if (j - i > 0) {
                     memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                 }
 
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                        (size_t)l * sizeof(sa_sint_t));
             }
         }
     }
@@ -4418,13 +6140,14 @@ static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_interval_32s_1k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t m, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c];
-    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4)
-    {
+    sa_sint_t c = k - 1;
+    fast_sint_t i, l = buckets[c];
+    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) {
         libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
         libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
@@ -4432,85 +6155,116 @@ static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRIC
         libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
         libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
 
-        sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0;
-        sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1;
-        sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2;
-        sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3;
-    }
-
-    for (; i >= 0; i -= 1)
-    {
-        sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p;
+        sa_sint_t p0 = SA[i - 0];
+        if (T[p0] != c) {
+            c = T[p0];
+            memset(&SA[buckets[c]], 0,
+                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p0;
+        sa_sint_t p1 = SA[i - 1];
+        if (T[p1] != c) {
+            c = T[p1];
+            memset(&SA[buckets[c]], 0,
+                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p1;
+        sa_sint_t p2 = SA[i - 2];
+        if (T[p2] != c) {
+            c = T[p2];
+            memset(&SA[buckets[c]], 0,
+                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p2;
+        sa_sint_t p3 = SA[i - 3];
+        if (T[p3] != c) {
+            c = T[p3];
+            memset(&SA[buckets[c]], 0,
+                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p3;
+    }
+
+    for (; i >= 0; i -= 1) {
+        sa_sint_t p = SA[i];
+        if (T[p] != c) {
+            c = T[p];
+            memset(&SA[buckets[c]], 0,
+                   (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p;
     }
 
     memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_histogram_32s_6k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
     fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c)
-    {
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
         fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
-        if (l > 0)
-        {
+        if (l > 0) {
             fast_sint_t i = bucket_end[c];
-            if (j - i > 0)
-            {
+            if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                    (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_histogram_32s_4k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
     fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c)
-    {
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
         fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0)
-        {
+        if (l > 0) {
             fast_sint_t i = bucket_end[c];
-            if (j - i > 0)
-            {
+            if (j - i > 0) {
                 memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                    (size_t)l * sizeof(sa_sint_t));
         }
     }
 
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
-{
+static void libsais_place_lms_suffixes_histogram_32s_2k(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+    const sa_sint_t * RESTRICT buckets) {
     fast_sint_t j = n;
 
-    if (k > 1)
-    {
+    if (k > 1) {
         fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
-        {
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
+             c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
             fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
-            if (l > 0)
-            {
+            if (l > 0) {
                 fast_sint_t i = buckets[c];
-                if (j - i > 0)
-                {
+                if (j - i > 0) {
                     memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                 }
 
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
+                        (size_t)l * sizeof(sa_sint_t));
             }
         }
     }
@@ -4518,157 +6272,353 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA,
     memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_bwt_scan_left_to_right_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[i + 0] = T[p0] | SAINT_MIN;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[i + 1] = T[p1] | SAINT_MIN;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[i] = T[p] | SAINT_MIN;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+        }
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_bwt_aux_scan_left_to_right_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[i + 0] = T[p0] | SAINT_MIN;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            if ((p0 & rm) == 0) {
+                I[p0 / (rm + 1)] = induction_bucket[T[p0]];
+            }
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[i + 1] = T[p1] | SAINT_MIN;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            if ((p1 & rm) == 0) {
+                I[p1 / (rm + 1)] = induction_bucket[T[p1]];
+            }
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[i] = T[p] | SAINT_MIN;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            if ((p & rm) == 0) {
+                I[p / (rm + 1)] = induction_bucket[T[p]];
+            }
+        }
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_left_to_right_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
+        if (p0 > 0) {
+            p0--;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
+        if (p1 > 0) {
+            p1--;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p ^ SAINT_MIN;
+        if (p > 0) {
+            p--;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+        }
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_left_to_right_32s(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
-        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
+        }
 
-        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
+        if (p0 > 0) {
+            p0--;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
+        if (p1 > 0) {
+            p1--;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+        }
     }
 
-    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p ^ SAINT_MIN;
+        if (p > 0) {
+            p--;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+        }
     }
 }
 
 #if defined(_OPENMP)
 
-static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-   const fast_sint_t prefetch_distance = 32;
-
-   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-   fast_sint_t i, j, count = 0;
-   for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-   {
-       libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-       sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-       sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
-       sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
-       sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
-   }
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-   for (j += prefetch_distance + 1; i < j; i += 1)
-   {
-       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
-   }
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[i + 0] = T[p0] | SAINT_MIN;
+            buckets[cache[count].symbol = T[p0]]++;
+            cache[count++].index =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[i + 1] = T[p1] | SAINT_MIN;
+            buckets[cache[count].symbol = T[p1]]++;
+            cache[count++].index =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[i] = T[p] | SAINT_MIN;
+            buckets[cache[count].symbol = T[p]]++;
+            cache[count++].index =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+        }
+    }
 
-   return count;
+    return count;
 }
 
-static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-   const fast_sint_t prefetch_distance = 32;
-
-   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-   fast_sint_t i, j, count = 0;
-   for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-   {
-       libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-       sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-       sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
-       sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
-       sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
-   }
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-   for (j += prefetch_distance + 1; i < j; i += 1)
-   {
-       sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
-   }
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
+        if (p0 > 0) {
+            p0--;
+            buckets[cache[count].symbol = T[p0]]++;
+            cache[count++].index =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
+        if (p1 > 0) {
+            p1--;
+            buckets[cache[count].symbol = T[p1]]++;
+            cache[count++].index =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+        }
+    }
+
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p ^ SAINT_MIN;
+        if (p > 0) {
+            p--;
+            buckets[cache[count].symbol = T[p]]++;
+            cache[count++].index =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+        }
+    }
 
-   return count;
+    return count;
 }
 
-static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
-{
+static void libsais_final_order_scan_left_to_right_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4)
-    {
+    for (i = 0, j = count - 3; i < j; i += 4) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
         SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
@@ -4677,342 +6627,508 @@ static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RE
         SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
     }
 
-    for (j += 3; i < j; i += 1)
-    {
+    for (j += 3; i < j; i += 1) {
         SA[buckets[cache[i].symbol]++] = cache[i].index;
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
-{
+static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4)
-    {
+    for (i = 0, j = count - 3; i < j; i += 4) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
-        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
-        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; }
-        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; }
-        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; }
+        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
+        if ((cache[i + 0].index & rm) == 0) {
+            I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i + 0].symbol];
+        }
+        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
+        if ((cache[i + 1].index & rm) == 0) {
+            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i + 1].symbol];
+        }
+        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
+        if ((cache[i + 2].index & rm) == 0) {
+            I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i + 2].symbol];
+        }
+        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
+        if ((cache[i + 3].index & rm) == 0) {
+            I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i + 3].symbol];
+        }
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; }
+    for (j += 3; i < j; i += 1) {
+        SA[buckets[cache[i].symbol]++] = cache[i].index;
+        if ((cache[i].index & rm) == 0) {
+            I[(cache[i].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i].symbol];
+        }
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
+        if (p0 > 0) {
+            p0--;
+            cache[i + 0].index =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+            symbol0 = T[p0];
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
+        if (p1 > 0) {
+            p1--;
+            cache[i + 1].index =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+            symbol1 = T[p1];
+        }
+        cache[i + 1].symbol = symbol1;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        SA[i] = p ^ SAINT_MIN;
+        if (p > 0) {
+            p--;
+            cache[i].index =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+            symbol = T[p];
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
+         i += 2) {
         libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-        
+        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+
         sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0)
-        {
+        if (v0 >= 0) {
             cache[i + 0].symbol = induction_bucket[v0]++;
-            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i + 0].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
+                cache[i + 0].index = np ^ SAINT_MIN;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
 
         sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0)
-        {
+        if (v1 >= 0) {
             cache[i + 1].symbol = induction_bucket[v1]++;
-            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i + 1].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
+                cache[i + 1].index = np ^ SAINT_MIN;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
+        if (v >= 0) {
             cache[i].symbol = induction_bucket[v]++;
-            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i].symbol < omp_block_end) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                cache[i].index = np ^ SAINT_MIN;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
     }
 }
 
-static void libsais_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_bwt_scan_left_to_right_8u(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_bwt_scan_left_to_right_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                for (t = 0; t < omp_num_threads; ++t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A + B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_order_scan_left_to_right_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_bwt_aux_scan_left_to_right_8u(
+                T, SA, rm, I, induction_bucket, omp_block_start,
+                omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_bwt_scan_left_to_right_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                for (t = 0; t < omp_num_threads; ++t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A + B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
+                    SA, rm, I, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_sorting_scan_left_to_right_8u(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_sorting_scan_left_to_right_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
+                for (t = 0; t < omp_num_threads; ++t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A + B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_order_scan_left_to_right_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_sorting_scan_left_to_right_32s(
+                T, SA, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_final_sorting_scan_left_to_right_32s_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+                libsais_final_sorting_scan_left_to_right_32s_block_sort(
+                    T, buckets, cache - block_start, block_start, block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+static void libsais_final_bwt_scan_left_to_right_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
+         << (SAINT_BIT - 1));
 
-    if (threads == 1 || n < 65536)
-    {
+    if (threads == 1 || n < 65536) {
         libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = 0; block_start < n; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = 0; block_start < n;) {
+            if (SA[block_start] == 0) {
                 block_start++;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
-                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
-                fast_sint_t block_size    = block_end - block_start;
-
-                if (block_size < 32)
-                {
-                    for (; block_start < block_end; block_start += 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start +
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end > n) {
+                    block_max_end = n;
                 }
-                else
-                {
-                    libsais_final_bwt_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start + 1;
+                while (block_end < block_max_end && SA[block_end] != 0) {
+                    block_end++;
+                }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32) {
+                    for (; block_start < block_end; block_start += 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p & SAINT_MAX;
+                        if (p > 0) {
+                            p--;
+                            SA[block_start] = T[p] | SAINT_MIN;
+                            SA[induction_bucket[T[p]]++] =
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
+                                     << (SAINT_BIT - 1));
+                        }
+                    }
+                } else {
+                    libsais_final_bwt_scan_left_to_right_8u_block_omp(
+                        T, SA, induction_bucket, block_start, block_size,
+                        threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5023,42 +7139,63 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT
 #endif
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+    sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
+         << (SAINT_BIT - 1));
 
-    if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; }
+    if ((((sa_sint_t)n - 1) & rm) == 0) {
+        I[((sa_sint_t)n - 1) / (rm + 1)] =
+            induction_bucket[T[(sa_sint_t)n - 1]];
+    }
 
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
+    if (threads == 1 || n < 65536) {
+        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I,
+                                                    induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = 0; block_start < n; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = 0; block_start < n;) {
+            if (SA[block_start] == 0) {
                 block_start++;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
-                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
-                fast_sint_t block_size    = block_end - block_start;
-
-                if (block_size < 32)
-                {
-                    for (; block_start < block_end; block_start += 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start +
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end > n) {
+                    block_max_end = n;
                 }
-                else
-                {
-                    libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, rm, I, induction_bucket, block_start, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start + 1;
+                while (block_end < block_max_end && SA[block_end] != 0) {
+                    block_end++;
+                }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32) {
+                    for (; block_start < block_end; block_start += 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p & SAINT_MAX;
+                        if (p > 0) {
+                            p--;
+                            SA[block_start] = T[p] | SAINT_MIN;
+                            SA[induction_bucket[T[p]]++] =
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
+                                     << (SAINT_BIT - 1));
+                            if ((p & rm) == 0) {
+                                I[p / (rm + 1)] = induction_bucket[T[p]];
+                            }
+                        }
+                    }
+                } else {
+                    libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
+                        T, SA, rm, I, induction_bucket, block_start, block_size,
+                        threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5069,40 +7206,54 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * REST
 #endif
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+static void libsais_final_sorting_scan_left_to_right_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
+         << (SAINT_BIT - 1));
 
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0,
+                                                    n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = 0; block_start < n; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = 0; block_start < n;) {
+            if (SA[block_start] == 0) {
                 block_start++;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
-                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
-                fast_sint_t block_size    = block_end - block_start;
-
-                if (block_size < 32)
-                {
-                    for (; block_start < block_end; block_start += 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start +
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end > n) {
+                    block_max_end = n;
                 }
-                else
-                {
-                    libsais_final_sorting_scan_left_to_right_8u_block_omp(T, SA, induction_bucket, block_start, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start + 1;
+                while (block_end < block_max_end && SA[block_end] != 0) {
+                    block_end++;
+                }
+                fast_sint_t block_size = block_end - block_start;
+
+                if (block_size < 32) {
+                    for (; block_start < block_end; block_start += 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p ^ SAINT_MIN;
+                        if (p > 0) {
+                            p--;
+                            SA[induction_bucket[T[p]]++] =
+                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
+                                     << (SAINT_BIT - 1));
+                        }
+                    }
+                } else {
+                    libsais_final_sorting_scan_left_to_right_8u_block_omp(
+                        T, SA, induction_bucket, block_start, block_size,
+                        threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5113,23 +7264,30 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * REST
 #endif
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+static void libsais_final_sorting_scan_left_to_right_32s_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
 
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0,
+                                                     n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end)
-        {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }
+        for (block_start = 0; block_start < n; block_start = block_end) {
+            block_end = block_start +
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end > n) {
+                block_end = n;
+            }
 
-            libsais_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
+            libsais_final_sorting_scan_left_to_right_32s_block_omp(
+                T, SA, induction_bucket, thread_state[0].state.cache,
+                block_start, block_end - block_start, threads);
         }
     }
 #else
@@ -5137,193 +7295,439 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * R
 #endif
 }
 
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j; sa_sint_t index = -1;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    fast_sint_t i, j;
+    sa_sint_t index = -1;
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
-        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }
+        sa_sint_t p0 = SA[i - 0];
+        index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0];
+            SA[i - 0] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+        }
 
-        sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
-        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; }
+        sa_sint_t p1 = SA[i - 1];
+        index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+            SA[i - 1] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+        }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index;
-        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        index = (p == 0) ? (sa_sint_t)i : index;
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+            SA[i] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+        }
     }
 
     return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_bwt_aux_scan_right_to_left_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         sa_sint_t p0 = SA[i - 0];
-        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } }
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0];
+            SA[i - 0] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+            if ((p0 & rm) == 0) {
+                I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1;
+            }
+        }
 
         sa_sint_t p1 = SA[i - 1];
-        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } }
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+            SA[i - 1] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+            if ((p1 & rm) == 0) {
+                I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1;
+            }
+        }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+            SA[i] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+            if ((p & rm) == 0) {
+                I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
+            }
+        }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_right_to_left_8u(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[--induction_bucket[T[p0]]] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[--induction_bucket[T[p1]]] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+        }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[--induction_bucket[T[p]]] =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+        }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_right_to_left_32s(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); }
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
+        }
 
-        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
-        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[--induction_bucket[T[p0]]] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[--induction_bucket[T[p1]]] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+        }
     }
 
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[--induction_bucket[T[p]]] =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+        }
     }
 }
 
 #if defined(_OPENMP)
 
-static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-   const fast_sint_t prefetch_distance = 32;
+static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
-   fast_sint_t i, j, count = 0;
-   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-   {
-       libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
-       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
-   }
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0];
+            SA[i - 0] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count++].index = (c0 <= c1) ? p0 : t;
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+            SA[i - 1] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count++].index = (c0 <= c1) ? p1 : t;
+        }
+    }
 
-   for (j -= prefetch_distance + 1; i >= j; i -= 1)
-   {
-       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
-   }
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+            SA[i] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count++].index = (c0 <= c1) ? p : t;
+        }
+    }
 
-   return count;
+    return count;
 }
 
-static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-   const fast_sint_t prefetch_distance = 32;
+static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
-   fast_sint_t i, j, count = 0;
-   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-   {
-       libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
-       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
-   }
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0];
+            SA[i - 0] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count].index = (c0 <= c1) ? p0 : t;
+            cache[count + 1].index = p0;
+            count += 2;
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+            SA[i - 1] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count].index = (c0 <= c1) ? p1 : t;
+            cache[count + 1].index = p1;
+            count += 2;
+        }
+    }
 
-   for (j -= prefetch_distance + 1; i >= j; i -= 1)
-   {
-       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
-   }
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+            SA[i] = c1;
+            sa_sint_t t = c0 | SAINT_MIN;
+            buckets[cache[count].symbol = c1]++;
+            cache[count].index = (c0 <= c1) ? p : t;
+            cache[count + 1].index = p;
+            count += 2;
+        }
+    }
 
-   return count;
+    return count;
 }
 
-static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
-   const fast_sint_t prefetch_distance = 32;
+static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-   memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
+    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
 
-   fast_sint_t i, j, count = 0;
-   for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-   {
-       libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    fast_sint_t i, j, count = 0;
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
 
-       sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-       sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        const uint8_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        const uint8_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-       sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
-       sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
-   }
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            buckets[cache[count].symbol = T[p0]]++;
+            cache[count++].index =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            buckets[cache[count].symbol = T[p1]]++;
+            cache[count++].index =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+        }
+    }
 
-   for (j -= prefetch_distance + 1; i >= j; i -= 1)
-   {
-       sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
-   }
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            buckets[cache[count].symbol = T[p]]++;
+            cache[count++].index =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+        }
+    }
 
-   return count;
+    return count;
 }
 
-static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
-{
+static void libsais_final_order_scan_right_to_left_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4)
-    {
+    for (i = 0, j = count - 3; i < j; i += 4) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
         SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
@@ -5332,342 +7736,503 @@ static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RE
         SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
     }
 
-    for (j += 3; i < j; i += 1)
-    {
+    for (j += 3; i < j; i += 1) {
         SA[--buckets[cache[i].symbol]] = cache[i].index;
     }
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
-{
+static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
+    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t count) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = 0, j = count - 6; i < j; i += 8)
-    {
+    for (i = 0, j = count - 6; i < j; i += 8) {
         libsais_prefetch(&cache[i + prefetch_distance]);
 
-        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
-        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
-        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
-        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
+        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
+        if ((cache[i + 1].index & rm) == 0) {
+            I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1;
+        }
+        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
+        if ((cache[i + 3].index & rm) == 0) {
+            I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1;
+        }
+        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index;
+        if ((cache[i + 5].index & rm) == 0) {
+            I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1;
+        }
+        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index;
+        if ((cache[i + 7].index & rm) == 0) {
+            I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1;
+        }
     }
 
-    for (j += 6; i < j; i += 2)
-    {
-        SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
+    for (j += 6; i < j; i += 2) {
+        SA[--buckets[cache[i].symbol]] = cache[i].index;
+        if ((cache[i + 1].index & rm) == 0) {
+            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
+                buckets[cache[i].symbol] + 1;
+        }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 1;
+         i < j; i += 2) {
         libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         libsais_prefetchw(&cache[i + prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
+        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            cache[i + 0].index =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            symbol0 = T[p0];
+        }
+        cache[i + 0].symbol = symbol0;
+        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            cache[i + 1].index =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            symbol1 = T[p1];
+        }
+        cache[i + 1].symbol = symbol1;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1)
-    {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
+    for (j += prefetch_distance + 1; i < j; i += 1) {
+        sa_sint_t symbol = SAINT_MIN, p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            cache[i].index =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            symbol = T[p];
+        }
+        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
+    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
-    {
+    for (i = omp_block_start + omp_block_size - 1,
+        j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
         libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
 
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
+        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
+        const sa_sint_t * Is0 = &induction_bucket[s0];
+        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
+        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
+        const sa_sint_t * Is1 = &induction_bucket[s1];
+        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
 
         sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0)
-        {
+        if (v0 >= 0) {
             cache[i - 0].symbol = --induction_bucket[v0];
-            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i - 0].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
+                cache[i - 0].index = np & SAINT_MAX;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
 
         sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0)
-        {
+        if (v1 >= 0) {
             cache[i - 1].symbol = --induction_bucket[v1];
-            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i - 1].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
+                cache[i - 1].index = np & SAINT_MAX;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1)
-    {
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t v = cache[i].symbol;
-        if (v >= 0)
-        {
+        if (v >= 0) {
             cache[i].symbol = --induction_bucket[v];
-            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
+            if (cache[i].symbol >= omp_block_start) {
+                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
+                cache[i].index = np & SAINT_MAX;
+                if (np > 0) {
+                    np--;
+                    cache[ni].index =
+                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
+                              << (SAINT_BIT - 1));
+                    cache[ni].symbol = T[np];
+                }
+            }
         }
     }
 }
 
-static void libsais_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_bwt_scan_right_to_left_8u(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_bwt_scan_right_to_left_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                for (t = omp_num_threads - 1; t >= 0; --t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A - B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_order_scan_right_to_left_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_bwt_aux_scan_right_to_left_8u(
+                T, SA, rm, I, induction_bucket, omp_block_start,
+                omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                for (t = omp_num_threads - 1; t >= 0; --t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A - B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
+                    SA, rm, I, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * ALPHABET_SIZE && omp_get_dynamic() == 0)
-#endif
+static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
+    fast_sint_t block_size, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(                                   \
+            threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
+                         omp_get_dynamic() == 0)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(thread_state);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_sorting_scan_right_to_left_8u(
+                T, SA, induction_bucket, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_final_sorting_scan_right_to_left_8u_block_prepare(
+                        T, SA, thread_state[omp_thread_num].state.buckets,
+                        thread_state[omp_thread_num].state.cache,
+                        omp_block_start, omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
                 fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t)
-                {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
+                for (t = omp_num_threads - 1; t >= 0; --t) {
+                    sa_sint_t * RESTRICT temp_bucket =
+                        thread_state[t].state.buckets;
+                    fast_sint_t c;
+                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
+                        induction_bucket[c] = A - B;
+                        temp_bucket[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
+                libsais_final_order_scan_right_to_left_8u_block_place(
+                    SA, thread_state[omp_thread_num].state.buckets,
+                    thread_state[omp_thread_num].state.cache,
+                    thread_state[omp_thread_num].state.count);
             }
         }
-#endif
+    #endif
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
-#endif
+static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+    sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                      block_size >= 16384)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-#else
-        UNUSED(threads); UNUSED(cache);
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+    #else
+        UNUSED(threads);
+        UNUSED(cache);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
-#endif
-        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
+    #endif
+        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : block_size - omp_block_start;
 
         omp_block_start += block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_final_sorting_scan_right_to_left_32s(
+                T, SA, buckets, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else
-        {
+    #if defined(_OPENMP)
+        else {
             {
-                libsais_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_final_sorting_scan_right_to_left_32s_block_gather(
+                    T, SA, cache - block_start, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
-            #pragma omp master
+        #pragma omp master
             {
-                libsais_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
+                libsais_final_sorting_scan_right_to_left_32s_block_sort(
+                    T, buckets, cache - block_start, block_start, block_size);
             }
 
-            #pragma omp barrier
+        #pragma omp barrier
 
             {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
+                libsais_compact_and_place_cached_suffixes(
+                    SA, cache - block_start, omp_block_start, omp_block_size);
             }
         }
-#endif
+    #endif
     }
 }
 
 #endif
 
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t index = -1;
 
-    if (threads == 1 || n < 65536)
-    {
-        index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+    if (threads == 1 || n < 65536) {
+        index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket,
+                                                        0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
+            if (SA[block_start] == 0) {
                 index = (sa_sint_t)block_start--;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; }
-                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
-                fast_sint_t block_size    = block_start - block_end;
-
-                if (block_size < 32)
-                {
-                    for (; block_start > block_end; block_start -= 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start -
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end < 0) {
+                    block_max_end = -1;
                 }
-                else
-                {
-                    libsais_final_bwt_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start - 1;
+                while (block_end > block_max_end && SA[block_end] != 0) {
+                    block_end--;
+                }
+                fast_sint_t block_size = block_start - block_end;
+
+                if (block_size < 32) {
+                    for (; block_start > block_end; block_start -= 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p & SAINT_MAX;
+                        if (p > 0) {
+                            p--;
+                            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+                            SA[block_start] = c1;
+                            sa_sint_t t = c0 | SAINT_MIN;
+                            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+                        }
+                    }
+                } else {
+                    libsais_final_bwt_scan_right_to_left_8u_block_omp(
+                        T, SA, induction_bucket, block_end + 1, block_size,
+                        threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5680,38 +8245,54 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RES
     return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
+static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I,
+                                                    induction_bucket, 0, n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
+            if (SA[block_start] == 0) {
                 block_start--;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; }
-                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
-                fast_sint_t block_size    = block_start - block_end;
-
-                if (block_size < 32)
-                {
-                    for (; block_start > block_end; block_start -= 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start -
+                    ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                               16 * (fast_sint_t)threads) /
+                                              2);
+                if (block_max_end < 0) {
+                    block_max_end = -1;
                 }
-                else
-                {
-                    libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start - 1;
+                while (block_end > block_max_end && SA[block_end] != 0) {
+                    block_end--;
+                }
+                fast_sint_t block_size = block_start - block_end;
+
+                if (block_size < 32) {
+                    for (; block_start > block_end; block_start -= 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p & SAINT_MAX;
+                        if (p > 0) {
+                            p--;
+                            uint8_t c0 = T[p - (p > 0)], c1 = T[p];
+                            SA[block_start] = c1;
+                            sa_sint_t t = c0 | SAINT_MIN;
+                            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+                            if ((p & rm) == 0) {
+                                I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
+                            }
+                        }
+                    }
+                } else {
+                    libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
+                        T, SA, rm, I, induction_bucket, block_end + 1,
+                        block_size, threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5722,38 +8303,49 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * REST
 #endif
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+static void libsais_final_sorting_scan_right_to_left_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0,
+                                                    n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
-        {
-            if (SA[block_start] == 0)
-            {
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
+            if (SA[block_start] == 0) {
                 block_start--;
-            }
-            else
-            {
-                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < -1) { block_max_end = -1; }
-                fast_sint_t block_end     = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
-                fast_sint_t block_size    = block_start - block_end;
-
-                if (block_size < 32)
-                {
-                    for (; block_start > block_end; block_start -= 1)
-                    {
-                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
-                    }
+            } else {
+                fast_sint_t block_max_end =
+                    block_start -
+                    ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
+                                              16 * (fast_sint_t)threads);
+                if (block_max_end < -1) {
+                    block_max_end = -1;
                 }
-                else
-                {
-                    libsais_final_sorting_scan_right_to_left_8u_block_omp(T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
+                fast_sint_t block_end = block_start - 1;
+                while (block_end > block_max_end && SA[block_end] != 0) {
+                    block_end--;
+                }
+                fast_sint_t block_size = block_start - block_end;
+
+                if (block_size < 32) {
+                    for (; block_start > block_end; block_start -= 1) {
+                        sa_sint_t p = SA[block_start];
+                        SA[block_start] = p & SAINT_MAX;
+                        if (p > 0) {
+                            p--;
+                            SA[--induction_bucket[T[p]]] =
+                                p | ((sa_sint_t)(T[p - (p > 0)] > T[p])
+                                     << (SAINT_BIT - 1));
+                        }
+                    }
+                } else {
+                    libsais_final_sorting_scan_right_to_left_8u_block_omp(
+                        T, SA, induction_bucket, block_end + 1, block_size,
+                        threads, thread_state);
                     block_start = block_end;
                 }
             }
@@ -5764,21 +8356,28 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * REST
 #endif
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (threads == 1 || n < 65536)
-    {
-        libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
+static void libsais_final_sorting_scan_right_to_left_32s_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0,
+                                                     n);
     }
 #if defined(_OPENMP)
-    else
-    {
+    else {
         fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
-        {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
+        for (block_start = (fast_sint_t)n - 1; block_start >= 0;
+             block_start = block_end) {
+            block_end = block_start -
+                        (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+            if (block_end < 0) {
+                block_end = -1;
+            }
 
-            libsais_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
+            libsais_final_sorting_scan_right_to_left_32s_block_omp(
+                T, SA, induction_bucket, thread_state[0].state.cache,
+                block_end + 1, block_start - block_end, threads);
         }
     }
 #else
@@ -5786,150 +8385,263 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * R
 #endif
 }
 
-static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
-{
+static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                           sa_sint_t k,
+                                           sa_sint_t * RESTRICT bucket_start,
+                                           sa_sint_t * RESTRICT bucket_end,
+                                           sa_sint_t threads) {
     fast_sint_t c;
 
 #if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel for schedule(static, 1) \
+        num_threads(threads) if (threads > 1 && n >= 65536)
 #else
-    UNUSED(threads); UNUSED(n);
+    UNUSED(threads);
+    UNUSED(n);
 #endif
-    for (c = 0; c < k; ++c)
-    {
-        if (bucket_end[c] > bucket_start[c])
-        {
-            memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
+    for (c = 0; c < k; ++c) {
+        if (bucket_end[c] > bucket_start[c]) {
+            memset(&SA[bucket_start[c]], 0,
+                   ((size_t)bucket_end[c] - (size_t)bucket_start[c]) *
+                       sizeof(sa_sint_t));
         }
     }
 }
 
-static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (!bwt)
-    {
-        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
-        if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
-        libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+static sa_sint_t libsais_induce_final_order_8u_omp(
+    const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (!bwt) {
+        libsais_final_sorting_scan_left_to_right_8u_omp(
+            T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+        if (threads > 1 && n >= 65536) {
+            libsais_clear_lms_suffixes_omp(
+                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                &buckets[7 * ALPHABET_SIZE], threads);
+        }
+        libsais_final_sorting_scan_right_to_left_8u_omp(
+            T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
         return 0;
-    }
-    else if (I != NULL)
-    {
-        libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
-        if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
-        libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+    } else if (I != NULL) {
+        libsais_final_bwt_aux_scan_left_to_right_8u_omp(
+            T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads,
+            thread_state);
+        if (threads > 1 && n >= 65536) {
+            libsais_clear_lms_suffixes_omp(
+                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                &buckets[7 * ALPHABET_SIZE], threads);
+        }
+        libsais_final_bwt_aux_scan_right_to_left_8u_omp(
+            T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads,
+            thread_state);
         return 0;
-    }
-    else
-    {
-        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
-        if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
-        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
-    }
-}
-
-static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+    } else {
+        libsais_final_bwt_scan_left_to_right_8u_omp(
+            T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+        if (threads > 1 && n >= 65536) {
+            libsais_clear_lms_suffixes_omp(
+                SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+                &buckets[7 * ALPHABET_SIZE], threads);
+        }
+        return libsais_final_bwt_scan_right_to_left_8u_omp(
+            T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+    }
+}
+
+static void libsais_induce_final_order_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k],
+                                                     threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k],
+                                                     threads, thread_state);
+}
+
+static void libsais_induce_final_order_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k],
+                                                     threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k],
+                                                     threads, thread_state);
+}
+
+static void libsais_induce_final_order_32s_2k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k],
+                                                     threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k],
+                                                     threads, thread_state);
+}
+
+static void libsais_induce_final_order_32s_1k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_start_32s_1k(k, buckets);
-    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads,
+                                                     thread_state);
 
     libsais_count_suffixes_32s(T, n, k, buckets);
     libsais_initialize_buckets_end_32s_1k(k, buckets);
-    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads,
+                                                     thread_state);
 }
 
-static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
     sa_sint_t i, j;
-    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = (sa_sint_t)omp_block_start,
+        j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size -
+            2 * (sa_sint_t)prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
-        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
-        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
-        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
-        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
-
-        sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; const sa_sint_t * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL);
-        sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; const sa_sint_t * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
-        sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; const sa_sint_t * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
-        sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; const sa_sint_t * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
-
-        sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f;
-        sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f;
-        sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f;
-        sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f;
-    }
-
-    for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1)
-    {
-        sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f;
+        libsais_prefetchw(
+            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+        libsais_prefetchw(
+            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+        libsais_prefetchw(
+            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+        libsais_prefetchw(
+            &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+
+        sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0];
+        const sa_sint_t * Tq0 = &T[q0];
+        libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL);
+        sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1];
+        const sa_sint_t * Tq1 = &T[q1];
+        libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
+        sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2];
+        const sa_sint_t * Tq2 = &T[q2];
+        libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
+        sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3];
+        const sa_sint_t * Tq3 = &T[q3];
+        libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
+
+        sa_uint_t p0 = (sa_uint_t)SA[i + 0];
+        sa_sint_t s0 = SAm[p0 >> 1];
+        if (s0 < 0) {
+            T[p0] |= SAINT_MIN;
+            f++;
+            s0 = i + 0 + SAINT_MIN + f;
+        }
+        SAm[p0 >> 1] = s0 - f;
+        sa_uint_t p1 = (sa_uint_t)SA[i + 1];
+        sa_sint_t s1 = SAm[p1 >> 1];
+        if (s1 < 0) {
+            T[p1] |= SAINT_MIN;
+            f++;
+            s1 = i + 1 + SAINT_MIN + f;
+        }
+        SAm[p1 >> 1] = s1 - f;
+        sa_uint_t p2 = (sa_uint_t)SA[i + 2];
+        sa_sint_t s2 = SAm[p2 >> 1];
+        if (s2 < 0) {
+            T[p2] |= SAINT_MIN;
+            f++;
+            s2 = i + 2 + SAINT_MIN + f;
+        }
+        SAm[p2 >> 1] = s2 - f;
+        sa_uint_t p3 = (sa_uint_t)SA[i + 3];
+        sa_sint_t s3 = SAm[p3 >> 1];
+        if (s3 < 0) {
+            T[p3] |= SAINT_MIN;
+            f++;
+            s3 = i + 3 + SAINT_MIN + f;
+        }
+        SAm[p3 >> 1] = s3 - f;
+    }
+
+    for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) {
+        sa_uint_t p = (sa_uint_t)SA[i];
+        sa_sint_t s = SAm[p >> 1];
+        if (s < 0) {
+            T[p] |= SAINT_MIN;
+            f++;
+            s = i + SAINT_MIN + f;
+        }
+        SAm[p >> 1] = s - f;
     }
 
     return f;
 }
 
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAl = &SA[0];
     sa_sint_t * RESTRICT SAr = &SA[0];
 
     fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
-    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
-    {
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
+        j = (fast_sint_t)m + omp_block_start + 3;
+         i >= j; i -= 4) {
         libsais_prefetch(&SA[i - prefetch_distance]);
 
-        sa_sint_t p0 = SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0;
-        sa_sint_t p1 = SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0;
-        sa_sint_t p2 = SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0;
-        sa_sint_t p3 = SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0;
+        sa_sint_t p0 = SA[i - 0];
+        SAl[l] = p0 & SAINT_MAX;
+        l -= p0 < 0;
+        SAr[r] = p0 - 1;
+        r -= p0 > 0;
+        sa_sint_t p1 = SA[i - 1];
+        SAl[l] = p1 & SAINT_MAX;
+        l -= p1 < 0;
+        SAr[r] = p1 - 1;
+        r -= p1 > 0;
+        sa_sint_t p2 = SA[i - 2];
+        SAl[l] = p2 & SAINT_MAX;
+        l -= p2 < 0;
+        SAr[r] = p2 - 1;
+        r -= p2 > 0;
+        sa_sint_t p3 = SA[i - 3];
+        SAl[l] = p3 & SAINT_MAX;
+        l -= p3 < 0;
+        SAr[r] = p3 - 1;
+        r -= p3 > 0;
+    }
+
+    for (j -= 3; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SAl[l] = p & SAINT_MAX;
+        l -= p < 0;
+        SAr[r] = p - 1;
+        r -= p > 0;
     }
 
-    for (j -= 3; i >= j; i -= 1)
-    {
-        sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0;
-    }
-    
-    *pl = l + 1; *pr = r + 1;
+    *pl = l + 1;
+    *pr = r + 1;
 }
 
-
 #if defined(_OPENMP)
 
-static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
+                                               sa_sint_t m,
+                                               fast_sint_t omp_block_start,
+                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     sa_sint_t * RESTRICT SAm = &SA[m];
 
-    fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    fast_sint_t i, j;
+    sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
         libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
@@ -5943,8 +8655,7 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_
         f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
     }
 
@@ -5953,49 +8664,59 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_
 
 #endif
 
-static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t f = 0;
 
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : m - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
+                T, SA, m, 0, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_unique_suffixes(SA, m, omp_block_start,
+                                                  omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+                fast_sint_t t, count = 0;
+                for (t = 0; t < omp_thread_num; ++t) {
+                    count += thread_state[t].state.count;
+                }
 
-                if (omp_thread_num == omp_num_threads - 1)
-                {
-                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
+                if (omp_thread_num == omp_num_threads - 1) {
+                    f = (sa_sint_t)(count +
+                                    thread_state[omp_thread_num].state.count);
                 }
 
-                libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+                libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
+                    T, SA, m, (sa_sint_t)count, omp_block_start,
+                    omp_block_size);
             }
         }
 #endif
@@ -6004,65 +8725,88 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_s
     return f;
 }
 
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+    sa_sint_t f, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && \
+                                                  n >= 131072 && m < fs)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
+        fast_sint_t omp_block_stride =
+            (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1
+                ? omp_block_stride
+                : ((fast_sint_t)n >> 1) - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
+        if (omp_num_threads == 1) {
             fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
-            libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
+            libsais_compact_unique_and_nonunique_lms_suffixes_32s(
+                SA, m, &l, &r, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.position   = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count      = (fast_sint_t)m + omp_block_start + omp_block_size;
+                thread_state[omp_thread_num].state.position =
+                    (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start +
+                    omp_block_size;
+                thread_state[omp_thread_num].state.count =
+                    (fast_sint_t)m + omp_block_start + omp_block_size;
 
-                libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
+                libsais_compact_unique_and_nonunique_lms_suffixes_32s(
+                    SA, m, &thread_state[omp_thread_num].state.position,
+                    &thread_state[omp_thread_num].state.count, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 fast_sint_t t, position;
 
-                for (position = m, t = omp_num_threads - 1; t >= 0; --t)
-                { 
-                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
-                    fast_sint_t count             = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);
-
-                    if (count > 0)
-                    {
-                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
+                for (position = m, t = omp_num_threads - 1; t >= 0; --t) {
+                    fast_sint_t omp_block_end = t < omp_num_threads - 1
+                                                    ? omp_block_stride * (t + 1)
+                                                    : ((fast_sint_t)n >> 1);
+                    fast_sint_t count =
+                        ((fast_sint_t)m + ((fast_sint_t)n >> 1) +
+                         omp_block_end - thread_state[t].state.position);
+
+                    if (count > 0) {
+                        position -= count;
+                        memcpy(&SA[position],
+                               &SA[thread_state[t].state.position],
+                               (size_t)count * sizeof(sa_sint_t));
                     }
                 }
 
-                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
-                {
-                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
-                    fast_sint_t count             = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
-
-                    if (count > 0)
-                    {
-                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
+                for (position = (fast_sint_t)n + (fast_sint_t)fs,
+                    t = omp_num_threads - 1;
+                     t >= 0; --t) {
+                    fast_sint_t omp_block_end = t < omp_num_threads - 1
+                                                    ? omp_block_stride * (t + 1)
+                                                    : ((fast_sint_t)n >> 1);
+                    fast_sint_t count = ((fast_sint_t)m + omp_block_end -
+                                         thread_state[t].state.count);
+
+                    if (count > 0) {
+                        position -= count;
+                        memcpy(&SA[position], &SA[thread_state[t].state.count],
+                               (size_t)count * sizeof(sa_sint_t));
                     }
                 }
             }
@@ -6070,331 +8814,446 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t
 #endif
     }
 
-    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
+    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m],
+           &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
 }
 
-static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
-    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
+static sa_sint_t libsais_compact_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
+        T, SA, m, threads, thread_state);
+    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
+        SA, n, m, fs, f, threads, thread_state);
 
     return f;
 }
 
-static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_merge_unique_lms_suffixes_32s(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+    const sa_sint_t * RESTRICT SAnm =
+        &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
 
-    sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
-    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
-    {
+    sa_sint_t i, j;
+    fast_sint_t tmp = *SAnm++;
+    for (i = (sa_sint_t)omp_block_start,
+        j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6;
+         i < j; i += 4) {
         libsais_prefetch(&T[i + prefetch_distance]);
 
-        sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
-        sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
-        sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
-        sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
+        sa_sint_t c0 = T[i + 0];
+        if (c0 < 0) {
+            T[i + 0] = c0 & SAINT_MAX;
+            SA[tmp] = i + 0;
+            i++;
+            tmp = *SAnm++;
+        }
+        sa_sint_t c1 = T[i + 1];
+        if (c1 < 0) {
+            T[i + 1] = c1 & SAINT_MAX;
+            SA[tmp] = i + 1;
+            i++;
+            tmp = *SAnm++;
+        }
+        sa_sint_t c2 = T[i + 2];
+        if (c2 < 0) {
+            T[i + 2] = c2 & SAINT_MAX;
+            SA[tmp] = i + 2;
+            i++;
+            tmp = *SAnm++;
+        }
+        sa_sint_t c3 = T[i + 3];
+        if (c3 < 0) {
+            T[i + 3] = c3 & SAINT_MAX;
+            SA[tmp] = i + 3;
+            i++;
+            tmp = *SAnm++;
+        }
     }
 
-    for (j += 6; i < j; i += 1)
-    {
-        sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
+    for (j += 6; i < j; i += 1) {
+        sa_sint_t c = T[i];
+        if (c < 0) {
+            T[i] = c & SAINT_MAX;
+            SA[tmp] = i;
+            i++;
+            tmp = *SAnm++;
+        }
     }
 }
 
-static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_merge_nonunique_lms_suffixes_32s(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+    const sa_sint_t * RESTRICT SAnm =
+        &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
 
-    fast_sint_t i, j; sa_sint_t tmp = *SAnm++;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
-    {
+    fast_sint_t i, j;
+    sa_sint_t tmp = *SAnm++;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
+         i += 4) {
         libsais_prefetch(&SA[i + prefetch_distance]);
 
-        if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
-        if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
-        if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
-        if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
+        if (SA[i + 0] == 0) {
+            SA[i + 0] = tmp;
+            tmp = *SAnm++;
+        }
+        if (SA[i + 1] == 0) {
+            SA[i + 1] = tmp;
+            tmp = *SAnm++;
+        }
+        if (SA[i + 2] == 0) {
+            SA[i + 2] = tmp;
+            tmp = *SAnm++;
+        }
+        if (SA[i + 3] == 0) {
+            SA[i + 3] = tmp;
+            tmp = *SAnm++;
+        }
     }
 
-    for (j += 3; i < j; i += 1)
-    {
-        if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
+    for (j += 3; i < j; i += 1) {
+        if (SA[i] == 0) {
+            SA[i] = tmp;
+            tmp = *SAnm++;
+        }
     }
 }
 
-static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_merge_unique_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_merge_unique_lms_suffixes_32s(
+                T, SA, n, m, 0, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_negative_marked_suffixes(T, omp_block_start,
+                                                           omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+                fast_sint_t t, count = 0;
+                for (t = 0; t < omp_thread_num; ++t) {
+                    count += thread_state[t].state.count;
+                }
 
-                libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
+                libsais_merge_unique_lms_suffixes_32s(
+                    T, SA, n, m, count, omp_block_start, omp_block_size);
             }
         }
 #endif
     }
 }
 
-static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static void libsais_merge_nonunique_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
-        UNUSED(threads); UNUSED(thread_state);
+        UNUSED(threads);
+        UNUSED(thread_state);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : m - omp_block_start;
 
-        if (omp_num_threads == 1)
-        {
-            libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
+        if (omp_num_threads == 1) {
+            libsais_merge_nonunique_lms_suffixes_32s(
+                SA, n, m, f, omp_block_start, omp_block_size);
         }
 #if defined(_OPENMP)
-        else
-        {
+        else {
             {
-                thread_state[omp_thread_num].state.count = libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
+                thread_state[omp_thread_num].state.count =
+                    libsais_count_zero_marked_suffixes(SA, omp_block_start,
+                                                       omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }
+                fast_sint_t t, count = f;
+                for (t = 0; t < omp_thread_num; ++t) {
+                    count += thread_state[t].state.count;
+                }
 
-                libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
+                libsais_merge_nonunique_lms_suffixes_32s(
+                    SA, n, m, count, omp_block_start, omp_block_size);
             }
         }
 #endif
     }
 }
 
-static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
-    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
+static void libsais_merge_compacted_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t f, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads,
+                                              thread_state);
+    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads,
+                                                 thread_state);
 }
 
-static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (f > 0)
-    {
+static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
+            T, SA, n, k, buckets, threads, thread_state);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
 
-        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memcpy(&SA[n - m - 1 + f], &SA[0],
+               ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
         memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
 
-        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
-    }
-    else
-    {
-        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
+                                                     thread_state);
+    } else {
+        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0,
+                                                     n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
     }
 }
 
-static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
-    if (f > 0)
-    {
+static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (f > 0) {
         memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
 
         libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
 
-        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+        memcpy(&SA[n - m - 1 + f], &SA[0],
+               ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
         memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
 
-        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
-    }
-    else
-    {
+        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
+                                                     thread_state);
+    } else {
         libsais_gather_lms_suffixes_32s(T, SA, n);
         libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
     }
 }
 
-static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_main_32s(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
-    if (k > 0 && fs / k >= 6)
-    {
+    if (k > 0 && fs / k >= 6) {
         sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
-        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * k];
-
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
-        if (m > 1)
-        {
+        sa_sint_t * RESTRICT buckets =
+            (fs - alignment) / k >= 6
+                ? (sa_sint_t *)libsais_align_up(
+                      &SA[n + fs - 6 * k - alignment],
+                      (size_t)alignment * sizeof(sa_sint_t))
+                : &SA[n + fs - 6 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(
+            T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1) {
             memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
 
-            sa_sint_t first_lms_suffix    = SA[n - m];
-            sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
-
-            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
-            libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
-
-            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
-
-            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
-            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
-
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
-            if (names < m)
-            {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
-                {
+            sa_sint_t first_lms_suffix = SA[n - m];
+            sa_sint_t left_suffixes_count =
+                libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
+                    T, k, buckets, first_lms_suffix);
+
+            libsais_radix_sort_lms_suffixes_32s_6k_omp(
+                T, SA, n, m, &buckets[4 * k], threads, thread_state);
+            libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k],
+                                                      threads);
+
+            if (threads > 1 && n >= 65536) {
+                memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
+                       (size_t)m * sizeof(sa_sint_t));
+            }
+
+            libsais_initialize_buckets_for_partial_sorting_32s_6k(
+                T, k, buckets, first_lms_suffix, left_suffixes_count);
+            libsais_induce_partial_order_32s_6k_omp(
+                T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
+                threads, thread_state);
+
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+                    SA, n, m, threads, thread_state);
+            if (names < m) {
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
+                    T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
+                                     fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
-            }
-            else
-            {
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
+                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
 
             libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
             libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
-            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
-        }
-        else
-        {
+            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
+                                              thread_state);
+        } else {
             SA[0] = SA[n - 1];
 
             libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
             libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
-            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
+            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads,
+                                              thread_state);
         }
 
         return 0;
-    }
-    else if (k > 0 && fs / k >= 4)
-    {
+    } else if (k > 0 && fs / k >= 4) {
         sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
-        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * k];
-
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
-        if (m > 1)
-        {
-            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
-
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
-            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
-            
-            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
-            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
-
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
-            if (names < m)
-            {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
-                {
+        sa_sint_t * RESTRICT buckets =
+            (fs - alignment) / k >= 4
+                ? (sa_sint_t *)libsais_align_up(
+                      &SA[n + fs - 4 * k - alignment],
+                      (size_t)alignment * sizeof(sa_sint_t))
+                : &SA[n + fs - 4 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
+            T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1) {
+            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
+                T, k, buckets, SA[n - m]);
+
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
+                                                       threads, thread_state);
+            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1],
+                                                      threads);
+
+            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1,
+                                                       buckets);
+            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets,
+                                                    threads, thread_state);
+
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+                    SA, n, m, threads, thread_state);
+            if (names < m) {
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
+                    T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
+                                     fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
-            }
-            else
-            {
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
+                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
-        }
-        else
-        {
+        } else {
             SA[0] = SA[n - 1];
         }
 
         libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
         libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
-        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
+                                          thread_state);
 
         return 0;
-    }
-    else if (k > 0 && fs / k >= 2)
-    {
+    } else if (k > 0 && fs / k >= 2) {
         sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
-        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * k];
-
-        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
-        if (m > 1)
-        {
-            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
-
-            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
-            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
+        sa_sint_t * RESTRICT buckets =
+            (fs - alignment) / k >= 2
+                ? (sa_sint_t *)libsais_align_up(
+                      &SA[n + fs - 2 * k - alignment],
+                      (size_t)alignment * sizeof(sa_sint_t))
+                : &SA[n + fs - 2 * k];
+
+        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
+            T, SA, n, k, buckets, threads, thread_state);
+        if (m > 1) {
+            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
+                T, k, buckets, SA[n - m]);
+
+            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
+                                                       threads, thread_state);
+            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1,
+                                                       buckets);
 
             libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
-            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
-
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
-            if (names < m)
-            {
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
-                {
+            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets,
+                                                    threads, thread_state);
+
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
+                    T, SA, n, m, threads);
+            if (names < m) {
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
+                    T, SA, n, m, fs, threads, thread_state);
+
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
+                                     fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads, thread_state);
-            }
-            else
-            {
+                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
+                    T, SA, n, k, m, fs, f, buckets, threads, thread_state);
+            } else {
                 libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
             }
-        }
-        else
-        {
+        } else {
             SA[0] = SA[n - 1];
         }
 
@@ -6402,84 +9261,120 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S
         libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
 
         libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
-        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
+        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads,
+                                          thread_state);
 
         return 0;
-    }
-    else
-    {
-        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
+    } else {
+        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned(
+                                          (size_t)k * sizeof(sa_sint_t), 4096)
+                                    : (sa_sint_t *)NULL;
 
         sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
-        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;
+        sa_sint_t * RESTRICT buckets =
+            fs - alignment >= k ? (sa_sint_t *)libsais_align_up(
+                                      &SA[n + fs - k - alignment],
+                                      (size_t)alignment * sizeof(sa_sint_t))
+            : fs >= k ? &SA[n + fs - k]
+                      : buffer;
 
-        if (buckets == NULL) { return -2; }
+        if (buckets == NULL) {
+            return -2;
+        }
 
         memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
 
-        libsais_count_suffixes_32s(T, n, k, buckets); 
+        libsais_count_suffixes_32s(T, n, k, buckets);
         libsais_initialize_buckets_end_32s_1k(k, buckets);
 
         sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
-        if (m > 1)
-        {
-            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
-
-            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
-            if (names < m)
-            {
-                if (buffer != NULL) { libsais_free_aligned(buffer); buckets = NULL; }
+        if (m > 1) {
+            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets,
+                                                    threads, thread_state);
+
+            sa_sint_t names =
+                libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
+                    T, SA, n, m, threads);
+            if (names < m) {
+                if (buffer != NULL) {
+                    libsais_free_aligned(buffer);
+                    buckets = NULL;
+                }
 
-                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
+                    T, SA, n, m, fs, threads, thread_state);
 
-                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state) != 0)
-                {
+                if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
+                                     fs + n - 2 * m + f, threads,
+                                     thread_state) != 0) {
                     return -2;
                 }
 
-                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
+                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
+                    T, SA, n, m, fs, f, threads, thread_state);
 
-                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
-                if (buckets == NULL) { return -2; }
+                if (buckets == NULL) {
+                    buckets = buffer = (sa_sint_t *)libsais_alloc_aligned(
+                        (size_t)k * sizeof(sa_sint_t), 4096);
+                }
+                if (buckets == NULL) {
+                    return -2;
+                }
             }
-            
+
             libsais_count_suffixes_32s(T, n, k, buckets);
             libsais_initialize_buckets_end_32s_1k(k, buckets);
             libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
         }
 
-        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
+        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads,
+                                          thread_state);
         libsais_free_aligned(buffer);
 
         return 0;
     }
 }
 
-static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
-{
+static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n,
+                                 sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
+                                 sa_sint_t r, sa_sint_t * RESTRICT I,
+                                 sa_sint_t fs, sa_sint_t * freq,
+                                 sa_sint_t threads,
+                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
 
-    sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
+    sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(
+        T, SA, n, buckets, threads, thread_state);
 
     libsais_initialize_buckets_start_and_end_8u(buckets, freq);
 
-    if (m > 0)
-    {
-        sa_sint_t first_lms_suffix    = SA[n - m];
-        sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
+    if (m > 0) {
+        sa_sint_t first_lms_suffix = SA[n - m];
+        sa_sint_t left_suffixes_count =
+            libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
+                T, buckets, first_lms_suffix);
 
-        if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
-        libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
-        if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }
+        if (threads > 1 && n >= 65536) {
+            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
+        }
+        libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads,
+                                               thread_state);
+        if (threads > 1 && n >= 65536) {
+            memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
+                   (size_t)m * sizeof(sa_sint_t));
+        }
 
-        libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
-        libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);
+        libsais_initialize_buckets_for_partial_sorting_8u(
+            T, buckets, first_lms_suffix, left_suffixes_count);
+        libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix,
+                                            left_suffixes_count, threads,
+                                            thread_state);
 
-        sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
-        if (names < m)
-        {
-            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
-            {
+        sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(
+            SA, n, m, fs, threads, thread_state);
+        if (names < m) {
+            if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m,
+                                 threads, thread_state) != 0) {
                 return -2;
             }
 
@@ -6488,23 +9383,27 @@ static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n,
         }
 
         libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets);
-    }
-    else
-    {
+    } else {
         memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
     }
 
-    return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
+    return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets,
+                                             threads, thread_state);
 }
 
-static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
-{
-    LIBSAIS_THREAD_STATE *  RESTRICT thread_state   = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
-    sa_sint_t *             RESTRICT buckets        = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n,
+                              sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
+                              sa_sint_t fs, sa_sint_t * freq,
+                              sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
+        8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
 
     sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
-        ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
-        : -2;
+                          ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs,
+                                            freq, threads, thread_state)
+                          : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_thread_state(thread_state);
@@ -6512,33 +9411,39 @@ static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa
     return index;
 }
 
-static int32_t libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads)
-{
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
+static int32_t libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n,
+                                sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+        threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
 
-    sa_sint_t index = thread_state != NULL || threads == 1
-        ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
-        : -2;
+    sa_sint_t index =
+        thread_state != NULL || threads == 1
+            ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
+            : -2;
 
     libsais_free_thread_state(thread_state);
 
     return index;
 }
 
-static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq)
-{
-    return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
-        ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state)
-        : -2;
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx,
+                                  const uint8_t * T, sa_sint_t * SA,
+                                  sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
+                                  sa_sint_t * I, sa_sint_t fs,
+                                  sa_sint_t * freq) {
+    return ctx != NULL && (ctx->buckets != NULL &&
+                           (ctx->thread_state != NULL || ctx->threads == 1))
+               ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq,
+                                 (sa_sint_t)ctx->threads, ctx->thread_state)
+               : -2;
 }
 
-static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
-{
+static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A,
+                                sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
-    {
+    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) {
         libsais_prefetch(&A[i + prefetch_distance]);
 
         U[i + 0] = (uint8_t)A[i + 0];
@@ -6551,112 +9456,123 @@ static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa
         U[i + 7] = (uint8_t)A[i + 7];
     }
 
-    for (j += 7; i < j; i += 1)
-    {
+    for (j += 7; i < j; i += 1) {
         U[i] = (uint8_t)A[i];
     }
 }
 
 #if defined(_OPENMP)
 
-static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
-{
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
-#endif
+static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U,
+                                    sa_sint_t * RESTRICT A, sa_sint_t n,
+                                    sa_sint_t threads) {
+    #if defined(_OPENMP)
+        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
+    #endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
-        fast_sint_t omp_block_stride  = ((fast_sint_t)n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start;
-#else
+    #if defined(_OPENMP)
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
+        fast_sint_t omp_block_stride =
+            ((fast_sint_t)n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : (fast_sint_t)n - omp_block_start;
+    #else
         UNUSED(threads);
 
-        fast_sint_t omp_block_start   = 0;
-        fast_sint_t omp_block_size    = (fast_sint_t)n;
-#endif
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n;
+    #endif
 
-        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
+        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start,
+                            (sa_sint_t)omp_block_size);
     }
 }
 
 #endif
 
-void * libsais_create_ctx(void)
-{
-    return (void *)libsais_create_ctx_main(1);
-}
+void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); }
 
-void libsais_free_ctx(void * ctx)
-{
+void libsais_free_ctx(void * ctx) {
     libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx);
 }
 
-int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
-{
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
-    {
+int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs,
+                int32_t * freq) {
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
         return -1;
-    }
-    else if (n < 2)
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } }
+    } else if (n < 2) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            SA[0] = 0;
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return 0;
     }
 
     return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
 }
 
-int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs)
-{
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
-    {
+int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k,
+                    int32_t fs) {
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
         return -1;
-    }
-    else if (n < 2)
-    {
-        if (n == 1) { SA[0] = 0; }
+    } else if (n < 2) {
+        if (n == 1) {
+            SA[0] = 0;
+        }
         return 0;
     }
 
     return libsais_main_int(T, SA, n, k, fs, 1);
 }
 
-int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq)
-{
-    if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0))
-    {
+int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA,
+                    int32_t n, int32_t fs, int32_t * freq) {
+    if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
         return -1;
-    }
-    else if (n < 2)
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } }
+    } else if (n < 2) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            SA[0] = 0;
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return 0;
     }
 
-    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
+    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL,
+                            fs, freq);
 }
 
-int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
-    { 
-        return -1; 
-    }
-    else if (n <= 1) 
-    { 
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                    int32_t fs, int32_t * freq) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
+        return -1;
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return n;
     }
 
     sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
-    if (index >= 0) 
-    { 
+    if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
@@ -6667,22 +9583,26 @@ int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int3
     return index;
 }
 
-int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
-    { 
-        return -1; 
-    }
-    else if (n <= 1) 
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                        int32_t fs, int32_t * freq, int32_t r, int32_t * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+        return -1;
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         I[0] = n;
         return 0;
     }
 
-    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0)
-    {
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) {
         return -2;
     }
 
@@ -6693,29 +9613,38 @@ int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
     return 0;
 }
 
-int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq)
-{
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0))
-    { 
-        return -1; 
-    }
-    else if (n <= 1) 
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                        int32_t * A, int32_t n, int32_t fs, int32_t * freq) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
+        (fs < 0)) {
+        return -1;
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return n;
     }
 
-    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
-    if (index >= 0) 
-    { 
+    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1,
+                                       0, NULL, fs, freq);
+    if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
 
 #if defined(_OPENMP)
-        libsais_bwt_copy_8u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-        libsais_bwt_copy_8u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+        libsais_bwt_copy_8u_omp(
+            U + 1, A, index - 1,
+            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+        libsais_bwt_copy_8u_omp(
+            U + index, A + index, n - index,
+            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
 #else
         libsais_bwt_copy_8u(U + 1, A, index - 1);
         libsais_bwt_copy_8u(U + index, A + index, n - index);
@@ -6725,30 +9654,38 @@ int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_
     return index;
 }
 
-int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I)
-{
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL))
-    { 
-        return -1; 
-    }
-    else if (n <= 1) 
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                            int32_t * A, int32_t n, int32_t fs, int32_t * freq,
+                            int32_t r, int32_t * I) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
+        (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+        return -1;
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         I[0] = n;
         return 0;
     }
 
-    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0)
-    {
+    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs,
+                         freq) != 0) {
         return -2;
     }
 
     U[0] = T[n - 1];
 
 #if defined(_OPENMP)
-    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1,
+                            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0],
+                            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
 #else
     libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
     libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
@@ -6759,24 +9696,29 @@ int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, in
 
 #if defined(_OPENMP)
 
-void * libsais_create_ctx_omp(int32_t threads)
-{
-    if (threads < 0) { return NULL; }
+void * libsais_create_ctx_omp(int32_t threads) {
+    if (threads < 0) {
+        return NULL;
+    }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
     return (void *)libsais_create_ctx_main(threads);
 }
 
-int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
-{
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0))
-    {
+int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs,
+                    int32_t * freq, int32_t threads) {
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
         return -1;
-    }
-    else if (n < 2)
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } }
+    } else if (n < 2) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            SA[0] = 0;
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return 0;
     }
 
@@ -6785,15 +9727,14 @@ int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int3
     return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
 }
 
-int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads)
-{
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0))
-    {
+int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k,
+                        int32_t fs, int32_t threads) {
+    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
         return -1;
-    }
-    else if (n < 2)
-    {
-        if (n == 1) { SA[0] = 0; }
+    } else if (n < 2) {
+        if (n == 1) {
+            SA[0] = 0;
+        }
         return 0;
     }
 
@@ -6802,24 +9743,28 @@ int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t
     return libsais_main_int(T, SA, n, k, fs, threads);
 }
 
-int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0))
-    {
+int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                        int32_t fs, int32_t * freq, int32_t threads) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+        (threads < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         return n;
     }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
 
     sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, threads);
-    if (index >= 0)
-    {
+    if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
@@ -6830,24 +9775,29 @@ int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
     return index;
 }
 
-int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0))
-    {
+int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                            int32_t n, int32_t fs, int32_t * freq, int32_t r,
+                            int32_t * I, int32_t threads) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); }
-        if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } }
+    } else if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]]++;
+            }
+        }
         I[0] = n;
         return 0;
     }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
 
-    if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0)
-    {
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0) {
         return -2;
     }
 
@@ -6860,19 +9810,30 @@ int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t
 
 #endif
 
-static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads)
-{
-    LIBSAIS_UNBWT_CONTEXT *     RESTRICT ctx            = (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
-    sa_uint_t *                 RESTRICT bucket2        = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    uint16_t *                  RESTRICT fastbits       = (uint16_t *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096);
-    sa_uint_t *                 RESTRICT buckets        = threads > 1 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL;
-
-    if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1))
-    {
-        ctx->bucket2    = bucket2;
-        ctx->fastbits   = fastbits;
-        ctx->buckets    = buckets;
-        ctx->threads    = threads;
+static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(
+    sa_sint_t threads) {
+    LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
+        (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(
+            sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
+        ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(
+        (1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096);
+    sa_uint_t * RESTRICT buckets =
+        threads > 1
+            ? (sa_uint_t *)libsais_alloc_aligned(
+                  (size_t)threads *
+                      (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+                      sizeof(sa_uint_t),
+                  4096)
+            : NULL;
+
+    if (ctx != NULL && bucket2 != NULL && fastbits != NULL &&
+        (buckets != NULL || threads == 1)) {
+        ctx->bucket2 = bucket2;
+        ctx->fastbits = fastbits;
+        ctx->buckets = buckets;
+        ctx->threads = threads;
 
         return ctx;
     }
@@ -6885,10 +9846,8 @@ static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads)
     return NULL;
 }
 
-static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx)
-{
-    if (ctx != NULL)
-    {
+static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
+    if (ctx != NULL) {
         libsais_free_aligned(ctx->buckets);
         libsais_free_aligned(ctx->fastbits);
         libsais_free_aligned(ctx->bucket2);
@@ -6896,14 +9855,14 @@ static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx)
     }
 }
 
-static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count)
-{
+static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T,
+                                            fast_sint_t n,
+                                            sa_uint_t * RESTRICT count) {
     const fast_sint_t prefetch_distance = 256;
 
     const uint8_t * RESTRICT T_p = T;
 
-    if (n >= 1024)
-    {
+    if (n >= 1024) {
         sa_uint_t copy[4 * (ALPHABET_SIZE + 16)];
 
         memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t));
@@ -6913,117 +9872,277 @@ static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sin
         sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16);
         sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16);
 
-        for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; }
+        for (; T_p < (uint8_t *)((ptrdiff_t)(T + 63) & (-64)); T_p += 1) {
+            copy0[T_p[0]]++;
+        }
 
-        fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1];
+        fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0],
+                    y = ((const uint32_t *)(const void *)T_p)[1];
 
-        for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64)
-        { 
+        for (; T_p < (uint8_t *)((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) {
             libsais_prefetch(&T_p[prefetch_distance]);
 
-            fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3];
-            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
-            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
-
-            x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5];
-            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
-            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
-
-            z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7];
-            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
-            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
-
-            x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9];
-            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
-            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
-
-            z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11];
-            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
-            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
-
-            x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13];
-            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
-            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
-
-            z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15];
-            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
-            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
-
-            x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17];
-            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
-            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
-        }
-
-        copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
-        copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;
+            fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2],
+                        w = ((const uint32_t *)(const void *)T_p)[3];
+            copy0[(uint8_t)x]++;
+            x >>= 8;
+            copy1[(uint8_t)x]++;
+            x >>= 8;
+            copy2[(uint8_t)x]++;
+            x >>= 8;
+            copy3[x]++;
+            copy0[(uint8_t)y]++;
+            y >>= 8;
+            copy1[(uint8_t)y]++;
+            y >>= 8;
+            copy2[(uint8_t)y]++;
+            y >>= 8;
+            copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[4];
+            y = ((const uint32_t *)(const void *)T_p)[5];
+            copy0[(uint8_t)z]++;
+            z >>= 8;
+            copy1[(uint8_t)z]++;
+            z >>= 8;
+            copy2[(uint8_t)z]++;
+            z >>= 8;
+            copy3[z]++;
+            copy0[(uint8_t)w]++;
+            w >>= 8;
+            copy1[(uint8_t)w]++;
+            w >>= 8;
+            copy2[(uint8_t)w]++;
+            w >>= 8;
+            copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[6];
+            w = ((const uint32_t *)(const void *)T_p)[7];
+            copy0[(uint8_t)x]++;
+            x >>= 8;
+            copy1[(uint8_t)x]++;
+            x >>= 8;
+            copy2[(uint8_t)x]++;
+            x >>= 8;
+            copy3[x]++;
+            copy0[(uint8_t)y]++;
+            y >>= 8;
+            copy1[(uint8_t)y]++;
+            y >>= 8;
+            copy2[(uint8_t)y]++;
+            y >>= 8;
+            copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[8];
+            y = ((const uint32_t *)(const void *)T_p)[9];
+            copy0[(uint8_t)z]++;
+            z >>= 8;
+            copy1[(uint8_t)z]++;
+            z >>= 8;
+            copy2[(uint8_t)z]++;
+            z >>= 8;
+            copy3[z]++;
+            copy0[(uint8_t)w]++;
+            w >>= 8;
+            copy1[(uint8_t)w]++;
+            w >>= 8;
+            copy2[(uint8_t)w]++;
+            w >>= 8;
+            copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[10];
+            w = ((const uint32_t *)(const void *)T_p)[11];
+            copy0[(uint8_t)x]++;
+            x >>= 8;
+            copy1[(uint8_t)x]++;
+            x >>= 8;
+            copy2[(uint8_t)x]++;
+            x >>= 8;
+            copy3[x]++;
+            copy0[(uint8_t)y]++;
+            y >>= 8;
+            copy1[(uint8_t)y]++;
+            y >>= 8;
+            copy2[(uint8_t)y]++;
+            y >>= 8;
+            copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[12];
+            y = ((const uint32_t *)(const void *)T_p)[13];
+            copy0[(uint8_t)z]++;
+            z >>= 8;
+            copy1[(uint8_t)z]++;
+            z >>= 8;
+            copy2[(uint8_t)z]++;
+            z >>= 8;
+            copy3[z]++;
+            copy0[(uint8_t)w]++;
+            w >>= 8;
+            copy1[(uint8_t)w]++;
+            w >>= 8;
+            copy2[(uint8_t)w]++;
+            w >>= 8;
+            copy3[w]++;
+
+            z = ((const uint32_t *)(const void *)T_p)[14];
+            w = ((const uint32_t *)(const void *)T_p)[15];
+            copy0[(uint8_t)x]++;
+            x >>= 8;
+            copy1[(uint8_t)x]++;
+            x >>= 8;
+            copy2[(uint8_t)x]++;
+            x >>= 8;
+            copy3[x]++;
+            copy0[(uint8_t)y]++;
+            y >>= 8;
+            copy1[(uint8_t)y]++;
+            y >>= 8;
+            copy2[(uint8_t)y]++;
+            y >>= 8;
+            copy3[y]++;
+
+            x = ((const uint32_t *)(const void *)T_p)[16];
+            y = ((const uint32_t *)(const void *)T_p)[17];
+            copy0[(uint8_t)z]++;
+            z >>= 8;
+            copy1[(uint8_t)z]++;
+            z >>= 8;
+            copy2[(uint8_t)z]++;
+            z >>= 8;
+            copy3[z]++;
+            copy0[(uint8_t)w]++;
+            w >>= 8;
+            copy1[(uint8_t)w]++;
+            w >>= 8;
+            copy2[(uint8_t)w]++;
+            w >>= 8;
+            copy3[w]++;
+        }
+
+        copy0[(uint8_t)x]++;
+        x >>= 8;
+        copy1[(uint8_t)x]++;
+        x >>= 8;
+        copy2[(uint8_t)x]++;
+        x >>= 8;
+        copy3[x]++;
+        copy0[(uint8_t)y]++;
+        y >>= 8;
+        copy1[(uint8_t)y]++;
+        y >>= 8;
+        copy2[(uint8_t)y]++;
+        y >>= 8;
+        copy3[y]++;
 
         T_p += 8;
 
-        fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; }
+        fast_uint_t i;
+        for (i = 0; i < ALPHABET_SIZE; i++) {
+            count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i];
+        }
     }
 
-    for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; }
+    for (; T_p < T + n; T_p += 1) {
+        count[T_p[0]]++;
+    }
 }
 
-static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2)
-{
+static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
     fast_uint_t x, y, c, d;
-    for (x = 0; x != ALPHABET_SIZE; x += 16)
-    {
-        for (c = x; c != x + 16; ++c)
-        {
-            for (d = c + 1; d != x + 16; ++d)
-            {
-                sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp;
+    for (x = 0; x != ALPHABET_SIZE; x += 16) {
+        for (c = x; c != x + 16; ++c) {
+            for (d = c + 1; d != x + 16; ++d) {
+                sa_uint_t tmp = bucket2[(d << 8) + c];
+                bucket2[(d << 8) + c] = bucket2[(c << 8) + d];
+                bucket2[(c << 8) + d] = tmp;
             }
         }
 
-        for (y = x + 16; y != ALPHABET_SIZE; y += 16)
-        {
-            for (c = x; c != x + 16; ++c)
-            {
+        for (y = x + 16; y != ALPHABET_SIZE; y += 16) {
+            for (c = x; c != x + 16; ++c) {
                 sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c];
                 sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y];
 
-                sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00;
-                sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01;
-                sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02;
-                sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03;
-                sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04;
-                sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05;
-                sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06;
-                sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07;
-                sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08;
-                sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09;
-                sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10;
-                sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = bucket2_cy[11]; bucket2_cy[11] = tmp11;
-                sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12;
-                sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13;
-                sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14;
-                sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15;
-            }
-        }
-    }
-}
-
-static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index)
-{
+                sa_uint_t tmp00 = bucket2_yc[0 * 256];
+                bucket2_yc[0 * 256] = bucket2_cy[0];
+                bucket2_cy[0] = tmp00;
+                sa_uint_t tmp01 = bucket2_yc[1 * 256];
+                bucket2_yc[1 * 256] = bucket2_cy[1];
+                bucket2_cy[1] = tmp01;
+                sa_uint_t tmp02 = bucket2_yc[2 * 256];
+                bucket2_yc[2 * 256] = bucket2_cy[2];
+                bucket2_cy[2] = tmp02;
+                sa_uint_t tmp03 = bucket2_yc[3 * 256];
+                bucket2_yc[3 * 256] = bucket2_cy[3];
+                bucket2_cy[3] = tmp03;
+                sa_uint_t tmp04 = bucket2_yc[4 * 256];
+                bucket2_yc[4 * 256] = bucket2_cy[4];
+                bucket2_cy[4] = tmp04;
+                sa_uint_t tmp05 = bucket2_yc[5 * 256];
+                bucket2_yc[5 * 256] = bucket2_cy[5];
+                bucket2_cy[5] = tmp05;
+                sa_uint_t tmp06 = bucket2_yc[6 * 256];
+                bucket2_yc[6 * 256] = bucket2_cy[6];
+                bucket2_cy[6] = tmp06;
+                sa_uint_t tmp07 = bucket2_yc[7 * 256];
+                bucket2_yc[7 * 256] = bucket2_cy[7];
+                bucket2_cy[7] = tmp07;
+                sa_uint_t tmp08 = bucket2_yc[8 * 256];
+                bucket2_yc[8 * 256] = bucket2_cy[8];
+                bucket2_cy[8] = tmp08;
+                sa_uint_t tmp09 = bucket2_yc[9 * 256];
+                bucket2_yc[9 * 256] = bucket2_cy[9];
+                bucket2_cy[9] = tmp09;
+                sa_uint_t tmp10 = bucket2_yc[10 * 256];
+                bucket2_yc[10 * 256] = bucket2_cy[10];
+                bucket2_cy[10] = tmp10;
+                sa_uint_t tmp11 = bucket2_yc[11 * 256];
+                bucket2_yc[11 * 256] = bucket2_cy[11];
+                bucket2_cy[11] = tmp11;
+                sa_uint_t tmp12 = bucket2_yc[12 * 256];
+                bucket2_yc[12 * 256] = bucket2_cy[12];
+                bucket2_cy[12] = tmp12;
+                sa_uint_t tmp13 = bucket2_yc[13 * 256];
+                bucket2_yc[13 * 256] = bucket2_cy[13];
+                bucket2_cy[13] = tmp13;
+                sa_uint_t tmp14 = bucket2_yc[14 * 256];
+                bucket2_yc[14 * 256] = bucket2_cy[14];
+                bucket2_cy[14] = tmp14;
+                sa_uint_t tmp15 = bucket2_yc[15 * 256];
+                bucket2_yc[15 * 256] = bucket2_cy[15];
+                bucket2_cy[15] = tmp15;
+            }
+        }
+    }
+}
+
+static void libsais_unbwt_compute_bigram_histogram_single(
+    const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1,
+    sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
     fast_uint_t sum, c;
-    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
-    {
-        fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev;
-        if (prev != sum)
-        {
+    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
+        fast_uint_t prev = sum;
+        sum += bucket1[c];
+        bucket1[c] = (sa_uint_t)prev;
+        if (prev != sum) {
             sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8];
 
             {
-                fast_uint_t hi = index; if (sum < hi) { hi = sum; }
-                libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
+                fast_uint_t hi = index;
+                if (sum < hi) {
+                    hi = sum;
+                }
+                libsais_unbwt_compute_histogram(
+                    &T[prev], (fast_sint_t)(hi - prev), bucket2_p);
             }
 
             {
-                fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; }
-                libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
+                fast_uint_t lo = index + 1;
+                if (prev > lo) {
+                    lo = prev;
+                }
+                libsais_unbwt_compute_histogram(
+                    &T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
             }
         }
     }
@@ -7031,73 +10150,96 @@ static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRI
     libsais_unbwt_transpose_bucket2(bucket2);
 }
 
-static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift)
-{
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2,
+                                             uint16_t * RESTRICT fastbits,
+                                             fast_uint_t lastc,
+                                             fast_uint_t shift) {
     fast_uint_t v, w, sum, c, d;
-    for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
-    {
-        if (c == lastc) { sum += 1; }
-
-        for (d = 0; d < ALPHABET_SIZE; ++d, ++w)
-        {
-            fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev;
-            if (prev != sum)
-            {
-                for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; }
+    for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
+        if (c == lastc) {
+            sum += 1;
+        }
+
+        for (d = 0; d < ALPHABET_SIZE; ++d, ++w) {
+            fast_uint_t prev = sum;
+            sum += bucket2[w];
+            bucket2[w] = (sa_uint_t)prev;
+            if (prev != sum) {
+                for (; v <= ((sum - 1) >> shift); ++v) {
+                    fastbits[v] = (uint16_t)w;
+                }
             }
         }
     }
 }
 
-static void libsais_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
-{
+static void libsais_unbwt_calculate_biPSI(
+    const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P,
+    sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2,
+    fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
     {
-        fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
-        for (; i < j; ++i)
-        {
+        fast_sint_t i = omp_block_start, j = (fast_sint_t)index;
+        if (omp_block_end < j) {
+            j = omp_block_end;
+        }
+        for (; i < j; ++i) {
             fast_uint_t c = T[i];
             fast_uint_t p = bucket1[c]++;
             fast_sint_t t = (fast_sint_t)(index - p);
 
-            if (t != 0)
-            {
-                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+            if (t != 0) {
+                fast_uint_t w =
+                    (((fast_uint_t)
+                          T[p + (fast_uint_t)(t >>
+                                              ((sizeof(fast_sint_t) * 8) - 1))])
+                     << 8) +
+                    c;
                 P[bucket2[w]++] = (sa_uint_t)i;
             }
         }
     }
 
     {
-        fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
-        for (i += 1; i <= j; ++i)
-        {
+        fast_sint_t i = (fast_sint_t)index, j = omp_block_end;
+        if (omp_block_start > i) {
+            i = omp_block_start;
+        }
+        for (i += 1; i <= j; ++i) {
             fast_uint_t c = T[i - 1];
             fast_uint_t p = bucket1[c]++;
             fast_sint_t t = (fast_sint_t)(index - p);
 
-            if (t != 0)
-            {
-                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+            if (t != 0) {
+                fast_uint_t w =
+                    (((fast_uint_t)
+                          T[p + (fast_uint_t)(t >>
+                                              ((sizeof(fast_sint_t) * 8) - 1))])
+                     << 8) +
+                    c;
                 P[bucket2[w]++] = (sa_uint_t)i;
             }
         }
     }
 }
 
-static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
-{
+static void libsais_unbwt_init_single(const uint8_t * RESTRICT T,
+                                      sa_uint_t * RESTRICT P, sa_sint_t n,
+                                      const sa_sint_t * freq,
+                                      const sa_uint_t * RESTRICT I,
+                                      sa_uint_t * RESTRICT bucket2,
+                                      uint16_t * RESTRICT fastbits) {
     sa_uint_t bucket1[ALPHABET_SIZE];
 
     fast_uint_t index = I[0];
     fast_uint_t lastc = T[0];
-    fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+    fast_uint_t shift = 0;
+    while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
+        shift++;
+    }
 
-    if (freq != NULL)
-    {
+    if (freq != NULL) {
         memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
-    }
-    else
-    {
+    } else {
         memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
         libsais_unbwt_compute_histogram(T, n, bucket1);
     }
@@ -7111,136 +10253,193 @@ static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RE
 
 #if defined(_OPENMP)
 
-static void libsais_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_unbwt_compute_bigram_histogram_parallel(
+    const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1,
+    sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start,
+    fast_sint_t omp_block_size) {
     fast_sint_t i;
-    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i)
-    {
+    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
         fast_uint_t c = T[i];
         fast_uint_t p = bucket1[c]++;
         fast_sint_t t = (fast_sint_t)(index - p);
 
-        if (t != 0)
-        {
-            fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
+        if (t != 0) {
+            fast_uint_t w =
+                (((fast_uint_t)
+                      T[p +
+                        (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
+                 << 8) +
+                c;
             bucket2[w]++;
         }
     }
 }
 
-static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static void libsais_unbwt_init_parallel(
+    const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+    const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+    sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits,
+    sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_uint_t bucket1[ALPHABET_SIZE];
 
     fast_uint_t index = I[0];
     fast_uint_t lastc = T[0];
-    fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
+    fast_uint_t shift = 0;
+    while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
+        shift++;
+    }
 
     memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
     memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
 
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
     {
-        fast_sint_t omp_thread_num  = omp_get_thread_num();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
         fast_sint_t omp_num_threads = omp_get_num_threads();
 
-        if (omp_num_threads == 1)
-        {
+        if (omp_num_threads == 1) {
             libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
-        }
-        else
-        {
-            sa_uint_t * RESTRICT bucket1_local  = buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-            sa_uint_t * RESTRICT bucket2_local  = bucket1_local + ALPHABET_SIZE;
+        } else {
+            sa_uint_t * RESTRICT bucket1_local =
+                buckets + omp_thread_num *
+                              (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+            sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;
 
-            fast_sint_t omp_block_stride        = (n / omp_num_threads) & (-16);
-            fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
-            fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+            fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+            fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+            fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                             ? omp_block_stride
+                                             : n - omp_block_start;
 
             {
                 memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
-                libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
+                libsais_unbwt_compute_histogram(T + omp_block_start,
+                                                omp_block_size, bucket1_local);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
                 {
                     sa_uint_t * RESTRICT bucket1_temp = buckets;
 
                     fast_sint_t t;
-                    for (t = 0; t < omp_num_threads; ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
-                    {
-                        fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_temp[c]; bucket1[c] = A + B; bucket1_temp[c] = A; }
+                    for (t = 0; t < omp_num_threads;
+                         ++t, bucket1_temp +=
+                              ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+                        fast_sint_t c;
+                        for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                            sa_uint_t A = bucket1[c], B = bucket1_temp[c];
+                            bucket1[c] = A + B;
+                            bucket1_temp[c] = A;
+                        }
                     }
                 }
 
                 {
                     fast_uint_t sum, c;
-                    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) { fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; }
+                    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
+                        fast_uint_t prev = sum;
+                        sum += bucket1[c];
+                        bucket1[c] = (sa_uint_t)prev;
+                    }
                 }
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_local[c]; bucket1_local[c] = A + B; }
+                fast_sint_t c;
+                for (c = 0; c < ALPHABET_SIZE; c += 1) {
+                    sa_uint_t A = bucket1[c], B = bucket1_local[c];
+                    bucket1_local[c] = A + B;
+                }
 
-                memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
-                libsais_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
+                memset(bucket2_local, 0,
+                       ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+                libsais_unbwt_compute_bigram_histogram_parallel(
+                    T, index, bucket1_local, bucket2_local, omp_block_start,
+                    omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t omp_bucket2_stride  = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
-                fast_sint_t omp_bucket2_start   = omp_thread_num * omp_bucket2_stride;
-                fast_sint_t omp_bucket2_size    = omp_thread_num < omp_num_threads - 1 ? omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;
+                fast_sint_t omp_bucket2_stride =
+                    ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
+                fast_sint_t omp_bucket2_start =
+                    omp_thread_num * omp_bucket2_stride;
+                fast_sint_t omp_bucket2_size =
+                    omp_thread_num < omp_num_threads - 1
+                        ? omp_bucket2_stride
+                        : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;
 
                 sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE;
 
                 fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
-                {
-                    fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
+                for (t = 0; t < omp_num_threads;
+                     ++t, bucket2_temp +=
+                          ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+                    fast_sint_t c;
+                    for (c = omp_bucket2_start;
+                         c < omp_bucket2_start + omp_bucket2_size; c += 1) {
+                        sa_uint_t A = bucket2[c], B = bucket2_temp[c];
+                        bucket2[c] = A + B;
+                        bucket2_temp[c] = A;
+                    }
                 }
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
-
-                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
+                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc,
+                                                 shift);
 
                 {
                     fast_sint_t t;
-                    for (t = omp_num_threads - 1; t >= 1; --t) 
-                    { 
-                        sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-                        sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-
-                        memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
+                    for (t = omp_num_threads - 1; t >= 1; --t) {
+                        sa_uint_t * RESTRICT dst_bucket1 =
+                            buckets + t * (ALPHABET_SIZE +
+                                           (ALPHABET_SIZE * ALPHABET_SIZE));
+                        sa_uint_t * RESTRICT src_bucket1 =
+                            dst_bucket1 -
+                            (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+
+                        memcpy(dst_bucket1, src_bucket1,
+                               ALPHABET_SIZE * sizeof(sa_uint_t));
                     }
 
                     memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
                 }
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
             {
-                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }
+                fast_sint_t c;
+                for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) {
+                    sa_uint_t A = bucket2[c], B = bucket2_local[c];
+                    bucket2_local[c] = A + B;
+                }
 
-                libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
+                libsais_unbwt_calculate_biPSI(
+                    T, P, bucket1_local, bucket2_local, index, omp_block_start,
+                    omp_block_start + omp_block_size);
             }
 
-            #pragma omp barrier
+    #pragma omp barrier
 
-            #pragma omp master
+    #pragma omp master
             {
-                memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+                memcpy(
+                    bucket2,
+                    buckets + ALPHABET_SIZE +
+                        (omp_num_threads - 1) *
+                            (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
+                    ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
             }
         }
     }
@@ -7248,56 +10447,114 @@ static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t *
 
 #endif
 
-static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
-{
+static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2,
+                                   uint16_t * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t * i0,
+                                   fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
 
     fast_uint_t i, p0 = *i0;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
     }
 
     *i0 = p0;
 }
 
-static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
-{
+static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2,
+                                   uint16_t * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r,
+                                   fast_uint_t * i0, fast_uint_t * i1,
+                                   fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
 
     fast_uint_t i, p0 = *i0, p1 = *i1;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
     }
 
-    *i0 = p0; *i1 = p1;
+    *i0 = p0;
+    *i1 = p1;
 }
 
-static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
-{
+static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2,
+                                   uint16_t * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r,
+                                   fast_uint_t * i0, fast_uint_t * i1,
+                                   fast_uint_t * i2, fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
 
     fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
     }
 
-    *i0 = p0; *i1 = p1; *i2 = p2;
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
 }
 
-static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
-{
+static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2,
+                                   uint16_t * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r,
+                                   fast_uint_t * i0, fast_uint_t * i1,
+                                   fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
@@ -7305,19 +10562,54 @@ static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
 
     fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
-        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
+        uint16_t c3 = fastbits[p3 >> shift];
+        if (bucket2[c3] <= p3) {
+            do {
+                c3++;
+            } while (bucket2[c3] <= p3);
+        }
+        p3 = P[p3];
+        U3[i] = libsais_bswap16(c3);
     }
 
-    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
-}
-
-static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
-{
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
+    *i3 = p3;
+}
+
+static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                   sa_uint_t * RESTRICT bucket2,
+                                   uint16_t * RESTRICT fastbits,
+                                   fast_uint_t shift, fast_uint_t r,
+                                   fast_uint_t * i0, fast_uint_t * i1,
+                                   fast_uint_t * i2, fast_uint_t * i3,
+                                   fast_uint_t * i4, fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
@@ -7326,20 +10618,61 @@ static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
 
     fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
-        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
-        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
+        uint16_t c3 = fastbits[p3 >> shift];
+        if (bucket2[c3] <= p3) {
+            do {
+                c3++;
+            } while (bucket2[c3] <= p3);
+        }
+        p3 = P[p3];
+        U3[i] = libsais_bswap16(c3);
+        uint16_t c4 = fastbits[p4 >> shift];
+        if (bucket2[c4] <= p4) {
+            do {
+                c4++;
+            } while (bucket2[c4] <= p4);
+        }
+        p4 = P[p4];
+        U4[i] = libsais_bswap16(c4);
     }
 
-    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
+    *i3 = p3;
+    *i4 = p4;
 }
 
-static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
-{
+static void libsais_unbwt_decode_6(
+    uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+    uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r,
+    fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+    fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
@@ -7349,21 +10682,70 @@ static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
 
     fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;
 
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
-        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
-        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
-        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
+        uint16_t c3 = fastbits[p3 >> shift];
+        if (bucket2[c3] <= p3) {
+            do {
+                c3++;
+            } while (bucket2[c3] <= p3);
+        }
+        p3 = P[p3];
+        U3[i] = libsais_bswap16(c3);
+        uint16_t c4 = fastbits[p4 >> shift];
+        if (bucket2[c4] <= p4) {
+            do {
+                c4++;
+            } while (bucket2[c4] <= p4);
+        }
+        p4 = P[p4];
+        U4[i] = libsais_bswap16(c4);
+        uint16_t c5 = fastbits[p5 >> shift];
+        if (bucket2[c5] <= p5) {
+            do {
+                c5++;
+            } while (bucket2[c5] <= p5);
+        }
+        p5 = P[p5];
+        U5[i] = libsais_bswap16(c5);
     }
 
-    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
-}
-
-static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
-{
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
+    *i3 = p3;
+    *i4 = p4;
+    *i5 = p5;
+}
+
+static void libsais_unbwt_decode_7(
+    uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+    uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r,
+    fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+    fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
@@ -7372,24 +10754,83 @@ static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
     uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
     uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);
 
-    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
-
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
-        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
-        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
-        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
-        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6);
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
+                   p6 = *i6;
+
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
+        uint16_t c3 = fastbits[p3 >> shift];
+        if (bucket2[c3] <= p3) {
+            do {
+                c3++;
+            } while (bucket2[c3] <= p3);
+        }
+        p3 = P[p3];
+        U3[i] = libsais_bswap16(c3);
+        uint16_t c4 = fastbits[p4 >> shift];
+        if (bucket2[c4] <= p4) {
+            do {
+                c4++;
+            } while (bucket2[c4] <= p4);
+        }
+        p4 = P[p4];
+        U4[i] = libsais_bswap16(c4);
+        uint16_t c5 = fastbits[p5 >> shift];
+        if (bucket2[c5] <= p5) {
+            do {
+                c5++;
+            } while (bucket2[c5] <= p5);
+        }
+        p5 = P[p5];
+        U5[i] = libsais_bswap16(c5);
+        uint16_t c6 = fastbits[p6 >> shift];
+        if (bucket2[c6] <= p6) {
+            do {
+                c6++;
+            } while (bucket2[c6] <= p6);
+        }
+        p6 = P[p6];
+        U6[i] = libsais_bswap16(c6);
     }
 
-    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
-}
-
-static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
-{
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
+    *i3 = p3;
+    *i4 = p4;
+    *i5 = p5;
+    *i6 = p6;
+}
+
+static void libsais_unbwt_decode_8(
+    uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+    uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r,
+    fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+    fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7,
+    fast_uint_t k) {
     uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
     uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
     uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
@@ -7399,124 +10840,233 @@ static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
     uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);
     uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r);
 
-    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
-
-    for (i = 0; i != k; ++i)
-    {
-        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0);
-        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1);
-        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2);
-        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3);
-        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4);
-        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5);
-        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6);
-        uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais_bswap16(c7);
-    }
-
-    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
-}
-
-static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t reminder)
-{
-    fast_uint_t shift       = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
-    fast_uint_t offset      = 0;
-
-    while (blocks > 8)
-    {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
-        I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
+    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
+                   p6 = *i6, p7 = *i7;
+
+    for (i = 0; i != k; ++i) {
+        uint16_t c0 = fastbits[p0 >> shift];
+        if (bucket2[c0] <= p0) {
+            do {
+                c0++;
+            } while (bucket2[c0] <= p0);
+        }
+        p0 = P[p0];
+        U0[i] = libsais_bswap16(c0);
+        uint16_t c1 = fastbits[p1 >> shift];
+        if (bucket2[c1] <= p1) {
+            do {
+                c1++;
+            } while (bucket2[c1] <= p1);
+        }
+        p1 = P[p1];
+        U1[i] = libsais_bswap16(c1);
+        uint16_t c2 = fastbits[p2 >> shift];
+        if (bucket2[c2] <= p2) {
+            do {
+                c2++;
+            } while (bucket2[c2] <= p2);
+        }
+        p2 = P[p2];
+        U2[i] = libsais_bswap16(c2);
+        uint16_t c3 = fastbits[p3 >> shift];
+        if (bucket2[c3] <= p3) {
+            do {
+                c3++;
+            } while (bucket2[c3] <= p3);
+        }
+        p3 = P[p3];
+        U3[i] = libsais_bswap16(c3);
+        uint16_t c4 = fastbits[p4 >> shift];
+        if (bucket2[c4] <= p4) {
+            do {
+                c4++;
+            } while (bucket2[c4] <= p4);
+        }
+        p4 = P[p4];
+        U4[i] = libsais_bswap16(c4);
+        uint16_t c5 = fastbits[p5 >> shift];
+        if (bucket2[c5] <= p5) {
+            do {
+                c5++;
+            } while (bucket2[c5] <= p5);
+        }
+        p5 = P[p5];
+        U5[i] = libsais_bswap16(c5);
+        uint16_t c6 = fastbits[p6 >> shift];
+        if (bucket2[c6] <= p6) {
+            do {
+                c6++;
+            } while (bucket2[c6] <= p6);
+        }
+        p6 = P[p6];
+        U6[i] = libsais_bswap16(c6);
+        uint16_t c7 = fastbits[p7 >> shift];
+        if (bucket2[c7] <= p7) {
+            do {
+                c7++;
+            } while (bucket2[c7] <= p7);
+        }
+        p7 = P[p7];
+        U7[i] = libsais_bswap16(c7);
     }
 
-    if (blocks == 1)
-    {
+    *i0 = p0;
+    *i1 = p1;
+    *i2 = p2;
+    *i3 = p3;
+    *i4 = p4;
+    *i5 = p5;
+    *i6 = p6;
+    *i7 = p7;
+}
+
+static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+                                 sa_sint_t n, sa_sint_t r,
+                                 const sa_uint_t * RESTRICT I,
+                                 sa_uint_t * RESTRICT bucket2,
+                                 uint16_t * RESTRICT fastbits,
+                                 fast_sint_t blocks, fast_uint_t reminder) {
+    fast_uint_t shift = 0;
+    while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
+        shift++;
+    }
+    fast_uint_t offset = 0;
+
+    while (blocks > 8) {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
+                    i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, &i7, (fast_uint_t)r >> 1);
+        I += 8;
+        blocks -= 8;
+        offset += 8 * (fast_uint_t)r;
+    }
+
+    if (blocks == 1) {
         fast_uint_t i0 = I[0];
-        libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
-    }
-    else if (blocks == 2)
-    {
+        libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0,
+                               reminder >> 1);
+    } else if (blocks == 2) {
         fast_uint_t i0 = I[0], i1 = I[1];
-        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1);
-        libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else if (blocks == 3)
-    {
+        libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, reminder >> 1);
+        libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, &i0,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else if (blocks == 3) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
-        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
-        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else if (blocks == 4)
-    {
+        libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
+        libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else if (blocks == 4) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
-        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, reminder >> 1);
-        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else if (blocks == 5)
-    {
+        libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3,
+                               reminder >> 1);
+        libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else if (blocks == 5) {
         fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
-        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, reminder >> 1);
-        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else if (blocks == 6)
-    {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
-        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, reminder >> 1);
-        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else if (blocks == 7)
-    {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
-        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, reminder >> 1);
-        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-    else
-    {
-        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
-        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1);
-        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1));
-    }
-}
-
-static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
-{
-    fast_uint_t lastc       = T[0];
-    fast_sint_t blocks      = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
-    fast_uint_t reminder    = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
+        libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
+                               reminder >> 1);
+        libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+                               &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else if (blocks == 6) {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
+                    i5 = I[5];
+        libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               reminder >> 1);
+        libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+                               &i3, &i4,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else if (blocks == 7) {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
+                    i5 = I[5], i6 = I[6];
+        libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, reminder >> 1);
+        libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+                               &i3, &i4, &i5,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    } else {
+        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
+                    i5 = I[5], i6 = I[6], i7 = I[7];
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
+                               (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
+                               &i6, &i7, reminder >> 1);
+        libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2,
+                               fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+                               &i3, &i4, &i5, &i6,
+                               ((fast_uint_t)r >> 1) - (reminder >> 1));
+    }
+}
+
+static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T,
+                                     uint8_t * RESTRICT U,
+                                     sa_uint_t * RESTRICT P, sa_sint_t n,
+                                     sa_sint_t r, const sa_uint_t * RESTRICT I,
+                                     sa_uint_t * RESTRICT bucket2,
+                                     uint16_t * RESTRICT fastbits,
+                                     sa_sint_t threads) {
+    fast_uint_t lastc = T[0];
+    fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
+    fast_uint_t reminder =
+        (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
 
 #if defined(_OPENMP)
     fast_sint_t max_threads = blocks < threads ? blocks : threads;
-    #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && \
+                                                      n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num      = omp_get_thread_num();
-        fast_sint_t omp_num_threads     = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_thread_num      = 0;
-        fast_sint_t omp_num_threads     = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
 
-        fast_sint_t omp_block_stride    = blocks / omp_num_threads;
-        fast_sint_t omp_block_reminder  = blocks % omp_num_threads;
-        fast_sint_t omp_block_size      = omp_block_stride + (omp_thread_num < omp_block_reminder);
-        fast_sint_t omp_block_start     = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
+        fast_sint_t omp_block_stride = blocks / omp_num_threads;
+        fast_sint_t omp_block_reminder = blocks % omp_num_threads;
+        fast_sint_t omp_block_size =
+            omp_block_stride + (omp_thread_num < omp_block_reminder);
+        fast_sint_t omp_block_start =
+            omp_block_stride * omp_thread_num +
+            (omp_thread_num < omp_block_reminder ? omp_thread_num
+                                                 : omp_block_reminder);
 
-        libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
+        libsais_unbwt_decode(
+            U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
+            fastbits, omp_block_size,
+            omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
     }
 
     U[n - 1] = (uint8_t)lastc;
 }
 
-static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
-{
+static sa_sint_t libsais_unbwt_core(
+    const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P,
+    sa_sint_t n, const sa_sint_t * freq, sa_sint_t r,
+    const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
+    uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
+    sa_sint_t threads) {
 #if defined(_OPENMP)
-    if (threads > 1 && n >= 262144)
-    {
-        libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
-    }
-    else
+    if (threads > 1 && n >= 262144) {
+        libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits,
+                                    buckets, threads);
+    } else
 #else
     UNUSED(buckets);
 #endif
@@ -7528,17 +11078,33 @@ static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRI
     return 0;
 }
 
-static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
-{
-    fast_uint_t shift = 0; while ((n >> shift) > (1 << UNBWT_FASTBITS)) { shift++; }
-
-    sa_uint_t *     RESTRICT bucket2        = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
-    uint16_t *      RESTRICT fastbits       = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
-    sa_uint_t *     RESTRICT buckets        = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL;
-
-    sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
-        ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
-        : -2;
+static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U,
+                                    sa_uint_t * P, sa_sint_t n,
+                                    const sa_sint_t * freq, sa_sint_t r,
+                                    const sa_uint_t * I, sa_sint_t threads) {
+    fast_uint_t shift = 0;
+    while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
+        shift++;
+    }
+
+    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
+        ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(
+        ((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
+    sa_uint_t * RESTRICT buckets =
+        threads > 1 && n >= 262144
+            ? (sa_uint_t *)libsais_alloc_aligned(
+                  (size_t)threads *
+                      (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+                      sizeof(sa_uint_t),
+                  4096)
+            : NULL;
+
+    sa_sint_t index = bucket2 != NULL && fastbits != NULL &&
+                              (buckets != NULL || threads == 1 || n < 262144)
+                          ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2,
+                                               fastbits, buckets, threads)
+                          : -2;
 
     libsais_free_aligned(buckets);
     libsais_free_aligned(fastbits);
@@ -7547,208 +11113,270 @@ static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t *
     return index;
 }
 
-static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I)
-{
-    return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1)
-        ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads)
-        : -2;
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx,
+                                        const uint8_t * T, uint8_t * U,
+                                        sa_uint_t * P, sa_sint_t n,
+                                        const sa_sint_t * freq, sa_sint_t r,
+                                        const sa_uint_t * I) {
+    return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL &&
+                   (ctx->buckets != NULL || ctx->threads == 1)
+               ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2,
+                                    ctx->fastbits, ctx->buckets,
+                                    (sa_sint_t)ctx->threads)
+               : -2;
 }
 
-void * libsais_unbwt_create_ctx(void)
-{
+void * libsais_unbwt_create_ctx(void) {
     return (void *)libsais_unbwt_create_ctx_main(1);
 }
 
-void libsais_unbwt_free_ctx(void * ctx)
-{
+void libsais_unbwt_free_ctx(void * ctx) {
     libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
 }
 
-int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
-{
+int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n,
+                      const int32_t * freq, int32_t i) {
     return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
 }
 
-int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i)
-{
+int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                          int32_t * A, int32_t n, const int32_t * freq,
+                          int32_t i) {
     return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
 }
 
-int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
-    {
+int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A,
+                          int32_t n, const int32_t * freq, int32_t r,
+                          const int32_t * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
+        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (I[0] != n) { return -1; }
-        if (n == 1) { U[0] = T[0]; }
+    } else if (n <= 1) {
+        if (I[0] != n) {
+            return -1;
+        }
+        if (n == 1) {
+            U[0] = T[0];
+        }
         return 0;
     }
 
-    fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+    fast_sint_t t;
+    for (t = 0; t <= (n - 1) / r; ++t) {
+        if (I[t] <= 0 || I[t] > n) {
+            return -1;
+        }
+    }
 
-    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
+    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
+                              (const sa_uint_t *)I, 1);
 }
 
-int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL))
-    {
+int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U,
+                              int32_t * A, int32_t n, const int32_t * freq,
+                              int32_t r, const int32_t * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
+        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (I[0] != n) { return -1; }
-        if (n == 1) { U[0] = T[0]; }
+    } else if (n <= 1) {
+        if (I[0] != n) {
+            return -1;
+        }
+        if (n == 1) {
+            U[0] = T[0];
+        }
         return 0;
     }
 
-    fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+    fast_sint_t t;
+    for (t = 0; t <= (n - 1) / r; ++t) {
+        if (I[t] <= 0 || I[t] > n) {
+            return -1;
+        }
+    }
 
-    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I);
+    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U,
+                                  (sa_uint_t *)A, n, freq, r,
+                                  (const sa_uint_t *)I);
 }
 
 #if defined(_OPENMP)
 
-void * libsais_unbwt_create_ctx_omp(int32_t threads)
-{
-    if (threads < 0) { return NULL; }
+void * libsais_unbwt_create_ctx_omp(int32_t threads) {
+    if (threads < 0) {
+        return NULL;
+    }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
     return (void *)libsais_unbwt_create_ctx_main(threads);
 }
 
-int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads)
-{
+int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                          int32_t n, const int32_t * freq, int32_t i,
+                          int32_t threads) {
     return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
 }
 
-int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads)
-{
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0))
-    {
+int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A,
+                              int32_t n, const int32_t * freq, int32_t r,
+                              const int32_t * I, int32_t threads) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
+        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) ||
+        (threads < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (I[0] != n) { return -1; }
-        if (n == 1) { U[0] = T[0]; }
+    } else if (n <= 1) {
+        if (I[0] != n) {
+            return -1;
+        }
+        if (n == 1) {
+            U[0] = T[0];
+        }
         return 0;
     }
 
-    fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }
+    fast_sint_t t;
+    for (t = 0; t <= (n - 1) / r; ++t) {
+        if (I[t] <= 0 || I[t] > n) {
+            return -1;
+        }
+    }
 
     threads = threads > 0 ? threads : omp_get_max_threads();
-    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
+    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
+                              (const sa_uint_t *)I, threads);
 }
 
 #endif
 
-static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_compute_phi(const sa_sint_t * RESTRICT SA,
+                                sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                fast_sint_t omp_block_start,
+                                fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    fast_sint_t i, j;
+    sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
 
-        PLCP[SA[i + 0]] = k; k = SA[i + 0];
-        PLCP[SA[i + 1]] = k; k = SA[i + 1];
+        PLCP[SA[i + 0]] = k;
+        k = SA[i + 0];
+        PLCP[SA[i + 1]] = k;
+        k = SA[i + 1];
 
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
         libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);
 
-        PLCP[SA[i + 2]] = k; k = SA[i + 2];
-        PLCP[SA[i + 3]] = k; k = SA[i + 3];
+        PLCP[SA[i + 2]] = k;
+        k = SA[i + 2];
+        PLCP[SA[i + 3]] = k;
+        k = SA[i + 3];
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
-        PLCP[SA[i]] = k; k = SA[i];
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        PLCP[SA[i]] = k;
+        k = SA[i];
     }
 }
 
-static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
-{
+static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA,
+                                    sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                    sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
         libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_compute_plcp(const uint8_t * RESTRICT T,
+                                 sa_sint_t * RESTRICT PLCP, fast_sint_t n,
+                                 fast_sint_t omp_block_start,
+                                 fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j, l = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance;
+         i < j; i += 1) {
         libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]);
 
         fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
-        while (l < m && T[i + l] == T[k + l]) { l++; }
+        while (l < m && T[i + l] == T[k + l]) {
+            l++;
+        }
 
-        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
+        PLCP[i] = (sa_sint_t)l;
+        l -= (l != 0);
     }
 
-    for (j += prefetch_distance; i < j; i += 1)
-    {
+    for (j += prefetch_distance; i < j; i += 1) {
         fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
-        while (l < m && T[i + l] == T[k + l]) { l++; }
+        while (l < m && T[i + l] == T[k + l]) {
+            l++;
+        }
 
-        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
+        PLCP[i] = (sa_sint_t)l;
+        l -= (l != 0);
     }
 }
 
-static void libsais_compute_plcp_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
-{
+static void libsais_compute_plcp_omp(const uint8_t * RESTRICT T,
+                                     sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                     sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
         libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
     }
 }
 
-static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
-{
+static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP,
+                                const sa_sint_t * RESTRICT SA,
+                                sa_sint_t * RESTRICT LCP,
+                                fast_sint_t omp_block_start,
+                                fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
-    {
+    for (i = omp_block_start,
+        j = omp_block_start + omp_block_size - prefetch_distance - 3;
+         i < j; i += 4) {
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
         libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
 
@@ -7762,44 +11390,46 @@ static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t
         LCP[i + 3] = PLCP[SA[i + 3]];
     }
 
-    for (j += prefetch_distance + 3; i < j; i += 1)
-    {
+    for (j += prefetch_distance + 3; i < j; i += 1) {
         LCP[i] = PLCP[SA[i]];
     }
 }
 
-static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads)
-{
+static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP,
+                                    const sa_sint_t * RESTRICT SA,
+                                    sa_sint_t * RESTRICT LCP, sa_sint_t n,
+                                    sa_sint_t threads) {
 #if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
+    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
 #endif
     {
 #if defined(_OPENMP)
-        fast_sint_t omp_thread_num    = omp_get_thread_num();
-        fast_sint_t omp_num_threads   = omp_get_num_threads();
+        fast_sint_t omp_thread_num = omp_get_thread_num();
+        fast_sint_t omp_num_threads = omp_get_num_threads();
 #else
         UNUSED(threads);
 
-        fast_sint_t omp_thread_num    = 0;
-        fast_sint_t omp_num_threads   = 1;
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 #endif
-        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : n - omp_block_start;
 
         libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
     }
 }
 
-int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
-{
-    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0))
-    {
+int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP,
+                     int32_t n) {
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (n == 1) { PLCP[0] = 0; }
+    } else if (n <= 1) {
+        if (n == 1) {
+            PLCP[0] = 0;
+        }
         return 0;
     }
 
@@ -7809,15 +11439,14 @@ int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int3
     return 0;
 }
 
-int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n)
-{
-    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0))
-    {
+int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP,
+                    int32_t n) {
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (n == 1) { LCP[0] = PLCP[SA[0]]; }
+    } else if (n <= 1) {
+        if (n == 1) {
+            LCP[0] = PLCP[SA[0]];
+        }
         return 0;
     }
 
@@ -7828,18 +11457,18 @@ int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int
 
 #if defined(_OPENMP)
 
-int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads)
-{
-    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0))
-    {
+int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP,
+                         int32_t n, int32_t threads) {
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) ||
+        (threads < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (n == 1) { PLCP[0] = 0; }
+    } else if (n <= 1) {
+        if (n == 1) {
+            PLCP[0] = 0;
+        }
         return 0;
     }
-    
+
     threads = threads > 0 ? threads : omp_get_max_threads();
 
     libsais_compute_phi_omp(SA, PLCP, n, threads);
@@ -7848,15 +11477,15 @@ int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP,
     return 0;
 }
 
-int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads)
-{
-    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0))
-    {
+int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP,
+                        int32_t n, int32_t threads) {
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) ||
+        (threads < 0)) {
         return -1;
-    }
-    else if (n <= 1)
-    {
-        if (n == 1) { LCP[0] = PLCP[SA[0]]; }
+    } else if (n <= 1) {
+        if (n == 1) {
+            LCP[0] = PLCP[SA[0]];
+        }
         return 0;
     }
 
diff --git a/src/main.c b/src/main.c
index 28656cb..d2f8011 100644
--- a/src/main.c
+++ b/src/main.c
@@ -23,17 +23,20 @@
 #include <string.h>
 #include <unistd.h>
 
+#include "cm.h"
+#include "crc32.h"
 #include "libsais.h"
-#include "rle.h"
 #include "mtf.h"
+#include "rle.h"
 #include "srt.h"
-#include "crc32.h"
-#include "cm.h"
 
-#define KiB(x) ((x) * 1024)
-#define MiB(x) ((x) * 1024 * 1024)
+#define KiB(x) ((x)*1024)
+#define MiB(x) ((x)*1024 * 1024)
 
-void encode_block(int output_des, int32_t bytes_read, uint8_t * buffer, uint8_t * output, int32_t * sais_array, struct srt_state * srt_state, state * cm_state, uint32_t block_size, struct mtf_state * mtf_state) {
+void encode_block(int output_des, int32_t bytes_read, uint8_t * buffer,
+                  uint8_t * output, int32_t * sais_array,
+                  struct srt_state * srt_state, state * cm_state,
+                  uint32_t block_size, struct mtf_state * mtf_state) {
     uint32_t crc32 = crc32sum(1, buffer, bytes_read);
 
     int32_t new_size = mrlec(buffer, bytes_read, output);
@@ -41,7 +44,7 @@ void encode_block(int output_des, int32_t bytes_read, uint8_t * buffer, uint8_t
         libsais_bwt(output, output, sais_array, new_size, 16, NULL);
     int32_t new_size2;
 
-    if(new_size > MiB(3)) {
+    if (new_size > MiB(3)) {
         new_size2 = srt_encode(srt_state, output, buffer, new_size);
     } else {
         new_size2 = -1;
@@ -51,9 +54,10 @@ void encode_block(int output_des, int32_t bytes_read, uint8_t * buffer, uint8_t
     begin(cm_state);
     cm_state->out_queue = output;
     cm_state->output_ptr = 0;
-    if(new_size2 != -1)
-        for (int32_t i = 0; i < new_size2; i++) encode_byte(cm_state, buffer[i]);
-    else 
+    if (new_size2 != -1)
+        for (int32_t i = 0; i < new_size2; i++)
+            encode_byte(cm_state, buffer[i]);
+    else
         for (int32_t i = 0; i < new_size; i++) encode_byte(cm_state, buffer[i]);
     flush(cm_state);
     int32_t new_size3 = cm_state->output_ptr;
@@ -67,9 +71,12 @@ void encode_block(int output_des, int32_t bytes_read, uint8_t * buffer, uint8_t
     write(output_des, output, new_size3);
 }
 
-int decode_block(int input_des, int output_des, uint8_t * buffer, uint8_t * output, int32_t * sais_array, struct srt_state * srt_state, state * cm_state, struct mtf_state * mtf_state) {
-    #define safe_read(fd, buf, size) \
-        if (read(fd, buf, size) != size) return 1;
+int decode_block(int input_des, int output_des, uint8_t * buffer,
+                 uint8_t * output, int32_t * sais_array,
+                 struct srt_state * srt_state, state * cm_state,
+                 struct mtf_state * mtf_state) {
+#define safe_read(fd, buf, size) \
+    if (read(fd, buf, size) != size) return 1;
 
     uint32_t crc32;
     int32_t bytes_read, bwt_index, new_size, new_size2, new_size3;
@@ -87,15 +94,16 @@ int decode_block(int input_des, int output_des, uint8_t * buffer, uint8_t * outp
     cm_state->input_ptr = 0;
     cm_state->input_max = new_size3;
     init(cm_state);
-    if(new_size2 != -1) {
-        for (int32_t i = 0; i < new_size2; i++) output[i] = decode_byte(cm_state);
+    if (new_size2 != -1) {
+        for (int32_t i = 0; i < new_size2; i++)
+            output[i] = decode_byte(cm_state);
         srt_decode(srt_state, output, buffer, new_size2);
     } else {
-        for (int32_t i = 0; i < new_size; i++) output[i] = decode_byte(cm_state);
+        for (int32_t i = 0; i < new_size; i++)
+            output[i] = decode_byte(cm_state);
         mtf_decode(mtf_state, output, buffer, new_size);
     }
-    libsais_unbwt(buffer, output, sais_array, new_size, NULL,
-                    bwt_index);
+    libsais_unbwt(buffer, output, sais_array, new_size, NULL, bwt_index);
     mrled(output, buffer, bytes_read);
     if (crc32sum(1, buffer, bytes_read) != crc32) {
         fprintf(stderr, "CRC32 checksum mismatch.\n");
@@ -105,10 +113,10 @@ int decode_block(int input_des, int output_des, uint8_t * buffer, uint8_t * outp
     return 0;
 }
 
-int main(int argc, char *argv[]) {
+int main(int argc, char * argv[]) {
     int mode = 0;  // -1: encode, 0: unspecified, 1: encode
-    char *input = NULL, *output = NULL;  // input and output file names
-    uint32_t block_size = 8 * 1024 * 1024; // the block size
+    char *input = NULL, *output = NULL;     // input and output file names
+    uint32_t block_size = 8 * 1024 * 1024;  // the block size
 
     for (int i = 1; i < argc; i++) {
         if (argv[i][0] == '-') {
@@ -160,12 +168,12 @@ int main(int argc, char *argv[]) {
         output_des = STDOUT_FILENO;
     }
 
-    if(block_size < KiB(65)) {
+    if (block_size < KiB(65)) {
         fprintf(stderr, "Block size must be at least 65 KiB.\n");
         return 1;
     }
 
-    if(block_size > MiB(2047)) {
+    if (block_size > MiB(2047)) {
         fprintf(stderr, "Block size must be at most 2047 MiB.\n");
         return 1;
     }
@@ -175,9 +183,9 @@ int main(int argc, char *argv[]) {
 
     if (mode == 1) {
         // Encode
-        uint8_t *buffer = malloc(block_size + block_size / 3);
-        uint8_t *output = malloc(block_size + block_size / 3);
-        int32_t *sais_array = malloc(block_size * sizeof(int32_t) + 16);
+        uint8_t * buffer = malloc(block_size + block_size / 3);
+        uint8_t * output = malloc(block_size + block_size / 3);
+        int32_t * sais_array = malloc(block_size * sizeof(int32_t) + 16);
         int32_t bytes_read;
 
         state s;
@@ -186,7 +194,8 @@ int main(int argc, char *argv[]) {
         write(output_des, &block_size, sizeof(uint32_t));
 
         while ((bytes_read = read(input_des, buffer, block_size)) > 0) {
-            encode_block(output_des, bytes_read, buffer, output, sais_array, &srt_state, &s, block_size, &mtf_state);
+            encode_block(output_des, bytes_read, buffer, output, sais_array,
+                         &srt_state, &s, block_size, &mtf_state);
         }
 
         free(buffer);
@@ -201,13 +210,15 @@ int main(int argc, char *argv[]) {
             return 1;
         }
         read(input_des, &block_size, sizeof(uint32_t));
-        uint8_t *buffer = malloc(block_size + block_size / 2);
-        uint8_t *output = malloc(block_size + block_size / 2);
-        int32_t *sais_array = malloc(block_size * sizeof(int32_t) + 16);
+        uint8_t * buffer = malloc(block_size + block_size / 2);
+        uint8_t * output = malloc(block_size + block_size / 2);
+        int32_t * sais_array = malloc(block_size * sizeof(int32_t) + 16);
 
         state s;
 
-        while(decode_block(input_des, output_des, buffer, output, sais_array, &srt_state, &s, &mtf_state) == 0);
+        while (decode_block(input_des, output_des, buffer, output, sais_array,
+                            &srt_state, &s, &mtf_state) == 0)
+            ;
 
         free(buffer);
         free(output);
diff --git a/src/mtf.c b/src/mtf.c
index 4fafc1e..d2a0d94 100644
--- a/src/mtf.c
+++ b/src/mtf.c
@@ -19,7 +19,8 @@
 
 #include "mtf.h"
 
-void mtf_encode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+void mtf_encode(struct mtf_state * mtf, uint8_t * src, uint8_t * dst,
+                uint32_t count) {
     for (uint32_t i = 0; i < 256; i++) {
         mtf->prev[i] = mtf->curr[i] = 0;
         mtf->symbols[i] = mtf->ranks[i] = i;
@@ -41,7 +42,8 @@ void mtf_encode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t cou
     }
 }
 
-void mtf_decode(struct mtf_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+void mtf_decode(struct mtf_state * mtf, uint8_t * src, uint8_t * dst,
+                uint32_t count) {
     for (uint32_t i = 0; i < 256; i++) {
         mtf->prev[i] = mtf->curr[i] = 0;
         mtf->ranks[i] = i;
diff --git a/src/rle.c b/src/rle.c
index 32b8717..45ad4e3 100644
--- a/src/rle.c
+++ b/src/rle.c
@@ -3,17 +3,16 @@
 
 /* Derived from Matt Mahoney's public domain RLE code. */
 
-#define _putc(__ch, __out) *__out++ = (__ch)
-#define _getc(in, in_) (in < in_ ? (*in++) : -1)
-#define _rewind(in, _in) in = _in
+#define buffer_write(__ch, __out) *__out++ = (__ch)
+#define buffer_read(in, in_) (in < in_ ? (*in++) : -1)
 
-int mrlec(unsigned char *in, int inlen, unsigned char *out) {
-    unsigned char *ip = in, *in_ = in + inlen, *op = out;
-    int i;
-    int c, pc = -1;
-    long t[256] = {0};
-    long run = 0;
-    while ((c = _getc(ip, in_)) != -1) {
+int32_t mrlec(uint8_t * in, int32_t inlen, uint8_t * out) {
+    uint8_t *ip = in, *in_ = in + inlen, *op = out;
+    int32_t i;
+    int32_t c, pc = -1;
+    int64_t t[256] = { 0 };
+    int64_t run = 0;
+    while ((c = buffer_read(ip, in_)) != -1) {
         if (c == pc)
             t[c] += (++run % 255) != 0;
         else
@@ -21,41 +20,41 @@ int mrlec(unsigned char *in, int inlen, unsigned char *out) {
         pc = c;
     }
     for (i = 0; i < 32; ++i) {
-        int j;
+        int32_t j;
         c = 0;
         for (j = 0; j < 8; ++j) c += (t[i * 8 + j] > 0) << j;
-        _putc(c, op);
+        buffer_write(c, op);
     }
-    _rewind(ip, in);
+    ip = in;
     c = pc = -1;
     run = 0;
     do {
-        c = _getc(ip, in_);
+        c = buffer_read(ip, in_);
         if (c == pc)
             ++run;
         else if (run > 0 && t[pc] > 0) {
-            _putc(pc, op);
-            for (; run > 255; run -= 255) _putc(255, op);
-            _putc(run - 1, op);
+            buffer_write(pc, op);
+            for (; run > 255; run -= 255) buffer_write(255, op);
+            buffer_write(run - 1, op);
             run = 1;
         } else
-            for (++run; run > 1; --run) _putc(pc, op);
+            for (++run; run > 1; --run) buffer_write(pc, op);
         pc = c;
     } while (c != -1);
 
     return op - out;
 }
 
-int mrled(unsigned char *in, unsigned char *out, int outlen) {
-    unsigned char *ip = in, *op = out;
-    int i;
+int32_t mrled(uint8_t * in, uint8_t * out, int32_t outlen) {
+    uint8_t *ip = in, *op = out;
+    int32_t i;
 
-    int c, pc = -1;
-    long t[256] = {0};
-    long run = 0;
+    int32_t c, pc = -1;
+    int64_t t[256] = { 0 };
+    int64_t run = 0;
 
     for (i = 0; i < 32; ++i) {
-        int j;
+        int32_t j;
         c = *ip++;
         for (j = 0; j < 8; ++j) t[i * 8 + j] = (c >> j) & 1;
     }
@@ -66,9 +65,9 @@ int mrled(unsigned char *in, unsigned char *out, int outlen) {
             for (run = 0; (pc = *ip++) == 255; run += 255)
                 ;
             run += pc + 1;
-            for (; run > 0; --run) _putc(c, op);
+            for (; run > 0; --run) buffer_write(c, op);
         } else
-            _putc(c, op);
+            buffer_write(c, op);
     }
     return ip - in;
 }
diff --git a/src/srt.c b/src/srt.c
index 1cf379f..5b34109 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -19,56 +19,54 @@
 
 #include "srt.h"
 
-static const int MAX_HDR_SIZE = 4 * 256;
+static const int32_t MAX_HDR_SIZE = 4 * 256;
 
-static int preprocess(const uint32_t * freqs, uint8_t * symbols) {
-    int nb_symbols = 0;
-    for(int i = 0; i < 256; i++)
-        if(freqs[i] > 0)
-            symbols[nb_symbols++] = i;
+static int32_t preprocess(const uint32_t * freqs, uint8_t * symbols) {
+    int32_t nb_symbols = 0;
+    for (int32_t i = 0; i < 256; i++)
+        if (freqs[i] > 0) symbols[nb_symbols++] = i;
     uint32_t h = 4;
-    while(h < nb_symbols)
-        h = h * 3 + 1;
-    while(1) {
+    while (h < nb_symbols) h = h * 3 + 1;
+    while (1) {
         h /= 3;
-        for(uint32_t i = h; i < nb_symbols; i++) {
-            const int t = symbols[i] & 0xFF;
+        for (uint32_t i = h; i < nb_symbols; i++) {
+            const int32_t t = symbols[i] & 0xFF;
             int32_t b = i - h;
-            while((b >= 0) && freqs[symbols[b]] < freqs[t]
-            || (freqs[t] == freqs[symbols[b]]) && t < symbols[b])
-                { symbols[b + h] = symbols[b]; b -= h; }
+            while ((b >= 0) && freqs[symbols[b]] < freqs[t] ||
+                   (freqs[t] == freqs[symbols[b]]) && t < symbols[b]) {
+                symbols[b + h] = symbols[b];
+                b -= h;
+            }
             symbols[b + h] = t;
         }
-        if(h == 1)
-            break;
+        if (h == 1) break;
     }
     return nb_symbols;
 }
 
-static int encode_header(uint32_t * freqs, uint8_t * dst) {
+static int32_t encode_header(uint32_t * freqs, uint8_t * dst) {
     uint32_t idx = 0;
-    for(int i = 0; i < 256; i++) {
+    for (int32_t i = 0; i < 256; i++) {
         uint32_t f = freqs[i];
-        while(f >= 128) {
-            dst[idx++] = (uint8_t) (f | 0x80);
+        while (f >= 128) {
+            dst[idx++] = (uint8_t)(f | 0x80);
             f >>= 7;
         }
-        dst[idx++] = (uint8_t) f;
+        dst[idx++] = (uint8_t)f;
     }
     return idx;
 }
 
-static int decode_header(uint8_t * src, uint32_t * freqs) {
+static int32_t decode_header(uint8_t * src, uint32_t * freqs) {
     uint32_t idx = 0;
-    for(int i = 0; i < 256; i++) {
-        int val = src[idx++] & 0xFF;
-        int res = val & 0x7F;
-        int shift = 7;
-        while(val >= 128) {
+    for (int32_t i = 0; i < 256; i++) {
+        int32_t val = src[idx++] & 0xFF;
+        int32_t res = val & 0x7F;
+        int32_t shift = 7;
+        while (val >= 128) {
             val = src[idx++] & 0xFF;
             res |= (val & 0x7F) << shift;
-            if(shift > 21)
-                break;
+            if (shift > 21) break;
             shift += 7;
         }
         freqs[i] = res;
@@ -76,47 +74,46 @@ static int decode_header(uint8_t * src, uint32_t * freqs) {
     return idx;
 }
 
-uint32_t srt_encode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+uint32_t srt_encode(struct srt_state * mtf, uint8_t * src, uint8_t * dst,
+                    uint32_t count) {
     // Find first symbols and build a histogram.
-    for(int i = 0; i < 256; i++)
-        mtf->freqs[i] = 0;
-    for(uint32_t i = 0, b = 0; i < count;) {
-        if(mtf->freqs[src[i]] == 0) {
+    for (int32_t i = 0; i < 256; i++) mtf->freqs[i] = 0;
+    for (uint32_t i = 0, b = 0; i < count;) {
+        if (mtf->freqs[src[i]] == 0) {
             mtf->r2s[b] = src[i];
             mtf->s2r[src[i]] = b;
             b++;
         }
         uint32_t j = i + 1;
-        while(j < count && src[j] == src[i])
-            j++;
+        while (j < count && src[j] == src[i]) j++;
         mtf->freqs[src[i]] += j - i;
         i = j;
     }
 
-    int n_symbols = preprocess(mtf->freqs, mtf->symbols);
-    for(uint32_t i = 0, bucket_pos = 0; i < n_symbols; i++) {
+    int32_t n_symbols = preprocess(mtf->freqs, mtf->symbols);
+    for (uint32_t i = 0, bucket_pos = 0; i < n_symbols; i++) {
         mtf->buckets[mtf->symbols[i]] = bucket_pos;
         bucket_pos += mtf->freqs[mtf->symbols[i]];
     }
 
     const uint32_t header_size = encode_header(mtf->freqs, dst);
-    const int dst_idx = header_size;
-    for(uint32_t i = 0; i < count; ) {
-        const int c = src[i] & 0xFF;
-        int r = mtf->s2r[c] & 0xFF;
+    const int32_t dst_idx = header_size;
+    for (uint32_t i = 0; i < count;) {
+        const int32_t c = src[i] & 0xFF;
+        int32_t r = mtf->s2r[c] & 0xFF;
         uint32_t p = mtf->buckets[c];
         dst[dst_idx + p++] = r;
-        if(r != 0) {
+        if (r != 0) {
             do {
                 mtf->r2s[r] = mtf->r2s[r - 1];
                 mtf->s2r[mtf->r2s[r]] = r;
                 r--;
-            } while(r != 0);
+            } while (r != 0);
             mtf->r2s[0] = c;
             mtf->s2r[c] = 0;
         }
         i++;
-        while(i < count && src[i] == c) {
+        while (i < count && src[i] == c) {
             dst[dst_idx + p++] = 0;
             i++;
         }
@@ -125,34 +122,32 @@ uint32_t srt_encode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t
     return count + header_size;
 }
 
-uint32_t srt_decode(struct srt_state * mtf, uint8_t *src, uint8_t *dst, uint32_t count) {
+uint32_t srt_decode(struct srt_state * mtf, uint8_t * src, uint8_t * dst,
+                    uint32_t count) {
     const uint32_t header_size = decode_header(src, mtf->freqs);
     const uint32_t src_idx = header_size;
-    int nb_symbols = preprocess(mtf->freqs, mtf->symbols);
-    for(uint32_t i = 0, bucket_pos = 0; i < nb_symbols; i++) {
-        const int c = mtf->symbols[i] & 0xFF;
+    int32_t nb_symbols = preprocess(mtf->freqs, mtf->symbols);
+    for (uint32_t i = 0, bucket_pos = 0; i < nb_symbols; i++) {
+        const int32_t c = mtf->symbols[i] & 0xFF;
         mtf->r2s[src[src_idx + bucket_pos] & 0xFF] = c;
         mtf->buckets[c] = bucket_pos + 1;
         bucket_pos += mtf->freqs[c];
         mtf->bucket_ends[c] = bucket_pos;
     }
     uint32_t c = mtf->r2s[0];
-    for(uint32_t i = 0; i < count; i++) {
+    for (uint32_t i = 0; i < count; i++) {
         dst[i] = c;
-        if(mtf->buckets[c] < mtf->bucket_ends[c]) {
-            const int r = src[src_idx + mtf->buckets[c]] & 0xFF;
+        if (mtf->buckets[c] < mtf->bucket_ends[c]) {
+            const int32_t r = src[src_idx + mtf->buckets[c]] & 0xFF;
             mtf->buckets[c]++;
-            if(r == 0)
-                continue;
-            for(int s = 0; s < r; s++)
-                mtf->r2s[s] = mtf->r2s[s + 1];
+            if (r == 0) continue;
+            for (int32_t s = 0; s < r; s++) mtf->r2s[s] = mtf->r2s[s + 1];
             mtf->r2s[r] = c;
             c = mtf->r2s[0];
         } else {
-            if(nb_symbols == 1)
-                continue;
+            if (nb_symbols == 1) continue;
             nb_symbols--;
-            for(int s = 0; s < nb_symbols; s++)
+            for (int32_t s = 0; s < nb_symbols; s++)
                 mtf->r2s[s] = mtf->r2s[s + 1];
             c = mtf->r2s[0];
         }
tab: 248 wrap: offon