:: commit 310b6039769e77069318c99d0149d56f947b5af5

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-03 09:09

parents: af9a24a0e1

strip OpenMP code from libsais

diff --git a/src/libsais.c b/src/libsais.c
index fd5f71a..c2d97a8 100644
--- a/src/libsais.c
+++ b/src/libsais.c
@@ -252,231 +252,6 @@ static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
         libsais_free_aligned(ctx);
     }
 }
-
-#if defined(_OPENMP)
-
-static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
-    sa_sint_t count = 0;
-
-    fast_sint_t i;
-    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
-        count += (SA[i] < 0);
-    }
-
-    return count;
-}
-
-static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA,
-                                                    fast_sint_t omp_block_start,
-                                                    fast_sint_t omp_block_size) {
-    sa_sint_t count = 0;
-
-    fast_sint_t i;
-    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
-        count += (SA[i] == 0);
-    }
-
-    return count;
-}
-
-static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
-                                          LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                          fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetch(&cache[i + 2 * prefetch_distance]);
-
-        libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
-        libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]);
-        libsais_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]);
-        libsais_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]);
-
-        SA[cache[i + 0].symbol] = cache[i + 0].index;
-        SA[cache[i + 1].symbol] = cache[i + 1].index;
-        SA[cache[i + 2].symbol] = cache[i + 2].index;
-        SA[cache[i + 3].symbol] = cache[i + 3].index;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        SA[cache[i].symbol] = cache[i].index;
-    }
-}
-
-static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA,
-                                                      LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                                      fast_sint_t omp_block_start,
-                                                      fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
-         i += 4) {
-        libsais_prefetchw(&cache[i + prefetch_distance]);
-
-        cache[l] = cache[i + 0];
-        l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 1];
-        l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 2];
-        l += cache[l].symbol >= 0;
-        cache[l] = cache[i + 3];
-        l += cache[l].symbol >= 0;
-    }
-
-    for (j += 3; i < j; i += 1) {
-        cache[l] = cache[i];
-        l += cache[l].symbol >= 0;
-    }
-
-    libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
-}
-
-static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] =
-            bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
-    sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
-                      bucket05[s] + bucket06[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
-    sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
-    sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
-                      bucket05[s] + bucket06[s] + bucket07[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
-                                            fast_sint_t bucket_stride) {
-    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
-    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
-    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
-    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
-    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
-    sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
-    sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
-    sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
-    fast_sint_t s;
-    for (s = 0; s < bucket_size; s += 1) {
-        bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
-                      bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s];
-    }
-}
-
-static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size,
-                                          fast_sint_t bucket_stride, fast_sint_t num_buckets) {
-    while (num_buckets >= 9) {
-        libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size,
-                                        bucket_stride);
-        num_buckets -= 8;
-    }
-
-    switch (num_buckets) {
-        case 1:
-            break;
-        case 2:
-            libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride);
-            break;
-        case 3:
-            libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride);
-            break;
-        case 4:
-            libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride);
-            break;
-        case 5:
-            libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride);
-            break;
-        case 6:
-            libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride);
-            break;
-        case 7:
-            libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride);
-            break;
-        case 8:
-            libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride);
-            break;
-    }
-}
-
-#endif
-
 static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
                                            sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start,
                                            fast_sint_t omp_block_size) {
@@ -528,21 +303,13 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RE
 static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
                                                sa_sint_t n, sa_sint_t threads,
                                                LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
-                                                  omp_get_dynamic() == 0)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -552,24 +319,6 @@ static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t
             libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start,
                                            omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            fast_sint_t t, m = 0;
-            for (t = omp_num_threads - 1; t > omp_thread_num; --t) {
-                m += thread_state[t].state.m;
-            }
-
-            libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start,
-                                           omp_block_size);
-
-    #pragma omp barrier
-
-            if (thread_state[omp_thread_num].state.m > 0) {
-                SA[(fast_sint_t)n - 1 - m] =
-                    (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
-            }
-        }
-#endif
     }
 }
 
@@ -656,57 +405,6 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RES
 
     return n - 1 - m;
 }
-
-#if defined(_OPENMP)
-
-static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n,
-                                              sa_sint_t k, sa_sint_t * RESTRICT buckets) {
-    const fast_sint_t prefetch_distance = 32;
-
-    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
-
-    sa_sint_t i = n - 2;
-    fast_uint_t s = 1;
-    fast_sint_t c0 = T[n - 1];
-    fast_sint_t c1 = 0;
-
-    for (; i >= prefetch_distance + 3; i -= 4) {
-        libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
-
-        c1 = T[i - 0];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
-        c0 = T[i - 1];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
-
-        c1 = T[i - 2];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
-        c0 = T[i - 3];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
-    }
-
-    for (; i >= 0; i -= 1) {
-        c1 = c0;
-        c0 = T[i];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
-    }
-
-    buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]++;
-}
-
-#endif
-
 static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
                                               sa_sint_t k, sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
@@ -752,63 +450,6 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_s
 
     buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
 }
-
-#if defined(_OPENMP)
-
-static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t * RESTRICT buckets) {
-    const fast_sint_t prefetch_distance = 32;
-
-    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
-    sa_sint_t i = n - 2;
-    fast_uint_t s = 1;
-    fast_sint_t c0 = T[n - 1];
-    fast_sint_t c1 = 0;
-
-    for (; i >= prefetch_distance + 3; i -= 4) {
-        libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
-        libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
-
-        c1 = T[i - 0];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        c0 &= SAINT_MAX;
-        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
-        c0 = T[i - 1];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX;
-        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-
-        c1 = T[i - 2];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        c0 &= SAINT_MAX;
-        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
-        c0 = T[i - 3];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX;
-        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-    }
-
-    for (; i >= 0; i -= 1) {
-        c1 = c0;
-        c0 = T[i];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        c1 &= SAINT_MAX;
-        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-    }
-
-    c0 &= SAINT_MAX;
-    buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
-}
-
-#endif
-
 static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T,
                                                           sa_sint_t * RESTRICT SA, sa_sint_t n,
                                                           sa_sint_t * RESTRICT buckets,
@@ -881,21 +522,13 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
     sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t m = 0;
 
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
-                                                  omp_get_dynamic() == 0)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -905,49 +538,6 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
             m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start,
                                                          omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(
-                    T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start,
-                    omp_block_size);
-
-                if (thread_state[omp_thread_num].state.m > 0) {
-                    thread_state[omp_thread_num].state.last_lms_suffix =
-                        SA[thread_state[omp_thread_num].state.position - 1];
-                }
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    m += (sa_sint_t)thread_state[t].state.m;
-
-                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) {
-                        memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position - thread_state[t].state.m],
-                               (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
-                    }
-
-                    {
-                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                        fast_sint_t s;
-                        for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) {
-                            sa_sint_t A = buckets[s], B = temp_bucket[s];
-                            buckets[s] = A + B;
-                            temp_bucket[s] = A;
-                        }
-                    }
-                }
-            }
-        }
-#endif
     }
 
     return m;
@@ -1172,392 +762,79 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
 
     return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
 }
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA,
+                                                                       sa_sint_t n, sa_sint_t k,
+                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
+    sa_sint_t m = 0;
+    {
+        (void)(threads);
 
-#if defined(_OPENMP)
+        fast_sint_t omp_num_threads = 1;
 
-static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size,
-                                             fast_sint_t num_buckets) {
-    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024);
-    if (free_space / (num_buckets - 1) >= bucket_size_1024) {
-        return bucket_size_1024;
-    }
-    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16);
-    if (free_space / (num_buckets - 1) >= bucket_size_16) {
-        return bucket_size_16;
+        if (omp_num_threads == 1) {
+            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+        }
     }
 
-    return bucket_size;
+    return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA,
+                                                                       sa_sint_t n, sa_sint_t k,
+                                                                       sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
     sa_sint_t m = 0;
-
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-    #endif
     {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
 
-        fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start,
-                                                             omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            fast_sint_t bucket_size = 4 * (fast_sint_t)k;
-            fast_sint_t bucket_stride =
-                libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
-
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_and_gather_lms_suffixes_32s_4k(
-                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
-                        omp_block_size);
-            }
-
-        #pragma omp barrier
-
-            if (omp_thread_num == omp_num_threads - 1) {
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    m += (sa_sint_t)thread_state[t].state.count;
-
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
-                        memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position - thread_state[t].state.count],
-                               (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
-                    }
-                }
-            } else {
-                omp_num_threads = omp_num_threads - 1;
-                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start = omp_thread_num * omp_block_stride;
-                omp_block_size = omp_thread_num < omp_num_threads - 1
-                                     ? omp_block_stride
-                                     : bucket_size - omp_block_start;
-
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
-                                              bucket_stride, omp_num_threads + 1);
-            }
+            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
         }
-    #endif
     }
 
     return m;
 }
 
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
     sa_sint_t m = 0;
-
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-    #endif
     {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
 
-        fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start,
-                                                             omp_block_size);
+            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
         }
-    #if defined(_OPENMP)
-        else {
-            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride =
-                libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
-
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_and_gather_lms_suffixes_32s_2k(
-                        T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
-                        omp_block_size);
-            }
+    }
 
-        #pragma omp barrier
+    return m;
+}
 
-            if (omp_thread_num == omp_num_threads - 1) {
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    m += (sa_sint_t)thread_state[t].state.count;
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t m;
+    (void)(thread_state);
 
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
-                        memcpy(&SA[n - m],
-                               &SA[thread_state[t].state.position - thread_state[t].state.count],
-                               (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
-                    }
-                }
-            } else {
-                omp_num_threads = omp_num_threads - 1;
-                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start = omp_thread_num * omp_block_stride;
-                omp_block_size = omp_thread_num < omp_num_threads - 1
-                                     ? omp_block_stride
-                                     : bucket_size - omp_block_start;
-
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
-                                              bucket_stride, omp_num_threads + 1);
-            }
-        }
-    #endif
-    }
+    { m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); }
 
     return m;
 }
 
-static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
     sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
+    sa_sint_t m;
+    (void)(thread_state);
 
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets,
-                                                                   omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
-            fast_sint_t bucket_stride =
-                libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
-
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
-                        T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride),
-                        omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                fast_sint_t t, m = 0;
-                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) {
-                    m += (sa_sint_t)thread_state[t].state.count;
-                }
-
-                if (thread_state[omp_thread_num].state.count > 0) {
-                    memcpy(&SA[n - m],
-                           &SA[n + thread_state[omp_thread_num].state.position -
-                               thread_state[omp_thread_num].state.count],
-                           (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
-                }
-            }
-
-            {
-                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
-                omp_block_start = omp_thread_num * omp_block_stride;
-                omp_block_size = omp_thread_num < omp_num_threads - 1
-                                     ? omp_block_stride
-                                     : bucket_size - omp_block_start;
-
-                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
-                                              bucket_stride, omp_num_threads);
-            }
-        }
-    #endif
-    }
-}
-
-#endif
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
-                                                                       sa_sint_t * RESTRICT SA,
-                                                                       sa_sint_t n, sa_sint_t k,
-                                                                       sa_sint_t * RESTRICT buckets,
-                                                                       sa_sint_t threads) {
-    sa_sint_t m = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_num_threads = 1;
-#endif
-        if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
-        }
-#if defined(_OPENMP)
-        else if (omp_thread_num == 0) {
-            libsais_count_lms_suffixes_32s_4k(T, n, k, buckets);
-        } else {
-            m = libsais_gather_lms_suffixes_32s(T, SA, n);
-        }
-#endif
-    }
-
-    return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
-                                                                       sa_sint_t * RESTRICT SA,
-                                                                       sa_sint_t n, sa_sint_t k,
-                                                                       sa_sint_t * RESTRICT buckets,
-                                                                       sa_sint_t threads) {
-    sa_sint_t m = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_num_threads = 1;
-#endif
-        if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
-        }
-#if defined(_OPENMP)
-        else if (omp_thread_num == 0) {
-            libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
-        } else {
-            m = libsais_gather_lms_suffixes_32s(T, SA, n);
-        }
-#endif
-    }
-
-    return m;
-}
-
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
-    sa_sint_t m = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(2) if (threads > 1 && n >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_num_threads = 1;
-#endif
-        if (omp_num_threads == 1) {
-            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
-        }
-#if defined(_OPENMP)
-        else if (omp_thread_num == 0) {
-            libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
-        } else {
-            m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
-        }
-#endif
-    }
-
-    return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t m;
-
-#if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
-    if (max_threads > threads) {
-        max_threads = threads;
-    }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
-        if (max_threads > n / 16 / k) {
-            max_threads = n / 16 / k;
-        }
-        m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
-    } else
-#else
-    UNUSED(thread_state);
-#endif
-    {
-        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
-    }
-
-    return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t m;
-
-#if defined(_OPENMP)
-    sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
-    if (max_threads > threads) {
-        max_threads = threads;
-    }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
-        if (max_threads > n / 8 / k) {
-            max_threads = n / 8 / k;
-        }
-        m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
-    } else
-#else
-    UNUSED(thread_state);
-#endif
-    {
-        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
-    }
+    { m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); }
 
     return m;
 }
@@ -1565,22 +842,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
 static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
     const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
     sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    sa_sint_t max_threads =
-        (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
-    if (max_threads > threads) {
-        max_threads = threads;
-    }
-    if (max_threads > 1 && n >= 65536 && n / k >= 2) {
-        if (max_threads > n / 8 / k) {
-            max_threads = n / 8 / k;
-        }
-        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
-            T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
-    } else
-#else
-    UNUSED(thread_state);
-#endif
+    (void)(thread_state);
+
     {
         libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets,
                                                                         threads);
@@ -1857,56 +1120,17 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sin
                                                    sa_sint_t n, sa_sint_t m,
                                                    sa_sint_t * RESTRICT buckets, sa_sint_t threads,
                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && m >= 65536 && \
-                                                  omp_get_dynamic() == 0)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_num_threads = 1;
-#endif
+
         if (omp_num_threads == 1) {
             libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE],
                                                (fast_sint_t)n - (fast_sint_t)m + 1,
                                                (fast_sint_t)m - 1);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
-
-                fast_sint_t i, j;
-                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1);
-                     i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0);
-                     i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0)) {
-                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
-                }
-            }
-
-            {
-                fast_sint_t t, omp_block_start = 0,
-                               omp_block_size = thread_state[omp_thread_num].state.m;
-                for (t = omp_num_threads - 1; t >= omp_thread_num; --t)
-                    omp_block_start += thread_state[t].state.m;
-
-                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) {
-                    omp_block_start -= 1;
-                    omp_block_size -= 1;
-                }
-
-                libsais_radix_sort_lms_suffixes_8u(
-                    T, SA, thread_state[omp_thread_num].state.buckets,
-                    (fast_sint_t)n - omp_block_start, omp_block_size);
-            }
-        }
-#endif
     }
 }
 
@@ -1985,339 +1209,104 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
         SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
     }
 }
+static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    /* No OpenMP: sort sequentially; the old guard did nothing if threads != 1 && m >= 65536. */
+    (void)(threads);
+    (void)(thread_state);
+    libsais_radix_sort_lms_suffixes_32s_6k(
+        T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+}
 
-#if defined(_OPENMP)
+static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    /* No OpenMP: sort sequentially; the old guard did nothing if threads != 1 && m >= 65536. */
+    (void)(threads);
+    (void)(thread_state);
+    libsais_radix_sort_lms_suffixes_32s_2k(
+        T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+}
 
-static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T,
-                                                             sa_sint_t * RESTRICT SA,
-                                                             LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                                             fast_sint_t omp_block_start,
-                                                             fast_sint_t omp_block_size) {
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t * RESTRICT buckets) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0]]);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1]]);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 2]]);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 3]]);
+    sa_sint_t i = n - 2;
+    sa_sint_t m = 0;
+    fast_uint_t s = 1;
+    fast_sint_t c0 = T[n - 1];
+    fast_sint_t c1 = 0;
+    fast_sint_t c2 = 0;
 
-        libsais_prefetchw(&cache[i + prefetch_distance]);
+    for (; i >= prefetch_distance + 3; i -= 4) {
+        libsais_prefetch(&T[i - 2 * prefetch_distance]);
 
-        cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]];
-        cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
-        cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]];
-        cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
-    }
+        libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
+        libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
+        libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
+        libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
 
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        cache[i].symbol = T[cache[i].index = SA[i]];
-    }
-}
+        c1 = T[i - 0];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c0]] = i + 1;
+            m++;
+        }
 
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket,
-                                                              LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                                              fast_sint_t omp_block_start,
-                                                              fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
+        c0 = T[i - 1];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i - 0;
+            m++;
+        }
 
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
-         i >= j; i -= 4) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
+        c1 = T[i - 2];
+        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c0]] = i - 1;
+            m++;
+        }
 
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
-        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+        c0 = T[i - 3];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i - 2;
+            m++;
+        }
+    }
 
-        cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
-        cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
-        cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
-        cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
+    for (; i >= 0; i -= 1) {
+        c1 = c0;
+        c0 = T[i];
+        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+        if ((s & 3) == 1) {
+            SA[--buckets[c2 = c1]] = i + 1;
+            m++;
+        }
     }
 
-    for (j -= prefetch_distance + 3; i >= j; i -= 1) {
-        cache[i].symbol = --induction_bucket[cache[i].symbol];
+    if (m > 1) {
+        SA[buckets[c2]] = 0;
     }
+
+    return m;
 }
 
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket,
-                                                              LIBSAIS_THREAD_CACHE * RESTRICT cache,
-                                                              fast_sint_t omp_block_start,
-                                                              fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
+                                                  sa_sint_t * RESTRICT induction_bucket,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
-         i >= j; i -= 4) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
-
-        libsais_prefetchw(
-            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
-        libsais_prefetchw(
-            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
-        libsais_prefetchw(
-            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
-        libsais_prefetchw(
-            &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
-
-        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
-        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
-        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
-        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
-    }
-
-    for (j -= prefetch_distance + 3; i >= j; i -= 1) {
-        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
-    }
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start,
-                                                   omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
-                                                                 omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
-                    induction_bucket, cache - block_start, block_start, block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                              omp_block_size);
-            }
-        }
-    #endif
-    }
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start,
-                                                   omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
-                                                                 omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
-                    induction_bucket, cache - block_start, block_start, block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                              omp_block_size);
-            }
-        }
-    #endif
-    }
-}
-
-#endif
-
-static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    if (threads == 1 || m < 65536) {
-        libsais_radix_sort_lms_suffixes_32s_6k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end >= m) {
-                block_end = (fast_sint_t)m - 1;
-            }
-
-            libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
-                block_end - block_start, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    if (threads == 1 || m < 65536) {
-        libsais_radix_sort_lms_suffixes_32s_2k(
-            T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end >= m) {
-                block_end = (fast_sint_t)m - 1;
-            }
-
-            libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
-                block_end - block_start, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-}
-
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t * RESTRICT buckets) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t i = n - 2;
-    sa_sint_t m = 0;
-    fast_uint_t s = 1;
-    fast_sint_t c0 = T[n - 1];
-    fast_sint_t c1 = 0;
-    fast_sint_t c2 = 0;
-
-    for (; i >= prefetch_distance + 3; i -= 4) {
-        libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
-        libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
-        libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
-        libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
-        libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
-
-        c1 = T[i - 0];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        if ((s & 3) == 1) {
-            SA[--buckets[c2 = c0]] = i + 1;
-            m++;
-        }
-
-        c0 = T[i - 1];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        if ((s & 3) == 1) {
-            SA[--buckets[c2 = c1]] = i - 0;
-            m++;
-        }
-
-        c1 = T[i - 2];
-        s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
-        if ((s & 3) == 1) {
-            SA[--buckets[c2 = c0]] = i - 1;
-            m++;
-        }
-
-        c0 = T[i - 3];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        if ((s & 3) == 1) {
-            SA[--buckets[c2 = c1]] = i - 2;
-            m++;
-        }
-    }
-
-    for (; i >= 0; i -= 1) {
-        c1 = c0;
-        c0 = T[i];
-        s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
-        if ((s & 3) == 1) {
-            SA[--buckets[c2 = c1]] = i + 1;
-            m++;
-        }
-    }
-
-    if (m > 1) {
-        SA[buckets[c2]] = 0;
-    }
-
-    return m;
-}
-
-static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
-                                                  sa_sint_t * RESTRICT induction_bucket,
-                                                  fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
+        libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
 
         libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
         libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
@@ -2365,25 +1354,11 @@ static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA,
 static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
                                                       sa_sint_t * RESTRICT induction_bucket,
                                                       sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : (fast_sint_t)k - 1 - omp_block_start;
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
-#endif
-
         libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start,
                                               omp_block_size);
     }
@@ -2392,25 +1367,11 @@ static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, s
 static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
                                                       sa_sint_t * RESTRICT induction_bucket,
                                                       sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : (fast_sint_t)k - 1 - omp_block_start;
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_block_start = 0;
         fast_sint_t omp_block_size = (fast_sint_t)k - 1;
-#endif
-
         libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start,
                                               omp_block_size);
     }
@@ -2537,257 +1498,38 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
 
     return d;
 }
-
-#if defined(_OPENMP)
-
-static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
-    LIBSAIS_THREAD_STATE * RESTRICT state) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
     sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
 
-    memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
-    sa_sint_t d = 1;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
-
-        sa_sint_t p0 = cache[count].index = SA[i + 0];
-        d += (p0 < 0);
-        p0 &= SAINT_MAX;
-        sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
-        induction_bucket[v0]++;
-        distinct_names[v0] = d;
-        sa_sint_t p1 = cache[count].index = SA[i + 1];
-        d += (p1 < 0);
-        p1 &= SAINT_MAX;
-        sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
-        induction_bucket[v1]++;
-        distinct_names[v1] = d;
-    }
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = cache[count].index = SA[i];
-        d += (p < 0);
-        p &= SAINT_MAX;
-        sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
-        induction_bucket[v]++;
-        distinct_names[v] = d;
+    if (threads == 1 || left_suffixes_count < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0,
+                                                          left_suffixes_count);
     }
-
-    state[0].state.position = (fast_sint_t)d - 1;
-    state[0].state.count = count;
+    (void)(thread_state);
+    return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count, sa_sint_t d) {
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
     fast_sint_t i, j;
-    for (i = 0, j = count - 1; i < j; i += 2) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
-
-        sa_sint_t p0 = cache[i + 0].index;
-        d += (p0 < 0);
-        sa_sint_t v0 = cache[i + 0].symbol;
-        SA[induction_bucket[v0]++] =
-            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
-        distinct_names[v0] = d;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
+        libsais_prefetch(&SA[i + 3 * prefetch_distance]);
 
-        sa_sint_t p1 = cache[i + 1].index;
-        d += (p1 < 0);
-        sa_sint_t v1 = cache[i + 1].symbol;
-        SA[induction_bucket[v1]++] =
-            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
-        distinct_names[v1] = d;
-    }
-
-    for (j += 1; i < j; i += 1) {
-        sa_sint_t p = cache[i].index;
-        d += (p < 0);
-        sa_sint_t v = cache[i].symbol;
-        SA[induction_bucket[v]++] =
-            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
-        distinct_names[v] = d;
-    }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start,
-                                                              omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
-                    T, SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
-                    &thread_state[omp_thread_num]);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-                fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_induction_bucket =
-                        &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
-                    sa_sint_t * RESTRICT temp_distinct_names =
-                        &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
-
-                    fast_sint_t c;
-                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
-                        induction_bucket[c] = A + B;
-                        temp_induction_bucket[c] = A;
-                    }
-
-                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
-                        distinct_names[c] = B > 0 ? D : A;
-                        temp_distinct_names[c] = A;
-                    }
-                    d += 1 + (sa_sint_t)thread_state[t].state.position;
-                    thread_state[t].state.position =
-                        (fast_sint_t)d - thread_state[t].state.position;
-                }
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_partial_sorting_scan_left_to_right_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count,
-                    (sa_sint_t)thread_state[omp_thread_num].state.position);
-            }
-        }
-    #endif
-    }
-
-    return d;
-}
-
-#endif
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
-    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
-    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
-
-    if (threads == 1 || left_suffixes_count < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0,
-                                                          left_suffixes_count);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = 0; block_start < left_suffixes_count;) {
-            if (SA[block_start] == 0) {
-                block_start++;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start + ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end > left_suffixes_count) {
-                    block_max_end = left_suffixes_count;
-                }
-                fast_sint_t block_end = block_start + 1;
-                while (block_end < block_max_end && SA[block_end] != 0) {
-                    block_end++;
-                }
-                fast_sint_t block_size = block_end - block_start;
-
-                if (block_size < 32) {
-                    for (; block_start < block_end; block_start += 1) {
-                        sa_sint_t p = SA[block_start];
-                        d += (p < 0);
-                        p &= SAINT_MAX;
-                        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
-                        SA[induction_bucket[v]++] =
-                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
-                        distinct_names[v] = d;
-                    }
-                } else {
-                    d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(
-                        T, SA, buckets, d, block_start, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
-            }
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-
-    return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
-        libsais_prefetch(&SA[i + 3 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
+        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
+        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
 
         sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX;
         sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
@@ -2960,3907 +1702,1331 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t *
         }
     }
 }
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
 
-#if defined(_OPENMP)
-
-static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+    if (threads == 1 || left_suffixes_count < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0,
+                                                              left_suffixes_count);
+    }
+    (void)(thread_state);
+    return d;
+}
 
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
-        libsais_prefetchw(&cache[i + prefetch_distance]);
+    SA[induction_bucket[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
 
-        sa_sint_t p0 = cache[i + 0].index = SA[i + 0];
-        sa_sint_t symbol0 = 0;
-        p0 &= SAINT_MAX;
-        if (p0 != 0) {
-            symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t p1 = cache[i + 1].index = SA[i + 1];
-        sa_sint_t symbol1 = 0;
-        p1 &= SAINT_MAX;
-        if (p1 != 0) {
-            symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
-        }
-        cache[i + 1].symbol = symbol1;
+    if (threads == 1 || n < 65536) {
+        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
     }
+    (void)(thread_state);
+    return d;
+}
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = cache[i].index = SA[i];
-        sa_sint_t symbol = 0;
-        p &= SAINT_MAX;
-        if (p != 0) {
-            symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
-        }
-        cache[i].symbol = symbol;
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536) {
+        libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
     }
+    (void)(thread_state);
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                         const sa_sint_t * RESTRICT buckets,
+                                                         sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+    fast_sint_t c;
+    (void)(threads);
+    (void)(n);
 
-        libsais_prefetchw(&cache[i + prefetch_distance]);
+    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0);
+         c -= BUCKETS_INDEX2(1, 0)) {
+        fast_sint_t i, j;
+        sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)temp_bucket[c] - 1,
+            j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3;
+             i >= j; i -= 4) {
+            libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        if (p0 > 0) {
-            cache[i + 0].index = p0;
-            p0 &= ~SUFFIX_GROUP_MARKER;
-            symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
-            p0 = 0;
-        }
-        cache[i + 0].symbol = symbol0;
-        SA[i + 0] = p0 & SAINT_MAX;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        if (p1 > 0) {
-            cache[i + 1].index = p1;
-            p1 &= ~SUFFIX_GROUP_MARKER;
-            symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
-            p1 = 0;
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
+            s = s ^ q0;
+            SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+            s = s ^ q1;
+            SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+            s = s ^ q2;
+            SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+            s = s ^ q3;
+            SA[i - 3] = p3 ^ q3;
         }
-        cache[i + 1].symbol = symbol1;
-        SA[i + 1] = p1 & SAINT_MAX;
-    }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        if (p > 0) {
-            cache[i].index = p;
-            p &= ~SUFFIX_GROUP_MARKER;
-            symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
-            p = 0;
+        for (j -= 3; i >= j; i -= 1) {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+            s = s ^ q;
+            SA[i] = p ^ q;
         }
-        cache[i].symbol = symbol;
-        SA[i] = p & SAINT_MAX;
     }
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                             const sa_sint_t * RESTRICT buckets,
+                                                             sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+    fast_sint_t c;
+    (void)(threads);
 
-        libsais_prefetchw(&cache[i + prefetch_distance]);
+    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) {
+        fast_sint_t i, j;
+        sa_sint_t s = SAINT_MIN;
+        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1,
+            j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3;
+             i >= j; i -= 4) {
+            libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        if (p0 > 0) {
-            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
-            symbol0 = T[p0 - 1];
-            p0 = 0;
-        }
-        cache[i + 0].symbol = symbol0;
-        SA[i + 0] = p0 & SAINT_MAX;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        if (p1 > 0) {
-            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
-            symbol1 = T[p1 - 1];
-            p1 = 0;
+            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
+            s = s ^ q0;
+            SA[i - 0] = p0 ^ q0;
+            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+            s = s ^ q1;
+            SA[i - 1] = p1 ^ q1;
+            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+            s = s ^ q2;
+            SA[i - 2] = p2 ^ q2;
+            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+            s = s ^ q3;
+            SA[i - 3] = p3 ^ q3;
         }
-        cache[i + 1].symbol = symbol1;
-        SA[i + 1] = p1 & SAINT_MAX;
-    }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        if (p > 0) {
-            cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
-            symbol = T[p - 1];
-            p = 0;
+        for (j -= 3; i >= j; i -= 1) {
+            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+            s = s ^ q;
+            SA[i] = p ^ q;
         }
-        cache[i].symbol = symbol;
-        SA[i] = p & SAINT_MAX;
     }
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
-        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
-
-        libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
-        libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);
+    fast_sint_t i;
+    sa_sint_t s = SUFFIX_GROUP_MARKER;
+    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
+        libsais_prefetchw(&SA[i - prefetch_distance]);
 
-        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index;
+        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q0;
+        SA[i - 0] = p0 ^ q0;
+        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q1;
+        SA[i - 1] = p1 ^ q1;
+        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q2;
+        SA[i - 2] = p2 ^ q2;
+        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
+                                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q3;
+        SA[i - 3] = p3 ^ q3;
+    }
+
+    for (; i >= 0; i -= 1) {
+        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
+                                 ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+        s = s ^ q;
+        SA[i] = p ^ q;
+    }
+}
+
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k,
+                                                         sa_sint_t * RESTRICT buckets) {
+    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+    fast_sint_t i;
+    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+         i += BUCKETS_INDEX2(1, 0)) {
+        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
+        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+    }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetch(&SA[i - 2 * prefetch_distance]);
+
+        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i - 0];
         d += (p0 < 0);
-        cache[i + 0].symbol = buckets[v0]++;
-        cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
-        buckets[2 + v0] = d;
-        if (cache[i + 0].symbol < omp_block_end) {
-            sa_sint_t s = cache[i + 0].symbol,
-                      q = (cache[s].index = cache[i + 0].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
-        }
+        p0 &= SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+        SA[--induction_bucket[v0]] =
+            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+        distinct_names[v0] = d;
 
-        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index;
+        sa_sint_t p1 = SA[i - 1];
         d += (p1 < 0);
-        cache[i + 1].symbol = buckets[v1]++;
-        cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
-        buckets[2 + v1] = d;
-        if (cache[i + 1].symbol < omp_block_end) {
-            sa_sint_t s = cache[i + 1].symbol,
-                      q = (cache[s].index = cache[i + 1].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
-        }
+        p1 &= SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+        SA[--induction_bucket[v1]] =
+            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+        distinct_names[v1] = d;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t v = cache[i].symbol, p = cache[i].index;
+    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        d += (p < 0);
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--induction_bucket[v]] =
+            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+        distinct_names[v] = d;
+    }
+
+    return d;
+}
+static void libsais_partial_sorting_scan_right_to_left_8u_omp(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+    if (threads == 1 || (scan_end - scan_start) < 65536) {
+        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start,
+                                                      scan_end - scan_start);
+    }
+    (void)(thread_state);
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
+
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetch(&SA[i - 3 * prefetch_distance]);
+
+        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
+        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
+        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
+
+        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX;
+        sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
+        libsais_prefetchw(&buckets[v0]);
+        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX;
+        sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
+        libsais_prefetchw(&buckets[v1]);
+
+        sa_sint_t p2 = SA[i - 0];
+        d += (p2 < 0);
+        p2 &= SAINT_MAX;
+        sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
+        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+        buckets[2 + v2] = d;
+
+        sa_sint_t p3 = SA[i - 1];
+        d += (p3 < 0);
+        p3 &= SAINT_MAX;
+        sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+        buckets[2 + v3] = d;
+    }
+
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
         d += (p < 0);
-        cache[i].symbol = buckets[v]++;
-        cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+        p &= SAINT_MAX;
+        sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
         buckets[2 + v] = d;
-        if (cache[i].symbol < omp_block_end) {
-            sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
-        }
     }
 
     return d;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
     fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
     sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
 
-    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
-        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0 >> 1];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        const sa_sint_t * Ds0 = &distinct_names[s0];
-        libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1 >> 1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-        const sa_sint_t * Ds1 = &distinct_names[s1];
-        libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
-
-        sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0) {
-            sa_sint_t p0 = cache[i + 0].index;
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        Ts0--;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        Ts1--;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts2]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
+            libsais_prefetchw(&induction_bucket[Ts3]);
+            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
+        }
+
+        sa_sint_t p0 = SA[i - 0];
+        if (p0 > 0) {
+            SA[i - 0] = 0;
             d += (p0 >> (SUFFIX_GROUP_BIT - 1));
-            cache[i + 0].symbol = induction_bucket[v0 >> 1]++;
-            cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
+            p0 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+            SA[--induction_bucket[T[p0 - 1]]] =
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v0] = d;
-            if (cache[i + 0].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
-                if (np > 0) {
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
-                    np = 0;
-                }
-                cache[i + 0].index = np & SAINT_MAX;
-            }
         }
 
-        sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0) {
-            sa_sint_t p1 = cache[i + 1].index;
+        sa_sint_t p1 = SA[i - 1];
+        if (p1 > 0) {
+            SA[i - 1] = 0;
             d += (p1 >> (SUFFIX_GROUP_BIT - 1));
-            cache[i + 1].symbol = induction_bucket[v1 >> 1]++;
-            cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
+            p1 &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+            SA[--induction_bucket[T[p1 - 1]]] =
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v1] = d;
-            if (cache[i + 1].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
-                if (np > 0) {
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
-                    np = 0;
-                }
-                cache[i + 1].index = np & SAINT_MAX;
-            }
         }
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            sa_sint_t p = cache[i].index;
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        if (p > 0) {
+            SA[i] = 0;
             d += (p >> (SUFFIX_GROUP_BIT - 1));
-            cache[i].symbol = induction_bucket[v >> 1]++;
-            cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
-                             ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+            p &= ~SUFFIX_GROUP_MARKER;
+            sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+            SA[--induction_bucket[T[p - 1]]] =
+                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
             distinct_names[v] = d;
-            if (cache[i].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                if (np > 0) {
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
-                    np = 0;
-                }
-                cache[i].index = np & SAINT_MAX;
-            }
         }
     }
 
     return d;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T,
+                                                              sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t * RESTRICT induction_bucket,
+                                                              fast_sint_t omp_block_start,
+                                                              fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
-        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-
-        sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0) {
-            cache[i + 0].symbol = induction_bucket[v0]++;
-            if (cache[i + 0].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
-                if (np > 0) {
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                    np = 0;
-                }
-                cache[i + 0].index = np & SAINT_MAX;
-            }
+    fast_sint_t i, j;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
+         i >= j; i -= 2) {
+        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
         }
 
-        sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0) {
-            cache[i + 1].symbol = induction_bucket[v1]++;
-            if (cache[i + 1].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
-                if (np > 0) {
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                    np = 0;
-                }
-                cache[i + 1].index = np & SAINT_MAX;
-            }
+        sa_sint_t p0 = SA[i - 0];
+        if (p0 > 0) {
+            SA[i - 0] = 0;
+            SA[--induction_bucket[T[p0 - 1]]] =
+                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        if (p1 > 0) {
+            SA[i - 1] = 0;
+            SA[--induction_bucket[T[p1 - 1]]] =
+                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
         }
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            cache[i].symbol = induction_bucket[v]++;
-            if (cache[i].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                if (np > 0) {
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                    np = 0;
-                }
-                cache[i].index = np & SAINT_MAX;
-            }
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        if (p > 0) {
+            SA[i] = 0;
+            SA[--induction_bucket[T[p - 1]]] =
+                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
         }
     }
 }
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+    sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+    if (threads == 1 || (scan_end - scan_start) < 65536) {
+        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start,
+                                                              scan_end - scan_start);
+    }
+    (void)(thread_state);
+    return d;
+}
 
-        omp_block_start += block_start;
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+    }
+    (void)(thread_state);
+    return d;
+}
 
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d,
-                                                                  omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (threads == 1 || n < 65536) {
+        libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
+    }
+    (void)(thread_state);
+}
 
-        #pragma omp barrier
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
+                                                                      fast_sint_t omp_block_start,
+                                                                      fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-        #pragma omp master
-            {
-                d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
-                    T, buckets, d, cache - block_start, block_start, block_size);
-            }
+    fast_sint_t i, j, l;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+         i += 4) {
+        libsais_prefetch(&SA[i + prefetch_distance]);
 
-        #pragma omp barrier
+        sa_sint_t s0 = SA[i + 0];
+        SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1];
+        SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2];
+        SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3];
+        SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s3 < 0);
+    }
 
-            {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                              omp_block_size);
-            }
-        }
-    #endif
+    for (j += 3; i < j; i += 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+        l += (s < 0);
     }
 
-    return d;
+    return l;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d,
-                                                                  omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
+                                                                      fast_sint_t omp_block_start,
+                                                                      fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-        #pragma omp master
-            {
-                d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
-                    T, k, buckets, d, cache - block_start, block_start, block_size);
-            }
+    fast_sint_t i, j, l;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+         i += 4) {
+        libsais_prefetch(&SA[i + prefetch_distance]);
 
-        #pragma omp barrier
+        sa_sint_t s0 = SA[i + 0];
+        SA[l] = s0 & SAINT_MAX;
+        l += (s0 < 0);
+        sa_sint_t s1 = SA[i + 1];
+        SA[l] = s1 & SAINT_MAX;
+        l += (s1 < 0);
+        sa_sint_t s2 = SA[i + 2];
+        SA[l] = s2 & SAINT_MAX;
+        l += (s2 < 0);
+        sa_sint_t s3 = SA[i + 3];
+        SA[l] = s3 & SAINT_MAX;
+        l += (s3 < 0);
+    }
 
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
-            }
-        }
-    #endif
+    for (j += 3; i < j; i += 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = s & SAINT_MAX;
+        l += (s < 0);
     }
 
-    return d;
+    return l;
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
+static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
+
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
         if (omp_num_threads == 1) {
-            libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start,
-                                                              omp_block_size);
+            libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
         }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
+    }
+}
 
-        #pragma omp barrier
+static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    {
+        (void)(threads);
+        (void)(thread_state);
 
-        #pragma omp master
-            {
-                libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
-                    T, buckets, cache - block_start, block_start, block_size);
-            }
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 
-        #pragma omp barrier
+        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
 
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
-            }
+        if (omp_num_threads == 1) {
+            libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
         }
-    #endif
     }
 }
 
-#endif
+static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                sa_sint_t first_lms_suffix,
+                                                sa_sint_t left_suffixes_count, sa_sint_t threads,
+                                                LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
-    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
+    libsais_partial_sorting_scan_right_to_left_8u_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
 
-    if (threads == 1 || left_suffixes_count < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0,
-                                                              left_suffixes_count);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end > left_suffixes_count) {
-                block_end = left_suffixes_count;
-            }
+static void libsais_induce_partial_order_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+    libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
+    libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
+    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
+}
 
-            d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
-                T, SA, buckets, d, thread_state[0].state.cache, block_start,
-                block_end - block_start, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
 
-    return d;
+    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0,
+                                                                        threads, thread_state);
+    libsais_partial_sorting_shift_markers_32s_4k(SA, n);
+    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads,
+                                                          thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
 }
 
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
-    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
-    SA[induction_bucket[T[n - 1]]++] =
-        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
-    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads,
+                                                          thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads,
+                                                          thread_state);
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
 
-    if (threads == 1 || n < 65536) {
-        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end > n) {
-                block_end = n;
-            }
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T,
+                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    libsais_count_suffixes_32s(T, n, k, buckets);
+    libsais_initialize_buckets_start_32s_1k(k, buckets);
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
 
-            d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
-                T, SA, k, buckets, d, thread_state[0].state.cache, block_start,
-                block_end - block_start, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
+    libsais_count_suffixes_32s(T, n, k, buckets);
+    libsais_initialize_buckets_end_32s_1k(k, buckets);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
 
-    return d;
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
 }
 
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                  sa_sint_t name, fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-    if (threads == 1 || n < 65536) {
-        libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
+        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+        sa_sint_t p0 = SA[i + 0];
+        SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p0 < 0;
+        sa_sint_t p1 = SA[i + 1];
+        SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p1 < 0;
+        sa_sint_t p2 = SA[i + 2];
+        SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p2 < 0;
+        sa_sint_t p3 = SA[i + 3];
+        SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p3 < 0;
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end > n) {
-                block_end = n;
-            }
 
-            libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
-                T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start,
-                threads);
-        }
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        sa_sint_t p = SA[i];
+        SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN;
+        name += p < 0;
     }
-#else
-    UNUSED(thread_state);
-#endif
+
+    return name;
 }
 
-static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                         const sa_sint_t * RESTRICT buckets,
-                                                         sa_sint_t threads) {
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                     fast_sint_t l, fast_sint_t omp_block_start,
+                                                     fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
-
-    fast_sint_t c;
+    l -= 1;
 
-#if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
-#else
-    UNUSED(threads);
-    UNUSED(n);
-#endif
-    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0);
-         c -= BUCKETS_INDEX2(1, 0)) {
-        fast_sint_t i, j;
-        sa_sint_t s = SAINT_MIN;
-        for (i = (fast_sint_t)temp_bucket[c] - 1,
-            j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3;
-             i >= j; i -= 4) {
-            libsais_prefetchw(&SA[i - prefetch_distance]);
+    fast_sint_t i, j;
+    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
+        j = (fast_sint_t)m + omp_block_start + 3;
+         i >= j; i -= 4) {
+        libsais_prefetch(&SA[i - prefetch_distance]);
 
-            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
-            s = s ^ q0;
-            SA[i - 0] = p0 ^ q0;
-            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
-            s = s ^ q1;
-            SA[i - 1] = p1 ^ q1;
-            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
-            s = s ^ q2;
-            SA[i - 2] = p2 ^ q2;
-            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
-            s = s ^ q3;
-            SA[i - 3] = p3 ^ q3;
-        }
+        sa_sint_t s0 = SA[i - 0];
+        SA[l] = s0 & SAINT_MAX;
+        l -= s0 < 0;
+        sa_sint_t s1 = SA[i - 1];
+        SA[l] = s1 & SAINT_MAX;
+        l -= s1 < 0;
+        sa_sint_t s2 = SA[i - 2];
+        SA[l] = s2 & SAINT_MAX;
+        l -= s2 < 0;
+        sa_sint_t s3 = SA[i - 3];
+        SA[l] = s3 & SAINT_MAX;
+        l -= s3 < 0;
+    }
 
-        for (j -= 3; i >= j; i -= 1) {
-            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
-            s = s ^ q;
-            SA[i] = p ^ q;
-        }
+    for (j -= 3; i >= j; i -= 1) {
+        sa_sint_t s = SA[i];
+        SA[l] = s & SAINT_MAX;
+        l -= s < 0;
     }
-}
 
-static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                             const sa_sint_t * RESTRICT buckets,
-                                                             sa_sint_t threads) {
-    const fast_sint_t prefetch_distance = 32;
+    l += 1;
 
-    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+    return l;
+}
 
-    fast_sint_t c;
+static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t name = 0;
+    {
+        (void)(threads);
+        (void)(thread_state);
 
-#if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && k >= 65536)
-#else
-    UNUSED(threads);
-#endif
-    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) {
-        fast_sint_t i, j;
-        sa_sint_t s = SAINT_MIN;
-        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1,
-            j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3;
-             i >= j; i -= 4) {
-            libsais_prefetchw(&SA[i - prefetch_distance]);
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 
-            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
-            s = s ^ q0;
-            SA[i - 0] = p0 ^ q0;
-            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
-            s = s ^ q1;
-            SA[i - 1] = p1 ^ q1;
-            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
-            s = s ^ q2;
-            SA[i - 2] = p2 ^ q2;
-            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
-            s = s ^ q3;
-            SA[i - 3] = p3 ^ q3;
-        }
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
-        for (j -= 3; i >= j; i -= 1) {
-            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
-            s = s ^ q;
-            SA[i] = p ^ q;
+        if (omp_num_threads == 1) {
+            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
         }
     }
+
+    return name;
 }
 
-static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) {
-    const fast_sint_t prefetch_distance = 32;
+static void libsais_gather_marked_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    {
+        (void)(threads);
+        (void)(thread_state);
 
-    fast_sint_t i;
-    sa_sint_t s = SUFFIX_GROUP_MARKER;
-    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
-        libsais_prefetchw(&SA[i - prefetch_distance]);
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 
-        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
-        s = s ^ q0;
-        SA[i - 0] = p0 ^ q0;
-        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
-        s = s ^ q1;
-        SA[i - 1] = p1 ^ q1;
-        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
-        s = s ^ q2;
-        SA[i - 2] = p2 ^ q2;
-        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
-                                       ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
-        s = s ^ q3;
-        SA[i - 3] = p3 ^ q3;
-    }
+        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+                                         ? omp_block_stride
+                                         : ((fast_sint_t)n >> 1) - omp_block_start;
 
-    for (; i >= 0; i -= 1) {
-        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
-                                 ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
-        s = s ^ q;
-        SA[i] = p ^ q;
+        if (omp_num_threads == 1) {
+            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs,
+                                              omp_block_start, omp_block_size);
+        }
     }
 }
 
-static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k,
-                                                         sa_sint_t * RESTRICT buckets) {
-    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
 
-    fast_sint_t i;
-    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
-         i += BUCKETS_INDEX2(1, 0)) {
-        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
-        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+    sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+    if (name < m) {
+        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+    } else {
+        fast_sint_t i;
+        for (i = 0; i < m; i += 1) {
+            SA[i] &= SAINT_MAX;
+        }
     }
+
+    return name;
 }
 
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                               sa_sint_t name,
+                                                               fast_sint_t omp_block_start,
+                                                               fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT SAm = &SA[m];
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+    sa_sint_t p0, p1, p2, p3 = 0;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t p0 = SA[i - 0];
-        d += (p0 < 0);
-        p0 &= SAINT_MAX;
-        sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        SA[--induction_bucket[v0]] =
-            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
-        distinct_names[v0] = d;
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
 
-        sa_sint_t p1 = SA[i - 1];
-        d += (p1 < 0);
-        p1 &= SAINT_MAX;
-        sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        SA[--induction_bucket[v1]] =
-            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
-        distinct_names[v1] = d;
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t p = SA[i];
-        d += (p < 0);
-        p &= SAINT_MAX;
-        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--induction_bucket[v]] =
-            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
-        distinct_names[v] = d;
-    }
-
-    return d;
-}
-
-#if defined(_OPENMP)
-
-static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
-    LIBSAIS_THREAD_STATE * RESTRICT state) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-    memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
-    sa_sint_t d = 1;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
-
-        sa_sint_t p0 = cache[count].index = SA[i - 0];
-        d += (p0 < 0);
-        p0 &= SAINT_MAX;
-        sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        induction_bucket[v0]++;
-        distinct_names[v0] = d;
-        sa_sint_t p1 = cache[count].index = SA[i - 1];
-        d += (p1 < 0);
-        p1 &= SAINT_MAX;
-        sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        induction_bucket[v1]++;
-        distinct_names[v1] = d;
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t p = cache[count].index = SA[i];
-        d += (p < 0);
-        p &= SAINT_MAX;
-        sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-        induction_bucket[v]++;
-        distinct_names[v] = d;
-    }
-
-    state[0].state.position = (fast_sint_t)d - 1;
-    state[0].state.count = count;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count, sa_sint_t d) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-    fast_sint_t i, j;
-    for (i = 0, j = count - 1; i < j; i += 2) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
-
-        sa_sint_t p0 = cache[i + 0].index;
-        d += (p0 < 0);
-        sa_sint_t v0 = cache[i + 0].symbol;
-        SA[--induction_bucket[v0]] =
-            (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
-        distinct_names[v0] = d;
-
-        sa_sint_t p1 = cache[i + 1].index;
-        d += (p1 < 0);
-        sa_sint_t v1 = cache[i + 1].symbol;
-        SA[--induction_bucket[v1]] =
-            (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
-        distinct_names[v1] = d;
-    }
-
-    for (j += 1; i < j; i += 1) {
-        sa_sint_t p = cache[i].index;
-        d += (p < 0);
-        sa_sint_t v = cache[i].symbol;
-        SA[--induction_bucket[v]] =
-            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
-        distinct_names[v] = d;
-    }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start,
-                                                              omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
-                    T, SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
-                    &thread_state[omp_thread_num]);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_induction_bucket =
-                        &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
-                    sa_sint_t * RESTRICT temp_distinct_names =
-                        &thread_state[t].state.buckets[2 * ALPHABET_SIZE];
-
-                    fast_sint_t c;
-                    for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
-                        induction_bucket[c] = A - B;
-                        temp_induction_bucket[c] = A;
-                    }
-
-                    for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
-                        distinct_names[c] = B > 0 ? D : A;
-                        temp_distinct_names[c] = A;
-                    }
-                    d += 1 + (sa_sint_t)thread_state[t].state.position;
-                    thread_state[t].state.position =
-                        (fast_sint_t)d - thread_state[t].state.position;
-                }
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_partial_sorting_scan_right_to_left_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count,
-                    (sa_sint_t)thread_state[omp_thread_num].state.position);
-            }
-        }
-    #endif
-    }
-
-    return d;
-}
-
-#endif
-
-static void libsais_partial_sorting_scan_right_to_left_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
-    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
-    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
-
-    if (threads == 1 || (scan_end - scan_start) < 65536) {
-        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start,
-                                                      scan_end - scan_start);
-    }
-#if defined(_OPENMP)
-    else {
-        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
-        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
-        fast_sint_t block_start;
-        for (block_start = scan_end - 1; block_start >= scan_start;) {
-            if (SA[block_start] == 0) {
-                block_start--;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start - ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end < scan_start) {
-                    block_max_end = scan_start - 1;
-                }
-                fast_sint_t block_end = block_start - 1;
-                while (block_end > block_max_end && SA[block_end] != 0) {
-                    block_end--;
-                }
-                fast_sint_t block_size = block_start - block_end;
-
-                if (block_size < 32) {
-                    for (; block_start > block_end; block_start -= 1) {
-                        sa_sint_t p = SA[block_start];
-                        d += (p < 0);
-                        p &= SAINT_MAX;
-                        sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-                        SA[--induction_bucket[v]] =
-                            (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
-                        distinct_names[v] = d;
-                    }
-                } else {
-                    d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(
-                        T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
-            }
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
-
-        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX;
-        sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
-        libsais_prefetchw(&buckets[v0]);
-        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX;
-        sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
-        libsais_prefetchw(&buckets[v1]);
-
-        sa_sint_t p2 = SA[i - 0];
-        d += (p2 < 0);
-        p2 &= SAINT_MAX;
-        sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
-        SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
-        buckets[2 + v2] = d;
-
-        sa_sint_t p3 = SA[i - 1];
-        d += (p3 < 0);
-        p3 &= SAINT_MAX;
-        sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
-        SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
-        buckets[2 + v3] = d;
-    }
-
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t p = SA[i];
-        d += (p < 0);
-        p &= SAINT_MAX;
-        sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
-        SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
-        buckets[2 + v] = d;
-    }
-
-    return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
-    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
-        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
-        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
-        if (s2 > 0) {
-            const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
-            libsais_prefetchw(&induction_bucket[Ts2]);
-            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
-        }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
-        if (s3 > 0) {
-            const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
-            libsais_prefetchw(&induction_bucket[Ts3]);
-            libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
-        }
-
-        sa_sint_t p0 = SA[i - 0];
-        if (p0 > 0) {
-            SA[i - 0] = 0;
-            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
-            p0 &= ~SUFFIX_GROUP_MARKER;
-            sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-            SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v0] = d;
-        }
-
-        sa_sint_t p1 = SA[i - 1];
-        if (p1 > 0) {
-            SA[i - 1] = 0;
-            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
-            p1 &= ~SUFFIX_GROUP_MARKER;
-            sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-            SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v1] = d;
-        }
-    }
-
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t p = SA[i];
-        if (p > 0) {
-            SA[i] = 0;
-            d += (p >> (SUFFIX_GROUP_BIT - 1));
-            p &= ~SUFFIX_GROUP_MARKER;
-            sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-            SA[--induction_bucket[T[p - 1]]] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
-                ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v] = d;
-        }
-    }
-
-    return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T,
-                                                              sa_sint_t * RESTRICT SA,
-                                                              sa_sint_t * RESTRICT induction_bucket,
-                                                              fast_sint_t omp_block_start,
-                                                              fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
-        if (s2 > 0) {
-            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
-            libsais_prefetch(&T[s2] - 2);
-        }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
-        if (s3 > 0) {
-            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
-            libsais_prefetch(&T[s3] - 2);
-        }
-
-        sa_sint_t p0 = SA[i - 0];
-        if (p0 > 0) {
-            SA[i - 0] = 0;
-            SA[--induction_bucket[T[p0 - 1]]] =
-                (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i - 1];
-        if (p1 > 0) {
-            SA[i - 1] = 0;
-            SA[--induction_bucket[T[p1 - 1]]] =
-                (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
-        }
-    }
-
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t p = SA[i];
-        if (p > 0) {
-            SA[i] = 0;
-            SA[--induction_bucket[T[p - 1]]] =
-                (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
-        }
-    }
-}
-
-#if defined(_OPENMP)
-
-static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
-        libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
-
-        libsais_prefetchw(&cache[i + prefetch_distance]);
-
-        sa_sint_t p0 = cache[i + 0].index = SA[i + 0];
-        sa_sint_t symbol0 = 0;
-        p0 &= SAINT_MAX;
-        if (p0 != 0) {
-            symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t p1 = cache[i + 1].index = SA[i + 1];
-        sa_sint_t symbol1 = 0;
-        p1 &= SAINT_MAX;
-        if (p1 != 0) {
-            symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        }
-        cache[i + 1].symbol = symbol1;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = cache[i].index = SA[i];
-        sa_sint_t symbol = 0;
-        p &= SAINT_MAX;
-        if (p != 0) {
-            symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
-        }
-        cache[i].symbol = symbol;
-    }
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        libsais_prefetchw(&cache[i + prefetch_distance]);
-
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        if (p0 > 0) {
-            SA[i + 0] = 0;
-            cache[i + 0].index = p0;
-            p0 &= ~SUFFIX_GROUP_MARKER;
-            symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        if (p1 > 0) {
-            SA[i + 1] = 0;
-            cache[i + 1].index = p1;
-            p1 &= ~SUFFIX_GROUP_MARKER;
-            symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
-        }
-        cache[i + 1].symbol = symbol1;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        if (p > 0) {
-            SA[i] = 0;
-            cache[i].index = p;
-            p &= ~SUFFIX_GROUP_MARKER;
-            symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
-        }
-        cache[i].symbol = symbol;
-    }
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        libsais_prefetchw(&cache[i + prefetch_distance]);
-
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        if (p0 > 0) {
-            SA[i + 0] = 0;
-            cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
-            symbol0 = T[p0 - 1];
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        if (p1 > 0) {
-            SA[i + 1] = 0;
-            cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
-            symbol1 = T[p1 - 1];
-        }
-        cache[i + 1].symbol = symbol1;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        if (p > 0) {
-            SA[i] = 0;
-            cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
-            symbol = T[p - 1];
-        }
-        cache[i].symbol = symbol;
-    }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
-
-        libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
-        libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);
-
-        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index;
-        d += (p0 < 0);
-        cache[i - 0].symbol = --buckets[v0];
-        cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
-        buckets[2 + v0] = d;
-        if (cache[i - 0].symbol >= omp_block_start) {
-            sa_sint_t s = cache[i - 0].symbol,
-                      q = (cache[s].index = cache[i - 0].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
-        }
-
-        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index;
-        d += (p1 < 0);
-        cache[i - 1].symbol = --buckets[v1];
-        cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
-        buckets[2 + v1] = d;
-        if (cache[i - 1].symbol >= omp_block_start) {
-            sa_sint_t s = cache[i - 1].symbol,
-                      q = (cache[s].index = cache[i - 1].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
-        }
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t v = cache[i].symbol, p = cache[i].index;
-        d += (p < 0);
-        cache[i].symbol = --buckets[v];
-        cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
-        buckets[2 + v] = d;
-        if (cache[i].symbol >= omp_block_start) {
-            sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
-            cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
-        }
-    }
-
-    return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
-    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0 >> 1];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        const sa_sint_t * Ds0 = &distinct_names[s0];
-        libsais_prefetchw(s0 >= 0 ? Ds0 : NULL);
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1 >> 1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-        const sa_sint_t * Ds1 = &distinct_names[s1];
-        libsais_prefetchw(s1 >= 0 ? Ds1 : NULL);
-
-        sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0) {
-            sa_sint_t p0 = cache[i - 0].index;
-            d += (p0 >> (SUFFIX_GROUP_BIT - 1));
-            cache[i - 0].symbol = --induction_bucket[v0 >> 1];
-            cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v0] = d;
-            if (cache[i - 0].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
-                if (np > 0) {
-                    cache[i - 0].index = 0;
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
-                }
-            }
-        }
-
-        sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0) {
-            sa_sint_t p1 = cache[i - 1].index;
-            d += (p1 >> (SUFFIX_GROUP_BIT - 1));
-            cache[i - 1].symbol = --induction_bucket[v1 >> 1];
-            cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
-                                 ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v1] = d;
-            if (cache[i - 1].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
-                if (np > 0) {
-                    cache[i - 1].index = 0;
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
-                }
-            }
-        }
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            sa_sint_t p = cache[i].index;
-            d += (p >> (SUFFIX_GROUP_BIT - 1));
-            cache[i].symbol = --induction_bucket[v >> 1];
-            cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
-                             ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
-            distinct_names[v] = d;
-            if (cache[i].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                if (np > 0) {
-                    cache[i].index = 0;
-                    cache[ni].index = np;
-                    np &= ~SUFFIX_GROUP_MARKER;
-                    cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
-                }
-            }
-        }
-    }
-
-    return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-
-        sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0) {
-            cache[i - 0].symbol = --induction_bucket[v0];
-            if (cache[i - 0].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
-                if (np > 0) {
-                    cache[i - 0].index = 0;
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                }
-            }
-        }
-
-        sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0) {
-            cache[i - 1].symbol = --induction_bucket[v1];
-            if (cache[i - 1].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
-                if (np > 0) {
-                    cache[i - 1].index = 0;
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                }
-            }
-        }
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            cache[i].symbol = --induction_bucket[v];
-            if (cache[i].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                if (np > 0) {
-                    cache[i].index = 0;
-                    cache[ni].index =
-                        (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np - 1];
-                }
-            }
-        }
-    }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
-    fast_sint_t block_size, sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d,
-                                                                  omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
-                    T, buckets, d, cache - block_start, block_start, block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                              omp_block_size);
-            }
-        }
-    #endif
-    }
-
-    return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d,
-                                                                  omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
-                    T, k, buckets, d, cache - block_start, block_start, block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
-            }
-        }
-    #endif
-    }
-
-    return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start,
-                                                              omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
-                    T, buckets, cache - block_start, block_start, block_size);
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
-            }
-        }
-    #endif
-    }
-}
-
-#endif
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
-    sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
-    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
-
-    if (threads == 1 || (scan_end - scan_start) < 65536) {
-        d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start,
-                                                              scan_end - scan_start);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end < scan_start) {
-                block_end = scan_start - 1;
-            }
-
-            d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
-                T, SA, buckets, d, thread_state[0].state.cache, block_end + 1,
-                block_start - block_end, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-
-    return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    if (threads == 1 || n < 65536) {
-        d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end < 0) {
-                block_end = -1;
-            }
-
-            d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
-                T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1,
-                block_start - block_end, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-
-    return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    if (threads == 1 || n < 65536) {
-        libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end < 0) {
-                block_end = -1;
-            }
-
-            libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
-                T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end,
-                threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-}
-
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
-                                                                      fast_sint_t omp_block_start,
-                                                                      fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
-         i += 4) {
-        libsais_prefetch(&SA[i + prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + 0];
-        SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
-        l += (s0 < 0);
-        sa_sint_t s1 = SA[i + 1];
-        SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
-        l += (s1 < 0);
-        sa_sint_t s2 = SA[i + 2];
-        SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
-        l += (s2 < 0);
-        sa_sint_t s3 = SA[i + 3];
-        SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
-        l += (s3 < 0);
-    }
-
-    for (j += 3; i < j; i += 1) {
-        sa_sint_t s = SA[i];
-        SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
-        l += (s < 0);
-    }
-
-    return l;
-}
-
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
-                                                                      fast_sint_t omp_block_start,
-                                                                      fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j, l;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
-         i += 4) {
-        libsais_prefetch(&SA[i + prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + 0];
-        SA[l] = s0 & SAINT_MAX;
-        l += (s0 < 0);
-        sa_sint_t s1 = SA[i + 1];
-        SA[l] = s1 & SAINT_MAX;
-        l += (s1 < 0);
-        sa_sint_t s2 = SA[i + 2];
-        SA[l] = s2 & SAINT_MAX;
-        l += (s2 < 0);
-        sa_sint_t s3 = SA[i + 3];
-        SA[l] = s3 & SAINT_MAX;
-        l += (s3 < 0);
-    }
-
-    for (j += 3; i < j; i += 1) {
-        sa_sint_t s = SA[i];
-        SA[l] = s & SAINT_MAX;
-        l += (s < 0);
-    }
-
-    return l;
-}
-
-static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-#endif
-        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
-        }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start;
-                thread_state[omp_thread_num].state.count =
-                    libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start,
-                                                                       omp_block_size) -
-                    omp_block_start;
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                fast_sint_t t, position = 0;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    if (t > 0 && thread_state[t].state.count > 0) {
-                        memmove(&SA[position], &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
-                    }
-
-                    position += thread_state[t].state.count;
-                }
-            }
-        }
-#endif
-    }
-}
-
-static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-#endif
-        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
-        }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.position = omp_block_start;
-                thread_state[omp_thread_num].state.count =
-                    libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start,
-                                                                       omp_block_size) -
-                    omp_block_start;
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                fast_sint_t t, position = 0;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    if (t > 0 && thread_state[t].state.count > 0) {
-                        memmove(&SA[position], &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
-                    }
-
-                    position += thread_state[t].state.count;
-                }
-            }
-        }
-#endif
-    }
-}
-
-static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                sa_sint_t n, sa_sint_t * RESTRICT buckets,
-                                                sa_sint_t first_lms_suffix,
-                                                sa_sint_t left_suffixes_count, sa_sint_t threads,
-                                                LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
-        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
-    libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
-    libsais_partial_sorting_scan_right_to_left_8u_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_6k_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
-    sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
-        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
-    libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
-    libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
-        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
-                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
-    sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0,
-                                                                        threads, thread_state);
-    libsais_partial_sorting_shift_markers_32s_4k(SA, n);
-    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads,
-                                                          thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
-                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads,
-                                                          thread_state);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads,
-                                                          thread_state);
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T,
-                                                    sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
-                                                    sa_sint_t threads,
-                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    libsais_count_suffixes_32s(T, n, k, buckets);
-    libsais_initialize_buckets_start_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
-
-    libsais_count_suffixes_32s(T, n, k, buckets);
-    libsais_initialize_buckets_end_32s_1k(k, buckets);
-    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
-
-    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
-}
-
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                  sa_sint_t name, fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT SAm = &SA[m];
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
-        sa_sint_t p0 = SA[i + 0];
-        SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
-        name += p0 < 0;
-        sa_sint_t p1 = SA[i + 1];
-        SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN;
-        name += p1 < 0;
-        sa_sint_t p2 = SA[i + 2];
-        SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN;
-        name += p2 < 0;
-        sa_sint_t p3 = SA[i + 3];
-        SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN;
-        name += p3 < 0;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN;
-        name += p < 0;
-    }
-
-    return name;
-}
-
-static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                     fast_sint_t l, fast_sint_t omp_block_start,
-                                                     fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    l -= 1;
-
-    fast_sint_t i, j;
-    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1,
-        j = (fast_sint_t)m + omp_block_start + 3;
-         i >= j; i -= 4) {
-        libsais_prefetch(&SA[i - prefetch_distance]);
-
-        sa_sint_t s0 = SA[i - 0];
-        SA[l] = s0 & SAINT_MAX;
-        l -= s0 < 0;
-        sa_sint_t s1 = SA[i - 1];
-        SA[l] = s1 & SAINT_MAX;
-        l -= s1 < 0;
-        sa_sint_t s2 = SA[i - 2];
-        SA[l] = s2 & SAINT_MAX;
-        l -= s2 < 0;
-        sa_sint_t s3 = SA[i - 3];
-        SA[l] = s3 & SAINT_MAX;
-        l -= s3 < 0;
-    }
-
-    for (j -= 3; i >= j; i -= 1) {
-        sa_sint_t s = SA[i];
-        SA[l] = s & SAINT_MAX;
-        l -= s < 0;
-    }
-
-    l += 1;
-
-    return l;
-}
-
-static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t name = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-#endif
-        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
-        }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t t, count = 0;
-                for (t = 0; t < omp_thread_num; ++t) {
-                    count += thread_state[t].state.count;
-                }
-
-                if (omp_thread_num == omp_num_threads - 1) {
-                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
-                }
-
-                libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start,
-                                                 omp_block_size);
-            }
-        }
-#endif
-    }
-
-    return name;
-}
-
-static void libsais_gather_marked_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-#endif
-        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : ((fast_sint_t)n >> 1) - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs,
-                                              omp_block_start, omp_block_size);
-        }
-#if defined(_OPENMP)
-        else {
-            {
-                if (omp_thread_num < omp_num_threads - 1) {
-                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
-                        SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start,
-                        omp_block_size);
-                    thread_state[omp_thread_num].state.count =
-                        (fast_sint_t)m + omp_block_start + omp_block_size -
-                        thread_state[omp_thread_num].state.position;
-                } else {
-                    thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
-                        SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
-                    thread_state[omp_thread_num].state.count =
-                        (fast_sint_t)n + (fast_sint_t)fs -
-                        thread_state[omp_thread_num].state.position;
-                }
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;
-
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    position -= thread_state[t].state.count;
-                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
-                        memmove(&SA[position], &SA[thread_state[t].state.position],
-                                (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
-                    }
-                }
-            }
-        }
-#endif
-    }
-}
-
-static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
-
-    sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
-    if (name < m) {
-        libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
-    } else {
-        fast_sint_t i;
-        for (i = 0; i < m; i += 1) {
-            SA[i] &= SAINT_MAX;
-        }
-    }
-
-    return name;
-}
-
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                               sa_sint_t name,
-                                                               fast_sint_t omp_block_start,
-                                                               fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT SAm = &SA[m];
-
-    fast_sint_t i, j;
-    sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
-        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
-        p0 = SA[i + 0];
-        SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
-        name += p0 < 0;
-        p1 = SA[i + 1];
-        SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN);
-        name += p1 < 0;
-        p2 = SA[i + 2];
-        SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN);
-        name += p2 < 0;
-        p3 = SA[i + 3];
-        SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
-        name += p3 < 0;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        p2 = p3;
-        p3 = SA[i];
-        SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
-        name += p3 < 0;
-    }
-
-    return name;
-}
-
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                   fast_sint_t omp_block_start,
-                                                   fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    sa_sint_t p0, p1, p2, p3 = 0;
-    for (i = (fast_sint_t)m + omp_block_start,
-        j = (fast_sint_t)m + omp_block_start + omp_block_size - 3;
-         i < j; i += 4) {
-        libsais_prefetchw(&SA[i + prefetch_distance]);
-
-        p0 = SA[i + 0];
-        SA[i + 0] = p0 & (p3 | SAINT_MAX);
-        p0 = (p0 == 0) ? p3 : p0;
-        p1 = SA[i + 1];
-        SA[i + 1] = p1 & (p0 | SAINT_MAX);
-        p1 = (p1 == 0) ? p0 : p1;
-        p2 = SA[i + 2];
-        SA[i + 2] = p2 & (p1 | SAINT_MAX);
-        p2 = (p2 == 0) ? p1 : p2;
-        p3 = SA[i + 3];
-        SA[i + 3] = p3 & (p2 | SAINT_MAX);
-        p3 = (p3 == 0) ? p2 : p3;
-    }
-
-    for (j += 3; i < j; i += 1) {
-        p2 = p3;
-        p3 = SA[i];
-        SA[i] = p3 & (p2 | SAINT_MAX);
-        p3 = (p3 == 0) ? p2 : p3;
-    }
-}
-
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                                  fast_sint_t omp_block_start,
-                                                  fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT SAm = &SA[m];
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
-        libsais_prefetchw(&SAm[i + prefetch_distance]);
-
-        SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
-        SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
-        SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
-        SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
-    }
-
-    for (j += 3; i < j; i += 1) {
-        SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
-    }
-}
-
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    sa_sint_t name = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-#endif
-        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
-        if (omp_num_threads == 1) {
-            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start,
-                                                                 omp_block_size);
-        }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t t, count = 1;
-                for (t = 0; t < omp_thread_num; ++t) {
-                    count += thread_state[t].state.count;
-                }
-
-                if (omp_thread_num == omp_num_threads - 1) {
-                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
-                }
-
-                libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count,
-                                                              omp_block_start, omp_block_size);
-            }
-        }
-#endif
-    }
-
-    return name - 1;
-}
-
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t m, sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : ((fast_sint_t)n >> 1) - omp_block_start;
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_block_start = 0;
-        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
-#endif
-        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
-    }
-}
-
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                      sa_sint_t m, sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : ((fast_sint_t)n >> 1) - omp_block_start;
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_block_start = 0;
-        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
-#endif
-        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
-    }
-}
-
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
-    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
-
-    sa_sint_t name =
-        libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
-    if (name < m) {
-        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
-    }
-
-    return name;
-}
-
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
-    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT SAm = &SA[m];
-
-    {
-        libsais_gather_lms_suffixes_32s(T, SA, n);
-
-        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
-
-        fast_sint_t i, j;
-        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3;
-             i < j; i += 4) {
-            libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
-
-            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
-            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
-        }
-
-        for (j += prefetch_distance + 3; i < j; i += 1) {
-            SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
-        }
-
-        SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
-    }
-
-    { libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); }
-
-    sa_sint_t name = 1;
-
-    {
-        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1];
-        sa_sint_t pdiff = SAINT_MIN;
-        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
-            libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
-            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
-            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
-            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
-
-            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
-            sa_sint_t qdiff = SAINT_MIN;
-            if (plen == qlen) {
-                fast_sint_t l = 0;
-                do {
-                    if (T[p + l] != T[q + l]) {
-                        break;
-                    }
-                } while (++l < qlen);
-                qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
-            }
-            SAm[p >> 1] = name | (pdiff & qdiff);
-            name += (qdiff < 0);
-
-            p = SA[i + 1];
-            plen = SAm[p >> 1];
-            pdiff = SAINT_MIN;
-            if (qlen == plen) {
-                fast_sint_t l = 0;
-                do {
-                    if (T[q + l] != T[p + l]) {
-                        break;
-                    }
-                } while (++l < plen);
-                pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
-            }
-            SAm[q >> 1] = name | (qdiff & pdiff);
-            name += (pdiff < 0);
-        }
-
-        for (j += prefetch_distance + 1; i < j; i += 1) {
-            fast_sint_t q = SA[i], qlen = SAm[q >> 1];
-            sa_sint_t qdiff = SAINT_MIN;
-            if (plen == qlen) {
-                fast_sint_t l = 0;
-                do {
-                    if (T[p + l] != T[q + l]) {
-                        break;
-                    }
-                } while (++l < plen);
-                qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
-            }
-            SAm[p >> 1] = name | (pdiff & qdiff);
-            name += (qdiff < 0);
-
-            p = q;
-            plen = qlen;
-            pdiff = qdiff;
-        }
-
-        SAm[p >> 1] = name | pdiff;
-        name++;
-    }
-
-    if (name <= m) {
-        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
-    }
-
-    return name - 1;
-}
-
-static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-                                             fast_sint_t omp_block_start,
-                                             fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    const sa_sint_t * RESTRICT SAnm = &SA[n - m];
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
-        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]);
-        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]);
-        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]);
-
-        SA[i + 0] = SAnm[SA[i + 0]];
-        SA[i + 1] = SAnm[SA[i + 1]];
-        SA[i + 2] = SAnm[SA[i + 2]];
-        SA[i + 3] = SAnm[SA[i + 3]];
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        SA[i] = SAnm[SA[i]];
-    }
-}
-
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
-                                                 sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
-#endif
-    {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-#else
-        UNUSED(threads);
-
-        fast_sint_t omp_block_start = 0;
-        fast_sint_t omp_block_size = m;
-#endif
-
-        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
-    }
-}
-
-static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                   sa_sint_t m,
-                                                   const sa_sint_t * RESTRICT buckets) {
-    const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
-
-    fast_sint_t c, j = n;
-    for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
-                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0) {
-            fast_sint_t i = bucket_end[c];
-            if (j - i > 0) {
-                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-            }
-
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t k, sa_sint_t m,
-                                                       const sa_sint_t * RESTRICT buckets) {
-    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
-    fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
-                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0) {
-            fast_sint_t i = bucket_end[c];
-            if (j - i > 0) {
-                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-            }
-
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                       sa_sint_t k, sa_sint_t m,
-                                                       const sa_sint_t * RESTRICT buckets) {
-    fast_sint_t j = n;
-
-    if (k > 1) {
-        fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
-             c -= BUCKETS_INDEX2(1, 0)) {
-            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
-                            (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
-            if (l > 0) {
-                fast_sint_t i = buckets[c];
-                if (j - i > 0) {
-                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-                }
-
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-            }
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T,
-                                                       sa_sint_t * RESTRICT SA, sa_sint_t k,
-                                                       sa_sint_t m, sa_sint_t * RESTRICT buckets) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t c = k - 1;
-    fast_sint_t i, l = buckets[c];
-    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) {
-        libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
-        libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
-        libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
-
-        sa_sint_t p0 = SA[i - 0];
-        if (T[p0] != c) {
-            c = T[p0];
-            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
-            l = buckets[c];
-        }
-        SA[--l] = p0;
-        sa_sint_t p1 = SA[i - 1];
-        if (T[p1] != c) {
-            c = T[p1];
-            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
-            l = buckets[c];
-        }
-        SA[--l] = p1;
-        sa_sint_t p2 = SA[i - 2];
-        if (T[p2] != c) {
-            c = T[p2];
-            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
-            l = buckets[c];
-        }
-        SA[--l] = p2;
-        sa_sint_t p3 = SA[i - 3];
-        if (T[p3] != c) {
-            c = T[p3];
-            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
-            l = buckets[c];
-        }
-        SA[--l] = p3;
-    }
-
-    for (; i >= 0; i -= 1) {
-        sa_sint_t p = SA[i];
-        if (T[p] != c) {
-            c = T[p];
-            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
-            l = buckets[c];
-        }
-        SA[--l] = p;
-    }
-
-    memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
-                                                        const sa_sint_t * RESTRICT buckets) {
-    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
-
-    fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
-        if (l > 0) {
-            fast_sint_t i = bucket_end[c];
-            if (j - i > 0) {
-                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-            }
-
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
-                                                        const sa_sint_t * RESTRICT buckets) {
-    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
-    fast_sint_t c, j = n;
-    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
-        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
-        if (l > 0) {
-            fast_sint_t i = bucket_end[c];
-            if (j - i > 0) {
-                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-            }
-
-            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
-                                                        sa_sint_t k, sa_sint_t m,
-                                                        const sa_sint_t * RESTRICT buckets) {
-    fast_sint_t j = n;
-
-    if (k > 1) {
-        fast_sint_t c;
-        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
-             c -= BUCKETS_INDEX2(1, 0)) {
-            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
-            if (l > 0) {
-                fast_sint_t i = buckets[c];
-                if (j - i > 0) {
-                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
-                }
-
-                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
-            }
-        }
-    }
-
-    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
-                                                    sa_sint_t * RESTRICT induction_bucket,
-                                                    fast_sint_t omp_block_start,
-                                                    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const u8 * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const u8 * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 & SAINT_MAX;
-        if (p0 > 0) {
-            p0--;
-            SA[i + 0] = T[p0] | SAINT_MIN;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 & SAINT_MAX;
-        if (p1 > 0) {
-            p1--;
-            SA[i + 1] = T[p1] | SAINT_MIN;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-        }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX;
-        if (p > 0) {
-            p--;
-            SA[i] = T[p] | SAINT_MIN;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-        }
-    }
-}
-
-static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
-                                                        sa_sint_t * RESTRICT I,
-                                                        sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const u8 * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const u8 * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 & SAINT_MAX;
-        if (p0 > 0) {
-            p0--;
-            SA[i + 0] = T[p0] | SAINT_MIN;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-            if ((p0 & rm) == 0) {
-                I[p0 / (rm + 1)] = induction_bucket[T[p0]];
-            }
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 & SAINT_MAX;
-        if (p1 > 0) {
-            p1--;
-            SA[i + 1] = T[p1] | SAINT_MIN;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-            if ((p1 & rm) == 0) {
-                I[p1 / (rm + 1)] = induction_bucket[T[p1]];
-            }
-        }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX;
-        if (p > 0) {
-            p--;
-            SA[i] = T[p] | SAINT_MIN;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-            if ((p & rm) == 0) {
-                I[p / (rm + 1)] = induction_bucket[T[p]];
-            }
-        }
+        p0 = SA[i + 0];
+        SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
+        name += p0 < 0;
+        p1 = SA[i + 1];
+        SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN);
+        name += p1 < 0;
+        p2 = SA[i + 2];
+        SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN);
+        name += p2 < 0;
+        p3 = SA[i + 3];
+        SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+        name += p3 < 0;
     }
-}
-
-static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
-                                                        sa_sint_t * RESTRICT SA,
-                                                        sa_sint_t * RESTRICT induction_bucket,
-                                                        fast_sint_t omp_block_start,
-                                                        fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const u8 * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const u8 * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 ^ SAINT_MIN;
-        if (p0 > 0) {
-            p0--;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 ^ SAINT_MIN;
-        if (p1 > 0) {
-            p1--;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-        }
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        p2 = p3;
+        p3 = SA[i];
+        SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+        name += p3 < 0;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p ^ SAINT_MIN;
-        if (p > 0) {
-            p--;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-        }
-    }
+    return name;
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T,
-                                                         sa_sint_t * RESTRICT SA,
-                                                         sa_sint_t * RESTRICT induction_bucket,
-                                                         fast_sint_t omp_block_start,
-                                                         fast_sint_t omp_block_size) {
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                   fast_sint_t omp_block_start,
+                                                   fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
-         i < j; i += 2) {
-        libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
-        if (s2 > 0) {
-            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
-            libsais_prefetch(&T[s2] - 2);
-        }
-        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
-        if (s3 > 0) {
-            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
-            libsais_prefetch(&T[s3] - 2);
-        }
-
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 ^ SAINT_MIN;
-        if (p0 > 0) {
-            p0--;
-            SA[induction_bucket[T[p0]]++] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 ^ SAINT_MIN;
-        if (p1 > 0) {
-            p1--;
-            SA[induction_bucket[T[p1]]++] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-        }
-    }
-
-    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p ^ SAINT_MIN;
-        if (p > 0) {
-            p--;
-            SA[induction_bucket[T[p]]++] =
-                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-        }
-    }
-}
-
-#if defined(_OPENMP)
-
-static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const u8 * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const u8 * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 & SAINT_MAX;
-        if (p0 > 0) {
-            p0--;
-            SA[i + 0] = T[p0] | SAINT_MIN;
-            buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 & SAINT_MAX;
-        if (p1 > 0) {
-            p1--;
-            SA[i + 1] = T[p1] | SAINT_MIN;
-            buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-        }
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX;
-        if (p > 0) {
-            p--;
-            SA[i] = T[p] | SAINT_MIN;
-            buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-        }
-    }
-
-    return count;
-}
-
-static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const u8 * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const u8 * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+    sa_sint_t p0, p1, p2, p3 = 0;
+    for (i = (fast_sint_t)m + omp_block_start,
+        j = (fast_sint_t)m + omp_block_start + omp_block_size - 3;
+         i < j; i += 4) {
+        libsais_prefetchw(&SA[i + prefetch_distance]);
 
-        sa_sint_t p0 = SA[i + 0];
-        SA[i + 0] = p0 ^ SAINT_MIN;
-        if (p0 > 0) {
-            p0--;
-            buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-        }
-        sa_sint_t p1 = SA[i + 1];
-        SA[i + 1] = p1 ^ SAINT_MIN;
-        if (p1 > 0) {
-            p1--;
-            buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-        }
+        p0 = SA[i + 0];
+        SA[i + 0] = p0 & (p3 | SAINT_MAX);
+        p0 = (p0 == 0) ? p3 : p0;
+        p1 = SA[i + 1];
+        SA[i + 1] = p1 & (p0 | SAINT_MAX);
+        p1 = (p1 == 0) ? p0 : p1;
+        p2 = SA[i + 2];
+        SA[i + 2] = p2 & (p1 | SAINT_MAX);
+        p2 = (p2 == 0) ? p1 : p2;
+        p3 = SA[i + 3];
+        SA[i + 3] = p3 & (p2 | SAINT_MAX);
+        p3 = (p3 == 0) ? p2 : p3;
     }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t p = SA[i];
-        SA[i] = p ^ SAINT_MIN;
-        if (p > 0) {
-            p--;
-            buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-        }
+    for (j += 3; i < j; i += 1) {
+        p2 = p3;
+        p3 = SA[i];
+        SA[i] = p3 & (p2 | SAINT_MAX);
+        p3 = (p3 == 0) ? p2 : p3;
     }
-
-    return count;
 }
 
-static void libsais_final_order_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count) {
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                  fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
+    sa_sint_t * RESTRICT SAm = &SA[m];
+
     fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
+        libsais_prefetchw(&SAm[i + prefetch_distance]);
 
-        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
-        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
-        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
-        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
+        SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
+        SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
+        SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
+        SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
     }
 
     for (j += 3; i < j; i += 1) {
-        SA[buckets[cache[i].symbol]++] = cache[i].index;
+        SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
     }
 }
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
-    const fast_sint_t prefetch_distance = 32;
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t name = 0;
+    {
+        (void)(threads);
+        (void)(thread_state);
 
-    fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
+        fast_sint_t omp_thread_num = 0;
+        fast_sint_t omp_num_threads = 1;
 
-        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
-        if ((cache[i + 0].index & rm) == 0) {
-            I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol];
-        }
-        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
-        if ((cache[i + 1].index & rm) == 0) {
-            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol];
-        }
-        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
-        if ((cache[i + 2].index & rm) == 0) {
-            I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol];
-        }
-        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
-        if ((cache[i + 3].index & rm) == 0) {
-            I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol];
-        }
-    }
+        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
+        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+        fast_sint_t omp_block_size =
+            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
 
-    for (j += 3; i < j; i += 1) {
-        SA[buckets[cache[i].symbol]++] = cache[i].index;
-        if ((cache[i].index & rm) == 0) {
-            I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol];
+        if (omp_num_threads == 1) {
+            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start,
+                                                                 omp_block_size);
         }
     }
+
+    return name - 1;
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t m, sa_sint_t threads) {
+    {
+        (void)(threads);
 
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
 
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
+    }
+}
 
-        libsais_prefetchw(&cache[i + prefetch_distance]);
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t threads) {
+    {
+        (void)(threads);
 
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        SA[i + 0] = p0 ^ SAINT_MIN;
-        if (p0 > 0) {
-            p0--;
-            cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
-            symbol0 = T[p0];
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        SA[i + 1] = p1 ^ SAINT_MIN;
-        if (p1 > 0) {
-            p1--;
-            cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
-            symbol1 = T[p1];
-        }
-        cache[i + 1].symbol = symbol1;
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
+
+        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
     }
+}
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        SA[i] = p ^ SAINT_MIN;
-        if (p > 0) {
-            p--;
-            cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-            symbol = T[p];
-        }
-        cache[i].symbol = symbol;
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+    sa_sint_t name =
+        libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+    if (name < m) {
+        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
     }
+
+    return name;
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
     const fast_sint_t prefetch_distance = 32;
 
-    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
-    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
-        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-
-        sa_sint_t v0 = cache[i + 0].symbol;
-        if (v0 >= 0) {
-            cache[i + 0].symbol = induction_bucket[v0]++;
-            if (cache[i + 0].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
-                cache[i + 0].index = np ^ SAINT_MIN;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
-        }
+    sa_sint_t * RESTRICT SAm = &SA[m];
 
-        sa_sint_t v1 = cache[i + 1].symbol;
-        if (v1 >= 0) {
-            cache[i + 1].symbol = induction_bucket[v1]++;
-            if (cache[i + 1].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
-                cache[i + 1].index = np ^ SAINT_MIN;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
+    {
+        libsais_gather_lms_suffixes_32s(T, SA, n);
+
+        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+
+        fast_sint_t i, j;
+        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3;
+             i < j; i += 4) {
+            libsais_prefetch(&SA[i + 2 * prefetch_distance]);
+
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
         }
-    }
 
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            cache[i].symbol = induction_bucket[v]++;
-            if (cache[i].symbol < omp_block_end) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                cache[i].index = np ^ SAINT_MIN;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
+        for (j += prefetch_distance + 3; i < j; i += 1) {
+            SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
         }
-    }
-}
 
-static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
+    }
 
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+    { libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); }
 
-        omp_block_start += block_start;
+    sa_sint_t name = 1;
 
-        if (omp_num_threads == 1) {
-            libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
-                                                    omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_bwt_scan_left_to_right_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
+    {
+        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1];
+        sa_sint_t pdiff = SAINT_MIN;
+        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
+            libsais_prefetch(&SA[i + 2 * prefetch_distance]);
 
-        #pragma omp barrier
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+            libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
 
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A + B;
-                        temp_bucket[c] = A;
+            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
+            sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[p + l] != T[q + l]) {
+                        break;
                     }
-                }
+                } while (++l < qlen);
+                qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
             }
+            SAm[p >> 1] = name | (pdiff & qdiff);
+            name += (qdiff < 0);
 
-        #pragma omp barrier
+            p = SA[i + 1];
+            plen = SAm[p >> 1];
+            pdiff = SAINT_MIN;
+            if (qlen == plen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[q + l] != T[p + l]) {
+                        break;
+                    }
+                } while (++l < plen);
+                pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
+            }
+            SAm[q >> 1] = name | (qdiff & pdiff);
+            name += (pdiff < 0);
+        }
 
-            {
-                libsais_final_order_scan_left_to_right_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
+        for (j += prefetch_distance + 1; i < j; i += 1) {
+            fast_sint_t q = SA[i], qlen = SAm[q >> 1];
+            sa_sint_t qdiff = SAINT_MIN;
+            if (plen == qlen) {
+                fast_sint_t l = 0;
+                do {
+                    if (T[p + l] != T[q + l]) {
+                        break;
+                    }
+                } while (++l < plen);
+                qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
             }
+            SAm[p >> 1] = name | (pdiff & qdiff);
+            name += (qdiff < 0);
+
+            p = q;
+            plen = qlen;
+            pdiff = qdiff;
         }
-    #endif
+
+        SAm[p >> 1] = name | pdiff;
+        name++;
     }
-}
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
+    if (name <= m) {
+        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+    }
 
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+    return name - 1;
+}
 
-        omp_block_start += block_start;
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                             fast_sint_t omp_block_start,
+                                             fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
 
-        if (omp_num_threads == 1) {
-            libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket,
-                                                        omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_bwt_scan_left_to_right_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
+    const sa_sint_t * RESTRICT SAnm = &SA[n - m];
 
-        #pragma omp barrier
+    fast_sint_t i, j;
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+         i += 4) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A + B;
-                        temp_bucket[c] = A;
-                    }
-                }
-            }
+        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
+        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]);
+        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]);
+        libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]);
 
-        #pragma omp barrier
+        SA[i + 0] = SAnm[SA[i + 0]];
+        SA[i + 1] = SAnm[SA[i + 1]];
+        SA[i + 2] = SAnm[SA[i + 2]];
+        SA[i + 3] = SAnm[SA[i + 3]];
+    }
 
-            {
-                libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
-                    SA, rm, I, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
-            }
-        }
-    #endif
+    for (j += prefetch_distance + 3; i < j; i += 1) {
+        SA[i] = SAnm[SA[i]];
     }
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                 sa_sint_t threads) {
     {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+        (void)(threads);
 
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
-                                                        omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_sorting_scan_left_to_right_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
+        fast_sint_t omp_block_start = 0;
+        fast_sint_t omp_block_size = m;
+        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
+    }
+}
 
-        #pragma omp barrier
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t m,
+                                                   const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
 
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = 0; t < omp_num_threads; ++t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A + B;
-                        temp_bucket[c] = A;
-                    }
-                }
+    fast_sint_t c, j = n;
+    for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0) {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0) {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
 
-        #pragma omp barrier
-
-            {
-                libsais_final_order_scan_left_to_right_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
-            }
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
-    #endif
     }
-}
 
-static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
 
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
-        omp_block_start += block_start;
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+                        (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0) {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0) {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+            }
 
-        if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start,
-                                                         omp_block_size);
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_final_sorting_scan_left_to_right_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
+    }
 
-        #pragma omp barrier
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
+}
 
-        #pragma omp master
-            {
-                libsais_final_sorting_scan_left_to_right_32s_block_sort(
-                    T, buckets, cache - block_start, block_start, block_size);
-            }
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                       sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
+    fast_sint_t j = n;
 
-        #pragma omp barrier
+    if (k > 1) {
+        fast_sint_t c;
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+             c -= BUCKETS_INDEX2(1, 0)) {
+            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
+                            (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+            if (l > 0) {
+                fast_sint_t i = buckets[c];
+                if (j - i > 0) {
+                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+                }
 
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
             }
         }
-    #endif
     }
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-#endif
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T,
+                                                       sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                       sa_sint_t m, sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t prefetch_distance = 32;
 
-static void libsais_final_bwt_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+    sa_sint_t c = k - 1;
+    fast_sint_t i, l = buckets[c];
+    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) {
+        libsais_prefetch(&SA[i - 2 * prefetch_distance]);
 
-    if (threads == 1 || n < 65536) {
-        libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
+        libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+        sa_sint_t p0 = SA[i - 0];
+        if (T[p0] != c) {
+            c = T[p0];
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p0;
+        sa_sint_t p1 = SA[i - 1];
+        if (T[p1] != c) {
+            c = T[p1];
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p1;
+        sa_sint_t p2 = SA[i - 2];
+        if (T[p2] != c) {
+            c = T[p2];
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p2;
+        sa_sint_t p3 = SA[i - 3];
+        if (T[p3] != c) {
+            c = T[p3];
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
+        }
+        SA[--l] = p3;
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = 0; block_start < n;) {
-            if (SA[block_start] == 0) {
-                block_start++;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start + ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end > n) {
-                    block_max_end = n;
-                }
-                fast_sint_t block_end = block_start + 1;
-                while (block_end < block_max_end && SA[block_end] != 0) {
-                    block_end++;
-                }
-                fast_sint_t block_size = block_end - block_start;
-
-                if (block_size < 32) {
-                    for (; block_start < block_end; block_start += 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p & SAINT_MAX;
-                        if (p > 0) {
-                            p--;
-                            SA[block_start] = T[p] | SAINT_MIN;
-                            SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-                        }
-                    }
-                } else {
-                    libsais_final_bwt_scan_left_to_right_8u_block_omp(
-                        T, SA, induction_bucket, block_start, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
-            }
+
+    for (; i >= 0; i -= 1) {
+        sa_sint_t p = SA[i];
+        if (T[p] != c) {
+            c = T[p];
+            memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+            l = buckets[c];
         }
+        SA[--l] = p;
     }
-#else
-    UNUSED(thread_state);
-#endif
-}
 
-static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm,
-    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+    memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
+}
 
-    if ((((sa_sint_t)n - 1) & rm) == 0) {
-        I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
-    }
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
 
-    if (threads == 1 || n < 65536) {
-        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = 0; block_start < n;) {
-            if (SA[block_start] == 0) {
-                block_start++;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start + ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end > n) {
-                    block_max_end = n;
-                }
-                fast_sint_t block_end = block_start + 1;
-                while (block_end < block_max_end && SA[block_end] != 0) {
-                    block_end++;
-                }
-                fast_sint_t block_size = block_end - block_start;
-
-                if (block_size < 32) {
-                    for (; block_start < block_end; block_start += 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p & SAINT_MAX;
-                        if (p > 0) {
-                            p--;
-                            SA[block_start] = T[p] | SAINT_MIN;
-                            SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-                            if ((p & rm) == 0) {
-                                I[p / (rm + 1)] = induction_bucket[T[p]];
-                            }
-                        }
-                    }
-                } else {
-                    libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
-                        T, SA, rm, I, induction_bucket, block_start, block_size, threads,
-                        thread_state);
-                    block_start = block_end;
-                }
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
+        if (l > 0) {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0) {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
-#else
-    UNUSED(thread_state);
-#endif
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_final_sorting_scan_left_to_right_8u_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
-        ((sa_sint_t)n - 1) |
-        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
 
-    if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = 0; block_start < n;) {
-            if (SA[block_start] == 0) {
-                block_start++;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start + ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end > n) {
-                    block_max_end = n;
-                }
-                fast_sint_t block_end = block_start + 1;
-                while (block_end < block_max_end && SA[block_end] != 0) {
-                    block_end++;
-                }
-                fast_sint_t block_size = block_end - block_start;
-
-                if (block_size < 32) {
-                    for (; block_start < block_end; block_start += 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p ^ SAINT_MIN;
-                        if (p > 0) {
-                            p--;
-                            SA[induction_bucket[T[p]]++] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
-                        }
-                    }
-                } else {
-                    libsais_final_sorting_scan_left_to_right_8u_block_omp(
-                        T, SA, induction_bucket, block_start, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
+    fast_sint_t c, j = n;
+    for (c = (fast_sint_t)k - 2; c >= 0; --c) {
+        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+        if (l > 0) {
+            fast_sint_t i = bucket_end[c];
+            if (j - i > 0) {
+                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
             }
+
+            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
         }
     }
-#else
-    UNUSED(thread_state);
-#endif
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static void libsais_final_sorting_scan_left_to_right_32s_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
-    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    SA[induction_bucket[T[n - 1]]++] =
-        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                        sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    fast_sint_t j = n;
+
+    if (k > 1) {
+        fast_sint_t c;
+        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+             c -= BUCKETS_INDEX2(1, 0)) {
+            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
+            if (l > 0) {
+                fast_sint_t i = buckets[c];
+                if (j - i > 0) {
+                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
+                }
 
-    if (threads == 1 || n < 65536) {
-        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
-    }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = 0; block_start < n; block_start = block_end) {
-            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end > n) {
-                block_end = n;
+                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
             }
-
-            libsais_final_sorting_scan_left_to_right_32s_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache, block_start,
-                block_end - block_start, threads);
         }
     }
-#else
-    UNUSED(thread_state);
-#endif
+
+    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
 }
 
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
-                                                         sa_sint_t * RESTRICT SA,
-                                                         sa_sint_t * RESTRICT induction_bucket,
-                                                         fast_sint_t omp_block_start,
-                                                         fast_sint_t omp_block_size) {
+static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                    sa_sint_t * RESTRICT induction_bucket,
+                                                    fast_sint_t omp_block_start,
+                                                    fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    sa_sint_t index = -1;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
         const u8 * Ts0 = &T[s0] - 1;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
         Ts0--;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
         const u8 * Ts1 = &T[s1] - 1;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
         Ts1--;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i - 0];
-        index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
-        SA[i - 0] = p0 & SAINT_MAX;
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
-            SA[i - 0] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+            SA[i + 0] = T[p0] | SAINT_MIN;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
-
-        sa_sint_t p1 = SA[i - 1];
-        index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
-        SA[i - 1] = p1 & SAINT_MAX;
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
-            SA[i - 1] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+            SA[i + 1] = T[p1] | SAINT_MIN;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t p = SA[i];
-        index = (p == 0) ? (sa_sint_t)i : index;
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            u8 c0 = T[p - (p > 0)], c1 = T[p];
-            SA[i] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+            SA[i] = T[p] | SAINT_MIN;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
-
-    return index;
 }
 
-static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
                                                         sa_sint_t * RESTRICT SA, sa_sint_t rm,
                                                         sa_sint_t * RESTRICT I,
                                                         sa_sint_t * RESTRICT induction_bucket,
@@ -6869,65 +3035,61 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
         const u8 * Ts0 = &T[s0] - 1;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
         Ts0--;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
         const u8 * Ts1 = &T[s1] - 1;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
         Ts1--;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i - 0];
-        SA[i - 0] = p0 & SAINT_MAX;
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
-            SA[i - 0] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+            SA[i + 0] = T[p0] | SAINT_MIN;
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
             if ((p0 & rm) == 0) {
-                I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1;
+                I[p0 / (rm + 1)] = induction_bucket[T[p0]];
             }
         }
-
-        sa_sint_t p1 = SA[i - 1];
-        SA[i - 1] = p1 & SAINT_MAX;
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
-            SA[i - 1] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+            SA[i + 1] = T[p1] | SAINT_MIN;
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
             if ((p1 & rm) == 0) {
-                I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1;
+                I[p1 / (rm + 1)] = induction_bucket[T[p1]];
             }
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t p = SA[i];
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            u8 c0 = T[p - (p > 0)], c1 = T[p];
-            SA[i] = c1;
-            sa_sint_t t = c0 | SAINT_MIN;
-            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+            SA[i] = T[p] | SAINT_MIN;
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
             if ((p & rm) == 0) {
-                I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
+                I[p / (rm + 1)] = induction_bucket[T[p]];
             }
         }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
+static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
                                                         sa_sint_t * RESTRICT SA,
                                                         sa_sint_t * RESTRICT induction_bucket,
                                                         fast_sint_t omp_block_start,
@@ -6935,49 +3097,49 @@ static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+         i += 2) {
+        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - prefetch_distance - 0];
+        sa_sint_t s0 = SA[i + prefetch_distance + 0];
         const u8 * Ts0 = &T[s0] - 1;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
         Ts0--;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - prefetch_distance - 1];
+        sa_sint_t s1 = SA[i + prefetch_distance + 1];
         const u8 * Ts1 = &T[s1] - 1;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
         Ts1--;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
-        sa_sint_t p0 = SA[i - 0];
-        SA[i - 0] = p0 & SAINT_MAX;
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
         if (p0 > 0) {
             p0--;
-            SA[--induction_bucket[T[p0]]] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
-        sa_sint_t p1 = SA[i - 1];
-        SA[i - 1] = p1 & SAINT_MAX;
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
-            SA[--induction_bucket[T[p1]]] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+    for (j += prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX;
+        SA[i] = p ^ SAINT_MIN;
         if (p > 0) {
             p--;
-            SA[--induction_bucket[T[p]]] =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T,
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T,
                                                          sa_sint_t * RESTRICT SA,
                                                          sa_sint_t * RESTRICT induction_bucket,
                                                          fast_sint_t omp_block_start,
@@ -6985,65 +3147,121 @@ static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTR
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
-         i >= j; i -= 2) {
-        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
+    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+         i < j; i += 2) {
+        libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
 
-        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
         const sa_sint_t * Ts0 = &T[s0] - 1;
         libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
         const sa_sint_t * Ts1 = &T[s1] - 1;
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
         if (s2 > 0) {
             libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
             libsais_prefetch(&T[s2] - 2);
         }
-        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
         if (s3 > 0) {
             libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
             libsais_prefetch(&T[s3] - 2);
         }
 
-        sa_sint_t p0 = SA[i - 0];
-        SA[i - 0] = p0 & SAINT_MAX;
+        sa_sint_t p0 = SA[i + 0];
+        SA[i + 0] = p0 ^ SAINT_MIN;
         if (p0 > 0) {
             p0--;
-            SA[--induction_bucket[T[p0]]] =
-                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p0]]++] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
         }
-        sa_sint_t p1 = SA[i - 1];
-        SA[i - 1] = p1 & SAINT_MAX;
+        sa_sint_t p1 = SA[i + 1];
+        SA[i + 1] = p1 ^ SAINT_MIN;
         if (p1 > 0) {
             p1--;
-            SA[--induction_bucket[T[p1]]] =
-                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p1]]++] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
         }
     }
 
-    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+    for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
         sa_sint_t p = SA[i];
-        SA[i] = p & SAINT_MAX;
+        SA[i] = p ^ SAINT_MIN;
         if (p > 0) {
             p--;
-            SA[--induction_bucket[T[p]]] =
-                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+            SA[induction_bucket[T[p]]++] =
+                p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
         }
     }
 }
+static void libsais_final_bwt_scan_left_to_right_8u_omp(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536) {
+        libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+    }
+    (void)(thread_state);
+}
+
+static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm,
+    sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if ((((sa_sint_t)n - 1) & rm) == 0) {
+        I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
+    }
+
+    if (threads == 1 || n < 65536) {
+        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
+    }
+    (void)(thread_state);
+}
+
+static void libsais_final_sorting_scan_left_to_right_8u_omp(
+    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) |
+        ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+    }
+    (void)(thread_state);
+}
+
+static void libsais_final_sorting_scan_left_to_right_32s_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+    sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    SA[induction_bucket[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+    if (threads == 1 || n < 65536) {
+        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+    }
+    (void)(thread_state);
+}
 
-#if defined(_OPENMP)
-
-static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                         sa_sint_t * RESTRICT SA,
+                                                         sa_sint_t * RESTRICT induction_bucket,
+                                                         fast_sint_t omp_block_start,
+                                                         fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
+    fast_sint_t i, j;
+    sa_sint_t index = -1;
     for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7060,52 +3278,53 @@ static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
         libsais_prefetch(s1 > 0 ? Ts1 : NULL);
 
         sa_sint_t p0 = SA[i - 0];
+        index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
         SA[i - 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
             u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
             SA[i - 0] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count++].index = (c0 <= c1) ? p0 : t;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
         }
+
         sa_sint_t p1 = SA[i - 1];
+        index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
             u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
             SA[i - 1] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count++].index = (c0 <= c1) ? p1 : t;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
         }
     }
 
     for (j -= prefetch_distance + 1; i >= j; i -= 1) {
         sa_sint_t p = SA[i];
+        index = (p == 0) ? (sa_sint_t)i : index;
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
             u8 c0 = T[p - (p > 0)], c1 = T[p];
             SA[i] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count++].index = (c0 <= c1) ? p : t;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
         }
     }
 
-    return count;
+    return index;
 }
 
-static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA, sa_sint_t rm,
+                                                        sa_sint_t * RESTRICT I,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
+    fast_sint_t i, j;
     for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7128,11 +3347,12 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
             u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
             SA[i - 0] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count].index = (c0 <= c1) ? p0 : t;
-            cache[count + 1].index = p0;
-            count += 2;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+            if ((p0 & rm) == 0) {
+                I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1;
+            }
         }
+
         sa_sint_t p1 = SA[i - 1];
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
@@ -7140,10 +3360,10 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
             u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
             SA[i - 1] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count].index = (c0 <= c1) ? p1 : t;
-            cache[count + 1].index = p1;
-            count += 2;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+            if ((p1 & rm) == 0) {
+                I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1;
+            }
         }
     }
 
@@ -7155,25 +3375,22 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
             u8 c0 = T[p - (p > 0)], c1 = T[p];
             SA[i] = c1;
             sa_sint_t t = c0 | SAINT_MIN;
-            buckets[cache[count].symbol = c1]++;
-            cache[count].index = (c0 <= c1) ? p : t;
-            cache[count + 1].index = p;
-            count += 2;
+            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+            if ((p & rm) == 0) {
+                I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
+            }
         }
     }
-
-    return count;
 }
 
-static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
+                                                        sa_sint_t * RESTRICT SA,
+                                                        sa_sint_t * RESTRICT induction_bucket,
+                                                        fast_sint_t omp_block_start,
+                                                        fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
-    memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
-
-    fast_sint_t i, j, count = 0;
+    fast_sint_t i, j;
     for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
          i >= j; i -= 2) {
         libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7193,15 +3410,15 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
         SA[i - 0] = p0 & SAINT_MAX;
         if (p0 > 0) {
             p0--;
-            buckets[cache[count].symbol = T[p0]]++;
-            cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p0]]] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
         }
         sa_sint_t p1 = SA[i - 1];
         SA[i - 1] = p1 & SAINT_MAX;
         if (p1 > 0) {
             p1--;
-            buckets[cache[count].symbol = T[p1]]++;
-            cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+            SA[--induction_bucket[T[p1]]] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
         }
     }
 
@@ -7210,453 +3427,67 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
         SA[i] = p & SAINT_MAX;
         if (p > 0) {
             p--;
-            buckets[cache[count].symbol = T[p]]++;
-            cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
-        }
-    }
-
-    return count;
-}
-
-static void libsais_final_order_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t count) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = 0, j = count - 3; i < j; i += 4) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
-
-        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
-        SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
-        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
-        SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
-    }
-
-    for (j += 3; i < j; i += 1) {
-        SA[--buckets[cache[i].symbol]] = cache[i].index;
-    }
-}
-
-static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
-    sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = 0, j = count - 6; i < j; i += 8) {
-        libsais_prefetch(&cache[i + prefetch_distance]);
-
-        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
-        if ((cache[i + 1].index & rm) == 0) {
-            I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1;
-        }
-        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
-        if ((cache[i + 3].index & rm) == 0) {
-            I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1;
-        }
-        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index;
-        if ((cache[i + 5].index & rm) == 0) {
-            I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1;
-        }
-        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index;
-        if ((cache[i + 7].index & rm) == 0) {
-            I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1;
-        }
-    }
-
-    for (j += 6; i < j; i += 2) {
-        SA[--buckets[cache[i].symbol]] = cache[i].index;
-        if ((cache[i + 1].index & rm) == 0) {
-            I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1;
-        }
-    }
-}
-
-static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
-    fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    fast_sint_t i, j;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
-         i += 2) {
-        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
-        sa_sint_t s0 = SA[i + prefetch_distance + 0];
-        const sa_sint_t * Ts0 = &T[s0] - 1;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        Ts0--;
-        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
-        sa_sint_t s1 = SA[i + prefetch_distance + 1];
-        const sa_sint_t * Ts1 = &T[s1] - 1;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-        Ts1--;
-        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
-        libsais_prefetchw(&cache[i + prefetch_distance]);
-
-        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
-        SA[i + 0] = p0 & SAINT_MAX;
-        if (p0 > 0) {
-            p0--;
-            cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
-            symbol0 = T[p0];
-        }
-        cache[i + 0].symbol = symbol0;
-        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
-        SA[i + 1] = p1 & SAINT_MAX;
-        if (p1 > 0) {
-            p1--;
-            cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
-            symbol1 = T[p1];
-        }
-        cache[i + 1].symbol = symbol1;
-    }
-
-    for (j += prefetch_distance + 1; i < j; i += 1) {
-        sa_sint_t symbol = SAINT_MIN, p = SA[i];
-        SA[i] = p & SAINT_MAX;
-        if (p > 0) {
-            p--;
-            cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
-            symbol = T[p];
+            SA[--induction_bucket[T[p]]] =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
         }
-        cache[i].symbol = symbol;
     }
 }
 
-static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
-    fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T,
+                                                         sa_sint_t * RESTRICT SA,
+                                                         sa_sint_t * RESTRICT induction_bucket,
+                                                         fast_sint_t omp_block_start,
+                                                         fast_sint_t omp_block_size) {
     const fast_sint_t prefetch_distance = 32;
 
     fast_sint_t i, j;
-    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
+    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
          i >= j; i -= 2) {
-        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
-
-        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol;
-        const sa_sint_t * Is0 = &induction_bucket[s0];
-        libsais_prefetchw(s0 >= 0 ? Is0 : NULL);
-        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol;
-        const sa_sint_t * Is1 = &induction_bucket[s1];
-        libsais_prefetchw(s1 >= 0 ? Is1 : NULL);
-
-        sa_sint_t v0 = cache[i - 0].symbol;
-        if (v0 >= 0) {
-            cache[i - 0].symbol = --induction_bucket[v0];
-            if (cache[i - 0].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
-                cache[i - 0].index = np & SAINT_MAX;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
-        }
-
-        sa_sint_t v1 = cache[i - 1].symbol;
-        if (v1 >= 0) {
-            cache[i - 1].symbol = --induction_bucket[v1];
-            if (cache[i - 1].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
-                cache[i - 1].index = np & SAINT_MAX;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
-        }
-    }
-
-    for (j -= prefetch_distance + 1; i >= j; i -= 1) {
-        sa_sint_t v = cache[i].symbol;
-        if (v >= 0) {
-            cache[i].symbol = --induction_bucket[v];
-            if (cache[i].symbol >= omp_block_start) {
-                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
-                cache[i].index = np & SAINT_MAX;
-                if (np > 0) {
-                    np--;
-                    cache[ni].index =
-                        np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
-                    cache[ni].symbol = T[np];
-                }
-            }
-        }
-    }
-}
-
-static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
-                                                    omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_bwt_scan_right_to_left_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A - B;
-                        temp_bucket[c] = A;
-                    }
-                }
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_final_order_scan_right_to_left_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
-            }
-        }
-    #endif
-    }
-}
-
-static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
-    sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket,
-                                                        omp_block_start, omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A - B;
-                        temp_bucket[c] = A;
-                    }
-                }
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
-                    SA, rm, I, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
-            }
-        }
-    #endif
-    }
-}
-
-static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
-    const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
-    fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
-    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 &&                      \
-                                                      block_size >= 64 * ALPHABET_SIZE && \
-                                                      omp_get_dynamic() == 0)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(thread_state);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
-                                                        omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_final_sorting_scan_right_to_left_8u_block_prepare(
-                        T, SA, thread_state[omp_thread_num].state.buckets,
-                        thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
-
-        #pragma omp master
-            {
-                fast_sint_t t;
-                for (t = omp_num_threads - 1; t >= 0; --t) {
-                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
-                    fast_sint_t c;
-                    for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                        sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
-                        induction_bucket[c] = A - B;
-                        temp_bucket[c] = A;
-                    }
-                }
-            }
-
-        #pragma omp barrier
-
-            {
-                libsais_final_order_scan_right_to_left_8u_block_place(
-                    SA, thread_state[omp_thread_num].state.buckets,
-                    thread_state[omp_thread_num].state.cache,
-                    thread_state[omp_thread_num].state.count);
-            }
-        }
-    #endif
-    }
-}
-
-static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
-    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
-    LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
-    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-    #else
-        UNUSED(threads);
-        UNUSED(cache);
-
-        fast_sint_t omp_thread_num = 0;
-        fast_sint_t omp_num_threads = 1;
-    #endif
-        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size =
-            omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
-
-        omp_block_start += block_start;
-
-        if (omp_num_threads == 1) {
-            libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start,
-                                                         omp_block_size);
-        }
-    #if defined(_OPENMP)
-        else {
-            {
-                libsais_final_sorting_scan_right_to_left_32s_block_gather(
-                    T, SA, cache - block_start, omp_block_start, omp_block_size);
-            }
-
-        #pragma omp barrier
+        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
 
-        #pragma omp master
-            {
-                libsais_final_sorting_scan_right_to_left_32s_block_sort(
-                    T, buckets, cache - block_start, block_start, block_size);
-            }
+        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+        const sa_sint_t * Ts0 = &T[s0] - 1;
+        libsais_prefetch(s0 > 0 ? Ts0 : NULL);
+        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+        const sa_sint_t * Ts1 = &T[s1] - 1;
+        libsais_prefetch(s1 > 0 ? Ts1 : NULL);
+        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+        if (s2 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
+            libsais_prefetch(&T[s2] - 2);
+        }
+        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+        if (s3 > 0) {
+            libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
+            libsais_prefetch(&T[s3] - 2);
+        }
 
-        #pragma omp barrier
+        sa_sint_t p0 = SA[i - 0];
+        SA[i - 0] = p0 & SAINT_MAX;
+        if (p0 > 0) {
+            p0--;
+            SA[--induction_bucket[T[p0]]] =
+                p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+        }
+        sa_sint_t p1 = SA[i - 1];
+        SA[i - 1] = p1 & SAINT_MAX;
+        if (p1 > 0) {
+            p1--;
+            SA[--induction_bucket[T[p1]]] =
+                p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+        }
+    }
 
-            {
-                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
-                                                          omp_block_size);
-            }
+    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+        sa_sint_t p = SA[i];
+        SA[i] = p & SAINT_MAX;
+        if (p > 0) {
+            p--;
+            SA[--induction_bucket[T[p]]] =
+                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
         }
-    #endif
     }
 }
-
-#endif
-
 static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
     const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
     sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
@@ -7666,49 +3497,7 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
     if (threads == 1 || n < 65536) {
         index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
-            if (SA[block_start] == 0) {
-                index = (sa_sint_t)block_start--;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start - ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end < 0) {
-                    block_max_end = -1;
-                }
-                fast_sint_t block_end = block_start - 1;
-                while (block_end > block_max_end && SA[block_end] != 0) {
-                    block_end--;
-                }
-                fast_sint_t block_size = block_start - block_end;
-
-                if (block_size < 32) {
-                    for (; block_start > block_end; block_start -= 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p & SAINT_MAX;
-                        if (p > 0) {
-                            p--;
-                            u8 c0 = T[p - (p > 0)], c1 = T[p];
-                            SA[block_start] = c1;
-                            sa_sint_t t = c0 | SAINT_MIN;
-                            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
-                        }
-                    }
-                } else {
-                    libsais_final_bwt_scan_right_to_left_8u_block_omp(
-                        T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
-            }
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
-
+    (void)(thread_state);
     return index;
 }
 
@@ -7719,53 +3508,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
     if (threads == 1 || n < 65536) {
         libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
-            if (SA[block_start] == 0) {
-                block_start--;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start -
-                    ((fast_sint_t)threads) *
-                        ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2);
-                if (block_max_end < 0) {
-                    block_max_end = -1;
-                }
-                fast_sint_t block_end = block_start - 1;
-                while (block_end > block_max_end && SA[block_end] != 0) {
-                    block_end--;
-                }
-                fast_sint_t block_size = block_start - block_end;
-
-                if (block_size < 32) {
-                    for (; block_start > block_end; block_start -= 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p & SAINT_MAX;
-                        if (p > 0) {
-                            p--;
-                            u8 c0 = T[p - (p > 0)], c1 = T[p];
-                            SA[block_start] = c1;
-                            sa_sint_t t = c0 | SAINT_MIN;
-                            SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
-                            if ((p & rm) == 0) {
-                                I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
-                            }
-                        }
-                    }
-                } else {
-                    libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
-                        T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads,
-                        thread_state);
-                    block_start = block_end;
-                }
-            }
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
+    (void)(thread_state);
 }
 
 static void libsais_final_sorting_scan_right_to_left_8u_omp(
@@ -7775,46 +3518,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0;) {
-            if (SA[block_start] == 0) {
-                block_start--;
-            } else {
-                fast_sint_t block_max_end =
-                    block_start - ((fast_sint_t)threads) *
-                                      (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
-                if (block_max_end < -1) {
-                    block_max_end = -1;
-                }
-                fast_sint_t block_end = block_start - 1;
-                while (block_end > block_max_end && SA[block_end] != 0) {
-                    block_end--;
-                }
-                fast_sint_t block_size = block_start - block_end;
-
-                if (block_size < 32) {
-                    for (; block_start > block_end; block_start -= 1) {
-                        sa_sint_t p = SA[block_start];
-                        SA[block_start] = p & SAINT_MAX;
-                        if (p > 0) {
-                            p--;
-                            SA[--induction_bucket[T[p]]] =
-                                p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
-                        }
-                    }
-                } else {
-                    libsais_final_sorting_scan_right_to_left_8u_block_omp(
-                        T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
-                    block_start = block_end;
-                }
-            }
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
+    (void)(thread_state);
 }
 
 static void libsais_final_sorting_scan_right_to_left_32s_omp(
@@ -7824,36 +3528,16 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
     if (threads == 1 || n < 65536) {
         libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
     }
-#if defined(_OPENMP)
-    else {
-        fast_sint_t block_start, block_end;
-        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
-            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
-            if (block_end < 0) {
-                block_end = -1;
-            }
-
-            libsais_final_sorting_scan_right_to_left_32s_block_omp(
-                T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1,
-                block_start - block_end, threads);
-        }
-    }
-#else
-    UNUSED(thread_state);
-#endif
+    (void)(thread_state);
 }
 
 static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
                                            sa_sint_t * RESTRICT bucket_start,
                                            sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) {
     fast_sint_t c;
+    (void)(threads);
+    (void)(n);
 
-#if defined(_OPENMP)
-    #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
-#else
-    UNUSED(threads);
-    UNUSED(n);
-#endif
     for (c = 0; c < k; ++c) {
         if (bucket_end[c] > bucket_start[c]) {
             memset(&SA[bucket_start[c]], 0,
@@ -8071,61 +3755,17 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RE
     *pl = l + 1;
     *pr = r + 1;
 }
-
-#if defined(_OPENMP)
-
-static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m,
-                                               fast_sint_t omp_block_start,
-                                               fast_sint_t omp_block_size) {
-    const fast_sint_t prefetch_distance = 32;
-
-    sa_sint_t * RESTRICT SAm = &SA[m];
-
-    fast_sint_t i, j;
-    sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
-    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
-         i += 4) {
-        libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
-        libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
-        libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
-        libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
-        libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
-
-        f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
-        f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
-        f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
-        f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
-    }
-
-    for (j += prefetch_distance + 3; i < j; i += 1) {
-        f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
-    }
-
-    return f0 + f1 + f2 + f3;
-}
-
-#endif
-
 static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
     sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
     sa_sint_t f = 0;
-
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -8135,30 +3775,6 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
             f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start,
                                                                        omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t t, count = 0;
-                for (t = 0; t < omp_thread_num; ++t) {
-                    count += thread_state[t].state.count;
-                }
-
-                if (omp_thread_num == omp_num_threads - 1) {
-                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
-                }
-
-                libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
-                    T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
-            }
-        }
-#endif
     }
 
     return f;
@@ -8167,20 +3783,13 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
 static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
     sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072 && m < fs)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
@@ -8192,54 +3801,6 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
             libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start,
                                                                   omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.position =
-                    (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
-                thread_state[omp_thread_num].state.count =
-                    (fast_sint_t)m + omp_block_start + omp_block_size;
-
-                libsais_compact_unique_and_nonunique_lms_suffixes_32s(
-                    SA, m, &thread_state[omp_thread_num].state.position,
-                    &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                fast_sint_t t, position;
-
-                for (position = m, t = omp_num_threads - 1; t >= 0; --t) {
-                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
-                                                                        : ((fast_sint_t)n >> 1);
-                    fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end -
-                                         thread_state[t].state.position);
-
-                    if (count > 0) {
-                        position -= count;
-                        memcpy(&SA[position], &SA[thread_state[t].state.position],
-                               (size_t)count * sizeof(sa_sint_t));
-                    }
-                }
-
-                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0;
-                     --t) {
-                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
-                                                                        : ((fast_sint_t)n >> 1);
-                    fast_sint_t count =
-                        ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
-
-                    if (count > 0) {
-                        position -= count;
-                        memcpy(&SA[position], &SA[thread_state[t].state.count],
-                               (size_t)count * sizeof(sa_sint_t));
-                    }
-                }
-            }
-        }
-#endif
     }
 
     memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m],
@@ -8355,20 +3916,13 @@ static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa
 static void libsais_merge_unique_lms_suffixes_32s_omp(
     sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -8377,46 +3931,19 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
         if (omp_num_threads == 1) {
             libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t t, count = 0;
-                for (t = 0; t < omp_thread_num; ++t) {
-                    count += thread_state[t].state.count;
-                }
-
-                libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start,
-                                                      omp_block_size);
-            }
-        }
-#endif
     }
 }
 
 static void libsais_merge_nonunique_lms_suffixes_32s_omp(
     sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads,
     LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
-        UNUSED(thread_state);
+        (void)(threads);
+        (void)(thread_state);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -8425,26 +3952,6 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
         if (omp_num_threads == 1) {
             libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
         }
-#if defined(_OPENMP)
-        else {
-            {
-                thread_state[omp_thread_num].state.count =
-                    libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t t, count = f;
-                for (t = 0; t < omp_thread_num; ++t) {
-                    count += thread_state[t].state.count;
-                }
-
-                libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start,
-                                                         omp_block_size);
-            }
-        }
-#endif
     }
 }
 
@@ -8830,36 +4337,6 @@ static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint
         U[i] = (u8)A[i];
     }
 }
-
-#if defined(_OPENMP)
-
-static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n,
-                                    sa_sint_t threads) {
-    #if defined(_OPENMP)
-        #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-    #endif
-    {
-    #if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-        fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
-        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
-                                         ? omp_block_stride
-                                         : (fast_sint_t)n - omp_block_start;
-    #else
-        UNUSED(threads);
-
-        fast_sint_t omp_block_start = 0;
-        fast_sint_t omp_block_size = (fast_sint_t)n;
-    #endif
-
-        libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
-    }
-}
-
-#endif
-
 void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); }
 
 void libsais_free_ctx(void * ctx) { libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); }
@@ -8904,76 +4381,19 @@ s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * f
             memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
         }
         if (n == 1) {
-            SA[0] = 0;
-            if (freq != NULL) {
-                freq[T[0]]++;
-            }
-        }
-        return 0;
-    }
-
-    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
-}
-
-s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
-        return -1;
-    } else if (n <= 1) {
-        if (freq != NULL) {
-            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
-        }
-        if (n == 1) {
-            U[0] = T[0];
-            if (freq != NULL) {
-                freq[T[0]]++;
-            }
-        }
-        return n;
-    }
-
-    sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
-    if (index >= 0) {
-        index++;
-
-        U[0] = T[n - 1];
-        libsais_bwt_copy_8u(U + 1, A, index - 1);
-        libsais_bwt_copy_8u(U + index, A + index, n - index);
-    }
-
-    return index;
-}
-
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
-        ((r & (r - 1)) != 0) || (I == NULL)) {
-        return -1;
-    } else if (n <= 1) {
-        if (freq != NULL) {
-            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
-        }
-        if (n == 1) {
-            U[0] = T[0];
+            SA[0] = 0;
             if (freq != NULL) {
                 freq[T[0]]++;
             }
         }
-        I[0] = n;
         return 0;
     }
 
-    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) {
-        return -2;
-    }
-
-    U[0] = T[n - 1];
-    libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
-    libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
-
-    return 0;
+    return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
 }
 
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
+s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -8988,30 +4408,21 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32
         return n;
     }
 
-    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
+    sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
     if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
-
-#if defined(_OPENMP)
-        libsais_bwt_copy_8u_omp(U + 1, A, index - 1,
-                                (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-        libsais_bwt_copy_8u_omp(U + index, A + index, n - index,
-                                (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-#else
         libsais_bwt_copy_8u(U + 1, A, index - 1);
         libsais_bwt_copy_8u(U + index, A + index, n - index);
-#endif
     }
 
     return index;
 }
 
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
-                        s32 r, s32 * I) {
-    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
-        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+        ((r & (r - 1)) != 0) || (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9027,73 +4438,19 @@ s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
         return 0;
     }
 
-    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) {
         return -2;
     }
 
     U[0] = T[n - 1];
-
-#if defined(_OPENMP)
-    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0],
-                            (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
-#else
     libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
     libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
-#endif
 
     return 0;
 }
 
-#if defined(_OPENMP)
-
-void * libsais_create_ctx_omp(s32 threads) {
-    if (threads < 0) {
-        return NULL;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-    return (void *)libsais_create_ctx_main(threads);
-}
-
-s32 libsais_omp(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq, s32 threads) {
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
-        return -1;
-    } else if (n < 2) {
-        if (freq != NULL) {
-            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
-        }
-        if (n == 1) {
-            SA[0] = 0;
-            if (freq != NULL) {
-                freq[T[0]]++;
-            }
-        }
-        return 0;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads);
-}
-
-s32 libsais_int_omp(s32 * T, s32 * SA, s32 n, s32 k, s32 fs, s32 threads) {
-    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
-        return -1;
-    } else if (n < 2) {
-        if (n == 1) {
-            SA[0] = 0;
-        }
-        return 0;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    return libsais_main_int(T, SA, n, k, fs, threads);
-}
-
-s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 threads) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
+s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9108,24 +4465,23 @@ s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s3
         return n;
     }
 
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, threads);
+    sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
     if (index >= 0) {
         index++;
 
         U[0] = T[n - 1];
-        libsais_bwt_copy_8u_omp(U + 1, A, index - 1, threads);
-        libsais_bwt_copy_8u_omp(U + index, A + index, n - index, threads);
+
+        libsais_bwt_copy_8u(U + 1, A, index - 1);
+        libsais_bwt_copy_8u(U + index, A + index, n - index);
     }
 
     return index;
 }
 
-s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I,
-                        s32 threads) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
-        ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
+                        s32 r, s32 * I) {
+    if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+        (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
         return -1;
     } else if (n <= 1) {
         if (freq != NULL) {
@@ -9141,21 +4497,15 @@ s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq
         return 0;
     }
 
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    if (libsais_main(T, A, n, 1, r, I, fs, freq, threads) != 0) {
+    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
         return -2;
     }
 
     U[0] = T[n - 1];
-    libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads);
-    libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads);
-
+    libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
+    libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
     return 0;
 }
-
-#endif
-
 static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) {
     LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
         (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
@@ -9580,183 +4930,6 @@ static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRIC
     libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
     libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
 }
-
-#if defined(_OPENMP)
-
-static void libsais_unbwt_compute_bigram_histogram_parallel(
-    const u8 * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1,
-    sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
-    fast_sint_t i;
-    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
-        fast_uint_t c = T[i];
-        fast_uint_t p = bucket1[c]++;
-        fast_sint_t t = (fast_sint_t)(index - p);
-
-        if (t != 0) {
-            fast_uint_t w =
-                (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) +
-                c;
-            bucket2[w]++;
-        }
-    }
-}
-
-static void libsais_unbwt_init_parallel(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
-                                        const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
-                                        sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
-                                        sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
-    sa_uint_t bucket1[ALPHABET_SIZE];
-
-    fast_uint_t index = I[0];
-    fast_uint_t lastc = T[0];
-    fast_uint_t shift = 0;
-    while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
-        shift++;
-    }
-
-    memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
-    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
-
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-    {
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-
-        if (omp_num_threads == 1) {
-            libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
-        } else {
-            sa_uint_t * RESTRICT bucket1_local =
-                buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-            sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;
-
-            fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
-            fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
-            fast_sint_t omp_block_size =
-                omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
-            {
-                memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
-                libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                {
-                    sa_uint_t * RESTRICT bucket1_temp = buckets;
-
-                    fast_sint_t t;
-                    for (t = 0; t < omp_num_threads;
-                         ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
-                        fast_sint_t c;
-                        for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                            sa_uint_t A = bucket1[c], B = bucket1_temp[c];
-                            bucket1[c] = A + B;
-                            bucket1_temp[c] = A;
-                        }
-                    }
-                }
-
-                {
-                    fast_uint_t sum, c;
-                    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
-                        fast_uint_t prev = sum;
-                        sum += bucket1[c];
-                        bucket1[c] = (sa_uint_t)prev;
-                    }
-                }
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t c;
-                for (c = 0; c < ALPHABET_SIZE; c += 1) {
-                    sa_uint_t A = bucket1[c], B = bucket1_local[c];
-                    bucket1_local[c] = A + B;
-                }
-
-                memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
-                libsais_unbwt_compute_bigram_histogram_parallel(
-                    T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t omp_bucket2_stride =
-                    ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
-                fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride;
-                fast_sint_t omp_bucket2_size =
-                    omp_thread_num < omp_num_threads - 1
-                        ? omp_bucket2_stride
-                        : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;
-
-                sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE;
-
-                fast_sint_t t;
-                for (t = 0; t < omp_num_threads;
-                     ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
-                    fast_sint_t c;
-                    for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) {
-                        sa_uint_t A = bucket2[c], B = bucket2_temp[c];
-                        bucket2[c] = A + B;
-                        bucket2_temp[c] = A;
-                    }
-                }
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
-
-                {
-                    fast_sint_t t;
-                    for (t = omp_num_threads - 1; t >= 1; --t) {
-                        sa_uint_t * RESTRICT dst_bucket1 =
-                            buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-                        sa_uint_t * RESTRICT src_bucket1 =
-                            dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
-
-                        memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
-                    }
-
-                    memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
-                }
-            }
-
-    #pragma omp barrier
-
-            {
-                fast_sint_t c;
-                for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) {
-                    sa_uint_t A = bucket2[c], B = bucket2_local[c];
-                    bucket2_local[c] = A + B;
-                }
-
-                libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index,
-                                              omp_block_start, omp_block_start + omp_block_size);
-            }
-
-    #pragma omp barrier
-
-    #pragma omp master
-            {
-                memcpy(
-                    bucket2,
-                    buckets + ALPHABET_SIZE +
-                        (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
-                    ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
-            }
-        }
-    }
-}
-
-#endif
-
 static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
                                    sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
                                    fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
@@ -10301,21 +5474,11 @@ static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_
     fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
     fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
 
-#if defined(_OPENMP)
-    fast_sint_t max_threads = blocks < threads ? blocks : threads;
-    #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && n >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
-
         fast_sint_t omp_block_stride = blocks / omp_num_threads;
         fast_sint_t omp_block_reminder = blocks % omp_num_threads;
         fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
@@ -10336,16 +5499,9 @@ static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_u
                                     const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
                                     u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
                                     sa_sint_t threads) {
-#if defined(_OPENMP)
-    if (threads > 1 && n >= 262144) {
-        libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
-    } else
-#else
-    UNUSED(buckets);
-#endif
-    {
-        libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
-    }
+    (void)(buckets);
+
+    { libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); }
 
     libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads);
     return 0;
@@ -10458,50 +5614,6 @@ s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n
     return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq,
                                   r, (const sa_uint_t *)I);
 }
-
-#if defined(_OPENMP)
-
-void * libsais_unbwt_create_ctx_omp(s32 threads) {
-    if (threads < 0) {
-        return NULL;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-    return (void *)libsais_unbwt_create_ctx_main(threads);
-}
-
-s32 libsais_unbwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i, s32 threads) {
-    return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
-}
-
-s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
-                          const s32 * I, s32 threads) {
-    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
-        ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) {
-        return -1;
-    } else if (n <= 1) {
-        if (I[0] != n) {
-            return -1;
-        }
-        if (n == 1) {
-            U[0] = T[0];
-        }
-        return 0;
-    }
-
-    fast_sint_t t;
-    for (t = 0; t <= (n - 1) / r; ++t) {
-        if (I[t] <= 0 || I[t] > n) {
-            return -1;
-        }
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
-}
-
-#endif
-
 static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
                                 sa_sint_t n, fast_sint_t omp_block_start,
                                 fast_sint_t omp_block_size) {
@@ -10536,19 +5648,12 @@ static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTR
 
 static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
                                     sa_sint_t n, sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -10589,19 +5694,12 @@ static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLC
 
 static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
                                      sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -10639,19 +5737,12 @@ static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t
 
 static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
                                     sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) {
-#if defined(_OPENMP)
-    #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
-#endif
     {
-#if defined(_OPENMP)
-        fast_sint_t omp_thread_num = omp_get_thread_num();
-        fast_sint_t omp_num_threads = omp_get_num_threads();
-#else
-        UNUSED(threads);
+        (void)(threads);
 
         fast_sint_t omp_thread_num = 0;
         fast_sint_t omp_num_threads = 1;
-#endif
+
         fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
         fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
         fast_sint_t omp_block_size =
@@ -10691,42 +5782,3 @@ s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n) {
 
     return 0;
 }
-
-#if defined(_OPENMP)
-
-s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n, s32 threads) {
-    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) {
-        return -1;
-    } else if (n <= 1) {
-        if (n == 1) {
-            PLCP[0] = 0;
-        }
-        return 0;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    libsais_compute_phi_omp(SA, PLCP, n, threads);
-    libsais_compute_plcp_omp(T, PLCP, n, threads);
-
-    return 0;
-}
-
-s32 libsais_lcp_omp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n, s32 threads) {
-    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) {
-        return -1;
-    } else if (n <= 1) {
-        if (n == 1) {
-            LCP[0] = PLCP[SA[0]];
-        }
-        return 0;
-    }
-
-    threads = threads > 0 ? threads : omp_get_max_threads();
-
-    libsais_compute_lcp_omp(PLCP, SA, LCP, n, threads);
-
-    return 0;
-}
-
-#endif
tab: 248 wrap: offon