Merge compression code to libbz3: solve #47 and #46. (#48)
* incorporate lzp & crc32 into libbz3.c
* merge rle & ac into libbz3
* libsais: make a header-only amalgamation
* hide the AC symbols
* whitespace fixes
* conservative fix for #47: prioritise our `restrict` detection mechanism over anything autoconf has to offer, meaning that source code in common.h won't be changed, but also define plain `restrict` as a fallback for the undetected case.
* PUBLIC_API clarification
diff --git a/Makefile.am b/Makefile.am
index e053070..67364ca 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,21 +8,12 @@ EXTRA_DIST = LICENSE PORTING.md README.md build-aux/git-version-gen
pkgconfig_DATA = bzip3.pc
include_HEADERS = include/libbz3.h
-noinst_HEADERS = include/cm.h \
- include/common.h \
- include/crc32.h \
+noinst_HEADERS = include/common.h \
include/libsais.h \
- include/lzp.h \
- include/rle.h
include/getopt-shim.h
lib_LTLIBRARIES = libbzip3.la
-libbzip3_la_SOURCES = src/cm.c \
- src/crc32.c \
- src/libbz3.c \
- src/libsais.c \
- src/lzp.c \
- src/rle.c
+libbzip3_la_SOURCES = src/libbz3.c
bin_PROGRAMS = bzip3
bzip3_CFLAGS = $(AM_CFLAGS)
diff --git a/PORTING.md b/PORTING.md
index 6931c19..9d3c2b3 100644
--- a/PORTING.md
+++ b/PORTING.md
@@ -12,6 +12,8 @@ $ ./configure CC=i866-w64-mingw32-gcc --host i686-w64-mingw32
$ make
```
+If building a dynamic library is desired, change `PUBLIC_API` in `include/common.h` to `__declspec(dllexport)` or `__attribute__((visibility("default")))` depending on your compiler.
+
## M1 MacOS
Clang doesn't support `-march=native -mtune=native`, so you should remove them.
diff --git a/configure.ac b/configure.ac
index 4848d2e..940879a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -19,6 +19,8 @@ PKG_INSTALLDIR
AC_CHECK_HEADERS([getopt.h])
AC_CHECK_FUNCS([getopt_long])
+AC_C_RESTRICT
+
AC_ARG_WITH([pthread],
AS_HELP_STRING([--without-pthread], [Disable use of pthread library]))
AM_CONDITIONAL([WITH_PTHREAD], [test x"$with_pthread" != xno])
diff --git a/include/cm.h b/include/cm.h
deleted file mode 100644
index 1b3a806..0000000
--- a/include/cm.h
+++ /dev/null
@@ -1,23 +0,0 @@
-
-#ifndef _CM_H
-#define _CM_H
-
-#include <inttypes.h>
-#include <stdint.h>
-
-#include "common.h"
-
-typedef struct {
- u32 low, high, code;
- s32 c1, c2, run;
- u8 *in_queue, *out_queue;
- s32 input_ptr, output_ptr, input_max;
-
- u16 C0[256], C1[256][256], C2[512][17];
-} state;
-
-void begin(state * s);
-void encode_bytes(state * s, u8 * c, s32 size);
-void decode_bytes(state * s, u8 * c, s32 size);
-
-#endif
diff --git a/include/common.h b/include/common.h
index 04b5649..e59978b 100644
--- a/include/common.h
+++ b/include/common.h
@@ -46,11 +46,7 @@ static void write_neutral_s32(u8 * data, s32 value) {
data[3] = (value >> 24) & 0xFF;
}
-#ifdef __MINGW32__
- #define PUBLIC_API __declspec(dllexport)
-#else
- #define PUBLIC_API
-#endif
+#define PUBLIC_API
#if defined(__GNUC__) || defined(__clang__)
#define RESTRICT __restrict__
@@ -61,4 +57,69 @@ static void write_neutral_s32(u8 * data, s32 value) {
#warning Your compiler, configuration or platform might not be supported.
#endif
+#if defined(__has_builtin) /* Prefer the compiler's own feature probe when it provides one. */
+ #if __has_builtin(__builtin_prefetch)
+ #define HAS_BUILTIN_PREFECTCH /* NOTE(review): the spelling "PREFECTCH" is what the prefetch selection below tests; rename both together or not at all. */
+ #endif
+#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) /* No __has_builtin: fall back to a GCC >= 3.2 version check. */
+ #define HAS_BUILTIN_PREFECTCH
+#endif
+
+#if defined(__has_builtin) /* Same two-step probe for __builtin_bswap16. */
+ #if __has_builtin(__builtin_bswap16)
+ #define HAS_BUILTIN_BSWAP16
+ #endif
+#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5)) /* Version fallback: GCC >= 4.8. */
+ #define HAS_BUILTIN_BSWAP16
+#endif
+
+#if defined(HAS_BUILTIN_PREFECTCH) /* prefetch()/prefetchw(): cache-prefetch hints for an address; the compiler builtin is tried first. */
+ #define prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) /* second argument 0: prefetch for reading */
+ #define prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) /* second argument 1: prefetch for writing */
+#elif defined(_M_IX86) || defined(_M_AMD64) /* x86 / x64 target macros (MSVC-style); intrinsics come from <intrin.h> */
+ #include <intrin.h>
+ #define prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+ #define prefetchw(address) _m_prefetchw((const void *)(address))
+#elif defined(_M_ARM) /* 32-bit ARM target macro */
+ #include <intrin.h>
+ #define prefetch(address) __prefetch((const void *)(address))
+ #define prefetchw(address) __prefetchw((const void *)(address))
+#elif defined(_M_ARM64) /* 64-bit ARM target macro */
+ #include <intrin.h>
+ #define prefetch(address) __prefetch2((const void *)(address), 1) /* NOTE(review): meaning of the literal 1 / 17 selectors is not visible here; confirm against the intrinsic's documentation */
+ #define prefetchw(address) __prefetch2((const void *)(address), 17)
+#else /* No known prefetch primitive: the build is rejected instead of compiling prefetch() away. */
+ #error Your compiler, configuration or platform is not supported.
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) /* Only probe when neither macro is already defined; the goal is to end up with exactly one of them. */
+ #if defined(_LITTLE_ENDIAN) || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) || \
+ (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) || \
+ (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
+ (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+ #define __LITTLE_ENDIAN__ /* one of the little-endian spellings above matched */
+ #elif defined(_BIG_ENDIAN) || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) || \
+ (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) || \
+ (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
+ (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+ #define __BIG_ENDIAN__ /* one of the big-endian spellings above matched */
+ #elif defined(_WIN32) /* no byte-order macro found: Windows targets are treated as little-endian */
+ #define __LITTLE_ENDIAN__
+ #endif
+#endif /* if nothing matched, neither macro is defined and the bswap16 selection that follows stops the build with #error */
+
+/*
+ * bswap16(x): despite the name this is not an unconditional byte swap. On a
+ * little-endian host it exchanges the two bytes of a 16-bit value; on a
+ * big-endian host it is the identity. If the host byte order could not be
+ * determined (or both macros are defined) the build is rejected.
+ */
+#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+    #if defined(HAS_BUILTIN_BSWAP16)
+        #define bswap16(x) (__builtin_bswap16(x))
+    #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+        #define bswap16(x) (_byteswap_ushort(x))
+    #else
+        /* Portable fallback. The argument is parenthesised and truncated to
+         * 16 bits before shifting so that expressions such as bswap16(a | b)
+         * and values wider than 16 bits give the same result as the builtin
+         * branches. Note that x is evaluated twice. */
+        #define bswap16(x) ((u16)((u16)((u16)(x) >> 8) | (u16)((u16)(x) << 8)))
+    #endif
+#elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__)
+    #define bswap16(x) (x)
+#else
+    #error Your compiler, configuration or platform is not supported.
+#endif
+
#endif
diff --git a/include/crc32.h b/include/crc32.h
deleted file mode 100644
index 84de554..0000000
--- a/include/crc32.h
+++ /dev/null
@@ -1,30 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU Lesser General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _CRC32_H
-#define _CRC32_H
-
-#include <inttypes.h>
-#include <stddef.h>
-
-#include "common.h"
-
-u32 crc32sum(u32 crc, u8 * buf, size_t size);
-
-#endif
diff --git a/include/getopt-shim.h b/include/getopt-shim.h
index fde2108..8c15d8c 100644
--- a/include/getopt-shim.h
+++ b/include/getopt-shim.h
@@ -1,6 +1,6 @@
/*
Copyright 2005-2014 Rich Felker, et al.
-
+
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
@@ -8,10 +8,10 @@
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
-
+
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -24,230 +24,208 @@
#ifndef _GETOPT_H
#define _GETOPT_H
-int getopt(int, char * const [], const char *);
-extern char *optarg;
+int getopt(int, char * const[], const char *);
+extern char * optarg;
extern int optind, opterr, optopt, optreset;
struct option {
- const char *name;
- int has_arg;
- int *flag;
- int val;
+ const char * name;
+ int has_arg;
+ int * flag;
+ int val;
};
-int getopt_long(int, char *const *, const char *, const struct option *, int *);
-int getopt_long_only(int, char *const *, const char *, const struct option *, int *);
+int getopt_long(int, char * const *, const char *, const struct option *, int *);
+int getopt_long_only(int, char * const *, const char *, const struct option *, int *);
-#define no_argument 0
-#define required_argument 1
-#define optional_argument 2
+#define no_argument 0
+#define required_argument 1
+#define optional_argument 2
-char *optarg;
-int optind=1, opterr=1, optopt, __optpos, optreset=0;
+char * optarg;
+int optind = 1, opterr = 1, optopt, __optpos, optreset = 0;
#define optpos __optpos
-static void __getopt_msg(const char *a, const char *b, const char *c, size_t l)
-{
- FILE *f = stderr;
- flockfile(f);
- fputs(a, f)>=0
- && fwrite(b, strlen(b), 1, f)
- && fwrite(c, 1, l, f)==l
- && putc('\n', f);
- funlockfile(f);
+static void __getopt_msg(const char * a, const char * b, const char * c, size_t l) { /* Prints a, then b, then the first l bytes of c,
+   then a newline to stderr. Used by the parsers below as: program name, message text, offending option. */
+ FILE * f = stderr;
+ flockfile(f); /* take the stream lock for the whole message so the four writes stay together */
+ fputs(a, f) >= 0 && fwrite(b, strlen(b), 1, f) && fwrite(c, 1, l, f) == l && putc('\n', f); /* && chain stops at the first failed write; the overall result is deliberately unused */
+ funlockfile(f);
+}
-int getopt(int argc, char * const argv[], const char *optstring)
-{
- int i, c, d;
- int k, l;
- char *optchar;
-
- if (!optind || optreset) {
- optreset = 0;
- __optpos = 0;
- optind = 1;
- }
-
- if (optind >= argc || !argv[optind])
- return -1;
-
- if (argv[optind][0] != '-') {
- if (optstring[0] == '-') {
- optarg = argv[optind++];
- return 1;
- }
- return -1;
- }
-
- if (!argv[optind][1])
- return -1;
-
- if (argv[optind][1] == '-' && !argv[optind][2])
- return optind++, -1;
-
- if (!optpos) optpos++;
- c = argv[optind][optpos], k = 1;
- optchar = argv[optind]+optpos;
- optopt = c;
- optpos += k;
-
- if (!argv[optind][optpos]) {
- optind++;
- optpos = 0;
- }
-
- if (optstring[0] == '-' || optstring[0] == '+')
- optstring++;
-
- i = 0;
- d = 0;
- do {
- d = optstring[i], l = 1;
- if (l>0) i+=l; else i++;
- } while (l && d != c);
-
- if (d != c) {
- if (optstring[0] != ':' && opterr)
- __getopt_msg(argv[0], ": unrecognized option: ", optchar, k);
- return '?';
- }
- if (optstring[i] == ':') {
- if (optstring[i+1] == ':') optarg = 0;
- else if (optind >= argc) {
- if (optstring[0] == ':') return ':';
- if (opterr) __getopt_msg(argv[0],
- ": option requires an argument: ",
- optchar, k);
- return '?';
- }
- if (optstring[i+1] != ':' || optpos) {
- optarg = argv[optind++] + optpos;
- optpos = 0;
- }
- }
- return c;
+/*
+ * Parses the next short option from argv (single-byte option characters only).
+ *
+ * Returns the option character on success; -1 when there is nothing more to
+ * parse (end of argv, a non-option argument, a lone "-", or "--", which is
+ * consumed); 1 for a non-option argument when optstring starts with '-'
+ * (optarg then points at that argument); '?' for an unrecognized option or a
+ * missing required argument; ':' for a missing required argument when
+ * optstring, after an optional leading '-' or '+', starts with ':'.
+ *
+ * Reads and updates the globals optind, optopt, optarg and the in-argument
+ * cursor optpos. Setting optind to 0 or optreset to non-zero before a call
+ * restarts the scan at argv[1].
+ */
+int getopt(int argc, char * const argv[], const char * optstring) {
+    int i, c, d;
+    int k;
+    char * optchar;
+
+    if (!optind || optreset) {
+        optreset = 0;
+        __optpos = 0;
+        optind = 1;
+    }
+
+    if (optind >= argc || !argv[optind]) return -1;
+
+    if (argv[optind][0] != '-') {
+        if (optstring[0] == '-') {
+            optarg = argv[optind++];
+            return 1;
+        }
+        return -1;
+    }
+
+    if (!argv[optind][1]) return -1;
+
+    if (argv[optind][1] == '-' && !argv[optind][2]) return optind++, -1;
+
+    /* Pick up the next option character inside the current "-abc" cluster. */
+    if (!optpos) optpos++;
+    c = argv[optind][optpos], k = 1;
+    optchar = argv[optind] + optpos;
+    optopt = c;
+    optpos += k;
+
+    /* Cluster exhausted: continue with the next argv element on the next call. */
+    if (!argv[optind][optpos]) {
+        optind++;
+        optpos = 0;
+    }
+
+    if (optstring[0] == '-' || optstring[0] == '+') optstring++;
+
+    /* Look c up in optstring. The scan must stop at the terminating NUL:
+     * the previous loop kept going when c was not listed and read past the
+     * end of optstring. On exit i indexes the character after the one last
+     * examined, so optstring[i] is only inspected below when c was found. */
+    i = 0;
+    do {
+        d = optstring[i++];
+    } while (d && d != c);
+
+    /* ':' is the argument marker inside optstring, never an option itself. */
+    if (d != c || c == ':') {
+        if (optstring[0] != ':' && opterr) __getopt_msg(argv[0], ": unrecognized option: ", optchar, k);
+        return '?';
+    }
+    if (optstring[i] == ':') {
+        if (optstring[i + 1] == ':')
+            optarg = 0; /* "x::" - optional argument, only taken from the same argv element */
+        else if (optind >= argc) {
+            if (optstring[0] == ':') return ':';
+            if (opterr) __getopt_msg(argv[0], ": option requires an argument: ", optchar, k);
+            return '?';
+        }
+        if (optstring[i + 1] != ':' || optpos) {
+            /* Argument is the rest of this element (optpos != 0) or the next element. */
+            optarg = argv[optind++] + optpos;
+            optpos = 0;
+        }
+    }
+    return c;
+}
-static void permute(char *const *argv, int dest, int src)
-{
- char **av = (char **)argv;
- char *tmp = av[src];
- int i;
- for (i=src; i>dest; i--)
- av[i] = av[i-1];
- av[dest] = tmp;
+static void permute(char * const * argv, int dest, int src) { /* Moves argv[src] to index dest and shifts argv[dest..src-1]
+   up by one slot, i.e. rotates argv[dest..src] right by one. Callers pass dest <= src. */
+ char ** av = (char **)argv; /* drops the const on the elements so the vector can be reordered in place */
+ char * tmp = av[src];
+ int i;
+ for (i = src; i > dest; i--) av[i] = av[i - 1]; /* walk downwards so no element is overwritten before it is copied */
+ av[dest] = tmp;
+}
-static int __getopt_long_core(int argc, char *const *argv, const char *optstring, const struct option *longopts, int *idx, int longonly)
-{
- optarg = 0;
- if (longopts && argv[optind][0] == '-' &&
- ((longonly && argv[optind][1] && argv[optind][1] != '-') ||
- (argv[optind][1] == '-' && argv[optind][2])))
- {
- int colon = optstring[optstring[0]=='+'||optstring[0]=='-']==':';
- int i, cnt, match;
- char *opt;
- for (cnt=i=0; longopts[i].name; i++) {
- const char *name = longopts[i].name;
- opt = argv[optind]+1;
- if (*opt == '-') opt++;
- for (; *name && *name == *opt; name++, opt++);
- if (*opt && *opt != '=') continue;
- match = i;
- if (!*name) {
- cnt = 1;
- break;
- }
- cnt++;
- }
- if (cnt==1) {
- i = match;
- optind++;
- optopt = longopts[i].val;
- if (*opt == '=') {
- if (!longopts[i].has_arg) {
- if (colon || !opterr)
- return '?';
- __getopt_msg(argv[0],
- ": option does not take an argument: ",
- longopts[i].name,
- strlen(longopts[i].name));
- return '?';
- }
- optarg = opt+1;
- } else if (longopts[i].has_arg == required_argument) {
- if (!(optarg = argv[optind])) {
- if (colon) return ':';
- if (!opterr) return '?';
- __getopt_msg(argv[0],
- ": option requires an argument: ",
- longopts[i].name,
- strlen(longopts[i].name));
- return '?';
- }
- optind++;
- }
- if (idx) *idx = i;
- if (longopts[i].flag) {
- *longopts[i].flag = longopts[i].val;
- return 0;
- }
- return longopts[i].val;
- }
- if (argv[optind][1] == '-') {
- if (!colon && opterr)
- __getopt_msg(argv[0], cnt ?
- ": option is ambiguous: " :
- ": unrecognized option: ",
- argv[optind]+2,
- strlen(argv[optind]+2));
- optind++;
- return '?';
- }
- }
- return getopt(argc, argv, optstring);
+/*
+ * Tries to parse argv[optind] as a long option and falls back to getopt()
+ * for everything else. The caller guarantees optind < argc and a non-NULL
+ * argv[optind].
+ *
+ * A long option is "--name" or, when longonly is non-zero, also "-name".
+ * Names may be abbreviated to any prefix; an exact match wins immediately,
+ * otherwise the prefix must match exactly one entry of longopts. A value can
+ * be attached as "--name=value" or, for required_argument entries, supplied
+ * as the following argv element.
+ *
+ * Returns the entry's val (or 0 after storing val through its flag pointer),
+ * '?' on error, ':' for a missing argument when optstring asks for that, or
+ * whatever getopt() returns on the short-option path. *idx receives the index
+ * of the matched entry when idx is non-NULL.
+ */
+static int __getopt_long_core(int argc, char * const * argv, const char * optstring, const struct option * longopts,
+                              int * idx, int longonly) {
+    optarg = 0;
+    if (longopts && argv[optind][0] == '-' &&
+        ((longonly && argv[optind][1] && argv[optind][1] != '-') || (argv[optind][1] == '-' && argv[optind][2]))) {
+        /* Leading ':' (after an optional '+'/'-') selects silent error reporting. */
+        int colon = optstring[optstring[0] == '+' || optstring[0] == '-'] == ':';
+        int i, cnt, match = 0;
+        char * opt;
+        char * arg = 0;
+        for (cnt = i = 0; longopts[i].name; i++) {
+            const char * name = longopts[i].name;
+            opt = argv[optind] + 1;
+            if (*opt == '-') opt++;
+            for (; *name && *name == *opt; name++, opt++)
+                ;
+            if (*opt && *opt != '=') continue;
+            /* Remember where the matched text ended for THIS candidate. opt is
+             * reset for every later candidate, so using it after the loop lost
+             * the "=value" part of abbreviated options. */
+            arg = opt;
+            match = i;
+            if (!*name) {
+                cnt = 1;
+                break;
+            }
+            cnt++;
+        }
+        if (cnt == 1) {
+            i = match;
+            opt = arg;
+            optind++;
+            optopt = longopts[i].val;
+            if (*opt == '=') {
+                if (!longopts[i].has_arg) {
+                    if (colon || !opterr) return '?';
+                    __getopt_msg(argv[0], ": option does not take an argument: ", longopts[i].name,
+                                 strlen(longopts[i].name));
+                    return '?';
+                }
+                optarg = opt + 1;
+            } else if (longopts[i].has_arg == required_argument) {
+                if (!(optarg = argv[optind])) {
+                    if (colon) return ':';
+                    if (!opterr) return '?';
+                    __getopt_msg(argv[0], ": option requires an argument: ", longopts[i].name,
+                                 strlen(longopts[i].name));
+                    return '?';
+                }
+                optind++;
+            }
+            if (idx) *idx = i;
+            if (longopts[i].flag) {
+                *longopts[i].flag = longopts[i].val;
+                return 0;
+            }
+            return longopts[i].val;
+        }
+        /* "--something" that matched zero or several entries is an error; a
+         * single-dash argument in longonly mode falls through to getopt(). */
+        if (argv[optind][1] == '-') {
+            if (!colon && opterr)
+                __getopt_msg(argv[0], cnt ? ": option is ambiguous: " : ": unrecognized option: ", argv[optind] + 2,
+                             strlen(argv[optind] + 2));
+            optind++;
+            return '?';
+        }
+    }
+    return getopt(argc, argv, optstring);
+}
-static int __getopt_long(int argc, char *const *argv, const char *optstring, const struct option *longopts, int *idx, int longonly)
-{
- int ret, skipped, resumed;
- if (!optind || optreset) {
- optreset = 0;
- __optpos = 0;
- optind = 1;
- }
- if (optind >= argc || !argv[optind]) return -1;
- skipped = optind;
- if (optstring[0] != '+' && optstring[0] != '-') {
- int i;
- for (i=optind; ; i++) {
- if (i >= argc || !argv[i]) return -1;
- if (argv[i][0] == '-' && argv[i][1]) break;
- }
- optind = i;
- }
- resumed = optind;
- ret = __getopt_long_core(argc, argv, optstring, longopts, idx, longonly);
- if (resumed > skipped) {
- int i, cnt = optind-resumed;
- for (i=0; i<cnt; i++)
- permute(argv, skipped, optind-1);
- optind = skipped + cnt;
- }
- return ret;
+static int __getopt_long(int argc, char * const * argv, const char * optstring, const struct option * longopts,
+ int * idx, int longonly) { /* Shared driver for getopt_long()/getopt_long_only(): unless optstring starts with '+' or '-',
+   options may appear after non-option arguments, and argv is reordered so the parsed options end up in front of them. */
+ int ret, skipped, resumed;
+ if (!optind || optreset) { /* restart request: rewind to argv[1] */
+ optreset = 0;
+ __optpos = 0;
+ optind = 1;
+ }
+ if (optind >= argc || !argv[optind]) return -1;
+ skipped = optind; /* first argument not yet consumed */
+ if (optstring[0] != '+' && optstring[0] != '-') {
+ int i;
+ for (i = optind;; i++) { /* skip ahead to the next argument that looks like an option ("-x", not a lone "-") */
+ if (i >= argc || !argv[i]) return -1; /* none left: parsing is finished, optind is left at the first non-option */
+ if (argv[i][0] == '-' && argv[i][1]) break;
+ }
+ optind = i;
+ }
+ resumed = optind; /* where parsing actually starts */
+ ret = __getopt_long_core(argc, argv, optstring, longopts, idx, longonly);
+ if (resumed > skipped) { /* non-options were skipped: move the cnt elements just consumed in front of them */
+ int i, cnt = optind - resumed;
+ for (i = 0; i < cnt; i++) permute(argv, skipped, optind - 1); /* each call moves the last consumed element to index skipped, so their relative order is preserved */
+ optind = skipped + cnt;
+ }
+ return ret;
+}
-int getopt_long(int argc, char *const *argv, const char *optstring, const struct option *longopts, int *idx)
-{
- return __getopt_long(argc, argv, optstring, longopts, idx, 0);
+int getopt_long(int argc, char * const * argv, const char * optstring, const struct option * longopts, int * idx) { /* Long options must be written with a leading "--" (longonly = 0). */
+ return __getopt_long(argc, argv, optstring, longopts, idx, 0);
+}
-int getopt_long_only(int argc, char *const *argv, const char *optstring, const struct option *longopts, int *idx)
-{
- return __getopt_long(argc, argv, optstring, longopts, idx, 1);
+int getopt_long_only(int argc, char * const * argv, const char * optstring, const struct option * longopts, int * idx) { /* Like getopt_long(), but "-name" is also tried as a long option (longonly = 1). */
+ return __getopt_long(argc, argv, optstring, longopts, idx, 1);
+}
#endif
diff --git a/include/libsais.h b/include/libsais.h
index dc3b704..d4a2d80 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -26,216 +26,5397 @@ Please see the file LICENSE for full copyright information.
#include "common.h"
-/**
- * Creates the libsais context that allows reusing allocated memory with each
- * libsais operation. In multi-threaded environments, use one context per thread
- * for parallel executions.
- * @return the libsais context, NULL otherwise.
- */
-void * libsais_create_ctx(void);
-
-/**
- * Destroys the libsass context and free previusly allocated memory.
- * @param ctx The libsais context (can be NULL).
- */
-void libsais_free_ctx(void * ctx);
-
-/**
- * Constructs the suffix array of a given string.
- * @param T [0..n-1] The input string.
- * @param SA [0..n-1+fs] The output array of suffixes.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of SA array (0 should be
- * enough for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq);
-
-/**
- * Constructs the suffix array of a given integer array.
- * Note, during construction input array will be modified, but restored at the
- * end if no errors occurred.
- * @param T [0..n-1] The input integer array.
- * @param SA [0..n-1+fs] The output array of suffixes.
- * @param n The length of the integer array.
- * @param k The alphabet size of the input integer array.
- * @param fs Extra space available at the end of SA array (can be 0, but 4k or
- * better 6k is recommended for optimal performance).
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs);
-
-/**
- * Constructs the suffix array of a given string using libsais context.
- * @param ctx The libsais context.
- * @param T [0..n-1] The input string.
- * @param SA [0..n-1+fs] The output array of suffixes.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of SA array (0 should be
- * enough for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq);
-
-/**
- * Constructs the burrows-wheeler transformed string (BWT) of a given string.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n-1+fs] The temporary array.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of A array (0 should be enough
- * for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @return The primary index if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
-
-/**
- * Constructs the burrows-wheeler transformed string (BWT) of a given string
- * with auxiliary indexes.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n-1+fs] The temporary array.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of A array (0 should be enough
- * for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @param r The sampling rate for auxiliary indexes (must be power of 2).
- * @param I [0..(n-1)/r] The output auxiliary indexes.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I);
-
-/**
- * Constructs the burrows-wheeler transformed string (BWT) of a given string
- * using libsais context.
- * @param ctx The libsais context.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n-1+fs] The temporary array.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of A array (0 should be enough
- * for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @return The primary index if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
-
-/**
- * Constructs the burrows-wheeler transformed string (BWT) of a given string
- * with auxiliary indexes using libsais context.
- * @param ctx The libsais context.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n-1+fs] The temporary array.
- * @param n The length of the given string.
- * @param fs The extra space available at the end of A array (0 should be enough
- * for most cases).
- * @param freq [0..255] The output symbol frequency table (can be NULL).
- * @param r The sampling rate for auxiliary indexes (must be power of 2).
- * @param I [0..(n-1)/r] The output auxiliary indexes.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I);
-
-/**
- * Creates the libsais reverse BWT context that allows reusing allocated memory
- * with each libsais_unbwt_* operation. In multi-threaded environments, use one
- * context per thread for parallel executions.
- * @return the libsais context, NULL otherwise.
- */
-void * libsais_unbwt_create_ctx(void);
-
-/**
- * Destroys the libsass reverse BWT context and free previusly allocated memory.
- * @param ctx The libsais context (can be NULL).
- */
-void libsais_unbwt_free_ctx(void * ctx);
-
-/**
- * Constructs the original string from a given burrows-wheeler transformed
- * string (BWT) with primary index.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
- * size).
- * @param n The length of the given string.
- * @param freq [0..255] The input symbol frequency table (can be NULL).
- * @param i The primary index.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i);
-
-/**
- * Constructs the original string from a given burrows-wheeler transformed
- * string (BWT) with primary index using libsais reverse BWT context.
- * @param ctx The libsais reverse BWT context.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
- * size).
- * @param n The length of the given string.
- * @param freq [0..255] The input symbol frequency table (can be NULL).
- * @param i The primary index.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i);
-
-/**
- * Constructs the original string from a given burrows-wheeler transformed
- * string (BWT) with auxiliary indexes.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
- * size).
- * @param n The length of the given string.
- * @param freq [0..255] The input symbol frequency table (can be NULL).
- * @param r The sampling rate for auxiliary indexes (must be power of 2).
- * @param I [0..(n-1)/r] The input auxiliary indexes.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I);
-
-/**
- * Constructs the original string from a given burrows-wheeler transformed
- * string (BWT) with auxiliary indexes using libsais reverse BWT context.
- * @param ctx The libsais reverse BWT context.
- * @param T [0..n-1] The input string.
- * @param U [0..n-1] The output string (can be T).
- * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1
- * size).
- * @param n The length of the given string.
- * @param freq [0..255] The input symbol frequency table (can be NULL).
- * @param r The sampling rate for auxiliary indexes (must be power of 2).
- * @param I [0..(n-1)/r] The input auxiliary indexes.
- * @return 0 if no error occurred, -1 or -2 otherwise.
- */
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
- const s32 * I);
-
-/**
- * Constructs the permuted longest common prefix array (PLCP) of a given string
- * and a suffix array.
- * @param T [0..n-1] The input string.
- * @param SA [0..n-1] The input suffix array.
- * @param PLCP [0..n-1] The output permuted longest common prefix array.
- * @param n The length of the string and the suffix array.
- * @return 0 if no error occurred, -1 otherwise.
- */
-s32 libsais_plcp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n);
-
-/**
- * Constructs the longest common prefix array (LCP) of a given permuted longest
- * common prefix array (PLCP) and a suffix array.
- * @param PLCP [0..n-1] The input permuted longest common prefix array.
- * @param SA [0..n-1] The input suffix array.
- * @param LCP [0..n-1] The output longest common prefix array (can be SA).
- * @param n The length of the permuted longest common prefix array and the
- * suffix array.
- * @return 0 if no error occurred, -1 otherwise.
- */
-s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n);
+/* libsais source code amalgamate. */
+
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED(_x) (void)(_x) /* marks a value as deliberately unused */
+
+typedef s32 sa_sint_t; /* signed element type of the suffix array; SAINT_BIT below assumes it is 32 bits wide */
+typedef u32 sa_uint_t; /* unsigned counterpart */
+typedef ptrdiff_t fast_sint_t; /* pointer-sized signed integer used for indices and loop counters */
+typedef size_t fast_uint_t; /* pointer-sized unsigned integer */
+
+#define SAINT_BIT (32)
+#define SAINT_MAX INT32_MAX
+#define SAINT_MIN INT32_MIN
+
+#define ALPHABET_SIZE (1 << CHAR_BIT) /* number of distinct byte values (256 when CHAR_BIT is 8) */
+#define UNBWT_FASTBITS (17)
+
+#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
+#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) /* evaluates to 1 << 30 */
+
+#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) /* 2 * c + s */
+#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) /* 4 * c + s */
+
+#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) /* number of LIBSAIS_THREAD_CACHE entries allocated per thread */
+
+typedef struct LIBSAIS_THREAD_CACHE { /* One (symbol, index) pair; libsais_alloc_thread_state() hands each thread LIBSAIS_PER_THREAD_CACHE_SIZE of these. */
+ sa_sint_t symbol;
+ sa_sint_t index;
+} LIBSAIS_THREAD_CACHE;
+
+typedef union LIBSAIS_THREAD_STATE { /* Per-thread working state; a union so that every array element is at least 64 bytes. */
+ struct {
+ fast_sint_t position;
+ fast_sint_t count;
+
+ fast_sint_t m;
+ fast_sint_t last_lms_suffix;
+
+ sa_sint_t * buckets; /* this thread's slice (4 * ALPHABET_SIZE entries) of one shared allocation */
+ LIBSAIS_THREAD_CACHE * cache; /* this thread's slice (LIBSAIS_PER_THREAD_CACHE_SIZE entries) of one shared allocation */
+ } state;
+
+ u8 padding[64]; /* presumably pads each element to a cache line to avoid false sharing; TODO confirm */
+} LIBSAIS_THREAD_STATE;
+
+typedef struct LIBSAIS_CONTEXT { /* Reusable allocations for suffix-array construction; built by libsais_create_ctx_main(). */
+ sa_sint_t * buckets; /* 8 * ALPHABET_SIZE entries */
+ LIBSAIS_THREAD_STATE * thread_state; /* NULL when threads == 1 */
+ fast_sint_t threads;
+} LIBSAIS_CONTEXT;
+
+typedef struct LIBSAIS_UNBWT_CONTEXT { /* Working buffers for the inverse BWT; NOTE(review): their sizes are set where the context is created, which is not visible in this part of the file. */
+ sa_uint_t * bucket2;
+ u16 * fastbits;
+ sa_uint_t * buckets;
+ fast_sint_t threads;
+} LIBSAIS_UNBWT_CONTEXT;
+
+static void * libsais_align_up(const void * address, size_t alignment) { /* Rounds address up to the next multiple of alignment (unchanged if already aligned). */
+ return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); /* add alignment - 1, then clear the low bits; the -alignment mask is only correct when alignment is a power of two */
+}
+
+static void * libsais_alloc_aligned(size_t size, size_t alignment) { /* malloc()-backed aligned allocation: returns size usable bytes
+   aligned to alignment, or NULL on failure. The result must be released with libsais_free_aligned(), not free(). */
+ void * address = malloc(size + sizeof(short) + alignment - 1); /* over-allocate: worst-case alignment slack plus room for a short in front of the block; NOTE(review): the sum is not checked for size_t overflow */
+ if (address != NULL) {
+ void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); /* step past sizeof(short) first so the slot just before the aligned address lies inside the allocation */
+ ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); /* record the distance back to the malloc() pointer; it must fit in a short - the callers visible here use alignments of 64 and 4096 */
+
+ return aligned_address;
+ }
+
+ return NULL;
+}
+
+static void libsais_free_aligned(void * aligned_address) { /* Releases a block obtained from libsais_alloc_aligned(); NULL is accepted and ignored. */
+ if (aligned_address != NULL) {
+ free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); /* step back by the offset stored in front of the block to recover the pointer malloc() returned */
+ }
+}
+
+static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) { /* Allocates the state array for `threads` threads plus two shared
+   backing arrays (buckets and cache) and points every element at its own slice. Returns NULL if any allocation fails. */
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state =
+ (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+ sa_sint_t * RESTRICT thread_buckets =
+ (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
+ (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+ if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) {
+ fast_sint_t t;
+ for (t = 0; t < threads; ++t) { /* carve the two big arrays into consecutive per-thread slices */
+ thread_state[t].state.buckets = thread_buckets;
+ thread_buckets += 4 * ALPHABET_SIZE;
+ thread_state[t].state.cache = thread_cache;
+ thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
+ }
+
+ return thread_state; /* element 0 holds the base pointers of both arrays; libsais_free_thread_state() relies on that */
+ }
+
+ libsais_free_aligned(thread_cache); /* failure path: the pointers were not advanced, and NULL is accepted by libsais_free_aligned() */
+ libsais_free_aligned(thread_buckets);
+ libsais_free_aligned(thread_state);
+ return NULL;
+}
+
+static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) { /* Releases everything libsais_alloc_thread_state() allocated; NULL is ignored. */
+ if (thread_state != NULL) {
+ libsais_free_aligned(thread_state[0].state.cache); /* element 0 points at the start of the shared cache array */
+ libsais_free_aligned(thread_state[0].state.buckets); /* ...and at the start of the shared buckets array */
+ libsais_free_aligned(thread_state);
+ }
+}
+
+static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) { /* Builds a context for the given thread count; returns NULL on
+   allocation failure. A thread count below 1 also yields NULL, because the success test below needs either thread state or threads == 1. */
+ LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
+ sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; /* per-thread state is only allocated for more than one thread */
+
+ if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) {
+ ctx->buckets = buckets;
+ ctx->threads = threads;
+ ctx->thread_state = thread_state;
+
+ return ctx;
+ }
+
+ libsais_free_thread_state(thread_state); /* failure path: release whatever was obtained; each helper accepts NULL */
+ libsais_free_aligned(buckets);
+ libsais_free_aligned(ctx);
+ return NULL;
+}
+
+/*
+ * Destroys a context created by libsais_create_ctx_main(): per-thread state first,
+ * then the bucket block, then the context record. A NULL context is ignored.
+ */
+static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    libsais_free_thread_state(ctx->thread_state);
+    libsais_free_aligned(ctx->buckets);
+    libsais_free_aligned(ctx);
+}
+static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Scans the block [omp_block_start, omp_block_start + omp_block_size) of the byte text T
+ * from its last position towards its first and stores the positions that pass the LMS
+ * test into SA, filling downwards starting at SA[m].
+ *
+ * Each candidate position is written to SA[m] unconditionally; m is decremented only when
+ * the candidate passes the (s & 3) == 1 test, so a rejected candidate is simply overwritten
+ * by the next store. s is a shift register of comparison results: the bit pushed for
+ * position i is 1 when T[i] > T[i + 1], or when they are equal and the previous bit was 1.
+ *
+ * Nothing is done when omp_block_size <= 0.
+ */
+ if (omp_block_size > 0) {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { /* look past the block for the first character that differs from c0 */
+ ++j;
+ }
+
+ fast_uint_t s = c0 >= c1; /* seed the history bit for the last position of the block */
+
+ for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { /* four positions per pass */
+ prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1); /* tentative store of candidate i + 1 */
+ m -= ((s & 3) == 1); /* keep it only if the new bit is 1 and the one before is 0 */
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 0);
+ m -= ((s & 3) == 1);
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 1);
+ m -= ((s & 3) == 1);
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 2);
+ m -= ((s & 3) == 1);
+ }
+
+ for (j -= 3; i >= j; i -= 1) { /* j is omp_block_start again: finish the leftover positions one by one */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ }
+
+ SA[m] = (sa_sint_t)(i + 1); /* last candidate is stored without a test; m is left unchanged */
+ }
+}
+
+/*
+ * Driver for libsais_gather_lms_suffixes_8u(). The thread count is fixed at one here,
+ * so a single block covering [0, n) is processed and the output cursor starts at
+ * SA[n - 1]. threads and thread_state are accepted but not used.
+ */
+static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                               sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = (fast_sint_t)n - block_start;
+
+    libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, block_start, block_size);
+}
+
+static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { /*
+ * Scans the whole integer text T[0..n-1] from right to left and stores the positions that
+ * pass the LMS test into SA, filling downwards from SA[n - 1].
+ *
+ * Candidates are written to SA[m] unconditionally and m is decremented only when
+ * (s & 3) == 1, so rejected candidates are overwritten by the next store.
+ *
+ * Returns n - 1 - m, i.e. the number of times m was decremented.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1; /* history starts with a 1 bit for the last position */
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4) { /* four positions per pass */
+ prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); /* push 1 if T[i] > T[i+1], or equal with previous bit 1 */
+ SA[m] = i + 1;
+ m -= ((s & 3) == 1); /* keep i + 1 when the new bit is 1 and the one before is 0 */
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 0;
+ m -= ((s & 3) == 1);
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 1;
+ m -= ((s & 3) == 1);
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 2;
+ m -= ((s & 3) == 1);
+ }
+
+ for (; i >= 0; i -= 1) { /* remaining positions, one at a time */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i + 1;
+ m -= ((s & 3) == 1);
+ }
+
+ return n - 1 - m;
+}
+
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n) { /*
+ * Variant of the right-to-left LMS gather for a text whose symbols may be negative.
+ *
+ * The scan and the tentative store to SA[m] are the same as in the plain gather, but m is
+ * decremented when (s & 3) equals the 0/1 value of (symbol >= 0), where symbol is the
+ * character at the candidate position. For a non-negative symbol that is the usual
+ * (s & 3) == 1 test.
+ * NOTE(review): negative symbols presumably flag entries to be left out; confirm with the caller.
+ *
+ * Returns n - 1 - m, i.e. the number of times m was decremented.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = n - 1;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= 3; i -= 4) { /* four positions per pass */
+ prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = i + 1;
+ m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); /* c0 is the symbol at candidate i + 1 */
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 0;
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 1;
+ m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i - 2;
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ for (; i >= 0; i -= 1) { /* remaining positions, one at a time */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = i + 1;
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ }
+
+ return n - 1 - m;
+}
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets) { /*
+ * Clears 2 * k counters and then counts every position of T[0..n-1] into
+ * buckets[BUCKETS_INDEX2(symbol, flag)], where flag is 1 when the position passes the
+ * (s & 3) == 1 LMS test and 0 otherwise. The text is scanned from right to left.
+ * Position 0 is always counted with flag 0 (last statement).
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ sa_sint_t i = n - 2;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4) { /* bound keeps the T[i - prefetch_distance - 3] lookahead read at index >= 0 */
+ prefetch(&T[i - 2 * prefetch_distance]);
+
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); /* push 1 if T[i] > T[i+1], or equal with previous bit 1 */
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; /* count position i + 1 under its symbol c0 */
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (; i >= 0; i -= 1) { /* tail without lookahead prefetches */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; /* c0 now holds T[0] */
+}
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Single right-to-left pass over the block [omp_block_start, omp_block_start + omp_block_size)
+ * of the byte text T that does two things at once:
+ *   - counts every position into buckets[BUCKETS_INDEX4(symbol, s & 3)], i.e. per symbol and
+ *     per value of the two most recent history bits (the 4 * ALPHABET_SIZE counters are
+ *     cleared first);
+ *   - stores the positions passing the (s & 3) == 1 LMS test into SA, filling downwards from
+ *     SA[omp_block_start + omp_block_size - 1].
+ *
+ * Returns the number of positions kept in SA. With omp_block_size <= 0 only the counters are
+ * cleared and 0 is returned.
+ */
+ memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0) {
+ const fast_sint_t prefetch_distance = 128;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { /* look past the block for the first character that differs from c0 */
+ ++j;
+ }
+
+ fast_uint_t s = c0 >= c1; /* seed the history bit for the last position of the block */
+
+ for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) { /* four positions per pass */
+ prefetch(&T[i - prefetch_distance]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1); /* tentative store, kept only if m moves */
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; /* count position i + 1 under its symbol c0 */
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 0);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 2);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= 3; i >= j; i -= 1) { /* j is omp_block_start again: leftover positions */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; /* character just before the block, or -1 when the block starts at 0 */
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; /* first position of the block */
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+/*
+ * Driver for libsais_count_and_gather_lms_suffixes_8u(). The thread count is fixed at
+ * one here, so the whole text is handled as a single block [0, n). Returns whatever
+ * the worker returns; threads and thread_state are accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t threads,
+                                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = (fast_sint_t)n - block_start;
+
+    return libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, block_start, block_size);
+}
+
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Integer-text counterpart of the combined count-and-gather pass, using 4 * k counters.
+ *
+ * Walks the block [omp_block_start, omp_block_start + omp_block_size) from right to left,
+ * counting each position into buckets[BUCKETS_INDEX4(symbol, s & 3)] and storing the
+ * positions that pass the (s & 3) == 1 LMS test into SA, filling downwards from
+ * SA[omp_block_start + omp_block_size - 1]. The counters are cleared first.
+ *
+ * Returns the number of positions kept in SA (0 when omp_block_size <= 0).
+ */
+ memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { /* look past the block for the first symbol that differs from c0 */
+ ++j;
+ }
+
+ fast_uint_t s = c0 >= c1; /* seed the history bit for the last position of the block */
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the lookahead reads of T inside the block */
+ prefetch(&T[i - 2 * prefetch_distance]);
+
+ prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1); /* tentative store, kept only if m moves */
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; /* count position i + 1 under its symbol c0 */
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 0);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 2);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; /* symbol just before the block, or -1 when the block starts at 0 */
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++; /* first position of the block */
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Combined count-and-gather pass for an integer text, using 2 * k counters.
+ *
+ * Walks the block [omp_block_start, omp_block_start + omp_block_size) from right to left.
+ * Each position is counted into buckets[BUCKETS_INDEX2(symbol, flag)] with flag = 1 when the
+ * position passes the (s & 3) == 1 LMS test and 0 otherwise; passing positions are also
+ * stored into SA, filling downwards from SA[omp_block_start + omp_block_size - 1].
+ * The counters are cleared first.
+ *
+ * Returns the number of positions kept in SA (0 when omp_block_size <= 0).
+ */
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { /* look past the block for the first symbol that differs from c0 */
+ ++j;
+ }
+
+ fast_uint_t s = c0 >= c1; /* seed the history bit for the last position of the block */
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the lookahead reads of T inside the block */
+ prefetch(&T[i - 2 * prefetch_distance]);
+
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1); /* tentative store, kept only if m moves */
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; /* count position i + 1 under its symbol c0 */
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 0);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 2);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; /* symbol just before the block, or -1 when the block starts at 0 */
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((s & 3) == 1);
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; /* first position of the block */
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Combined count-and-gather pass for an integer text whose symbols may be negative.
+ *
+ * Counting: every position of the block is counted into
+ * buckets[BUCKETS_INDEX2(symbol & SAINT_MAX, flag)], flag being 1 when (s & 3) == 1.
+ * Gathering: the position is kept in SA (m is decremented) when (s & 3) equals the 0/1
+ * value of (symbol >= 0), tested on the symbol before it is masked with SAINT_MAX.
+ * NOTE(review): negative symbols presumably flag entries to be left out of SA; confirm with the caller.
+ *
+ * The 2 * k counters are cleared first. Returns the number of positions kept in SA
+ * (0 when omp_block_size <= 0).
+ */
+ memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+ fast_sint_t m = omp_block_start + omp_block_size - 1;
+
+ if (omp_block_size > 0) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
+
+ while (j < n && (c1 = T[j]) == c0) { /* look past the block for the first symbol that differs from c0 */
+ ++j;
+ }
+
+ fast_uint_t s = c0 >= c1; /* seed the history bit for the last position of the block */
+
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the lookahead reads of T inside the block */
+ prefetch(&T[i - 2 * prefetch_distance]);
+
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1); /* tentative store, kept only if m moves */
+ m -= ((fast_sint_t)(s & 3) == (c0 >= 0)); /* sign of c0 is tested before it is masked */
+ c0 &= SAINT_MAX; /* clear the top bit so c0 can index the counters */
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 0);
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX;
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 1);
+ m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX;
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i - 2);
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX;
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
+ c1 &= SAINT_MAX;
+ buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
+ }
+
+ c1 = (i >= 0) ? T[i] : -1; /* symbol just before the block, or -1 when the block starts at 0 */
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ SA[m] = (sa_sint_t)(i + 1);
+ m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
+ c0 &= SAINT_MAX;
+ buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++; /* first position of the block */
+ }
+
+ return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
+}
+/*
+ * Runs libsais_count_and_gather_lms_suffixes_32s_4k() over the whole text as one
+ * block [0, n) and returns its result. threads is accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
+    (void)(threads);
+
+    return libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
+}
+
+/*
+ * Runs libsais_count_and_gather_lms_suffixes_32s_2k() over the whole text as one
+ * block [0, n) and returns its result. threads is accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads) {
+    (void)(threads);
+
+    return libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+}
+
+/*
+ * Runs libsais_count_and_gather_compacted_lms_suffixes_32s_2k() over the whole text
+ * as one block [0, n) and returns its result. threads is accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+                                                                                 sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                                 sa_sint_t k,
+                                                                                 sa_sint_t * RESTRICT buckets,
+                                                                                 sa_sint_t threads) {
+    (void)(threads);
+
+    return libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+}
+
+/*
+ * Entry point for the 4k count-and-gather pass: forwards to the _nofs_omp variant
+ * and returns its result. thread_state is accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k,
+                                                                  sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    return libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
+}
+
+/*
+ * Entry point for the 2k count-and-gather pass: forwards to the _nofs_omp variant
+ * and returns its result. thread_state is accepted but not used.
+ */
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k,
+                                                                  sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    return libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+}
+
+/*
+ * Entry point for the compacted 2k count-and-gather pass: forwards to the _nofs_omp
+ * variant. The count that variant returns is deliberately dropped, since this
+ * function returns void. thread_state is accepted but not used.
+ */
+static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    (void)libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
+}
+
+/*
+ * Builds a histogram of the integer text: clears buckets[0..k-1], then increments
+ * buckets[T[p]] for every p in [0, n). The main loop handles eight symbols per pass
+ * and prefetches T ahead of the read cursor; a scalar loop finishes the remainder.
+ */
+static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
+                                       sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t length = (fast_sint_t)n;
+    const fast_sint_t unrolled_limit = length - 7;
+    fast_sint_t pos = 0;
+
+    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
+
+    while (pos < unrolled_limit) {
+        prefetch(&T[pos + prefetch_distance]);
+
+        buckets[T[pos + 0]]++;
+        buckets[T[pos + 1]]++;
+        buckets[T[pos + 2]]++;
+        buckets[T[pos + 3]]++;
+        buckets[T[pos + 4]]++;
+        buckets[T[pos + 5]]++;
+        buckets[T[pos + 6]]++;
+        buckets[T[pos + 7]]++;
+
+        pos += 8;
+    }
+
+    while (pos < length) {
+        buckets[T[pos]]++;
+        pos += 1;
+    }
+}
+
+/*
+ * Turns the four per-symbol counters at the front of buckets into bucket boundaries.
+ * For each symbol, in increasing order, the running total before the symbol goes to
+ * buckets[6 * ALPHABET_SIZE + symbol] and the total after it to
+ * buckets[7 * ALPHABET_SIZE + symbol]. When freq is not NULL the symbol's own count
+ * (sum of its four counters) is also written to freq[symbol].
+ */
+static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
+    sa_sint_t * RESTRICT first_slot = &buckets[6 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT past_slot = &buckets[7 * ALPHABET_SIZE];
+
+    sa_sint_t running = 0;
+    fast_sint_t symbol = 0;
+    fast_sint_t base = BUCKETS_INDEX4(0, 0);
+
+    while (base <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0)) {
+        first_slot[symbol] = running;
+
+        const sa_sint_t occurrences = buckets[base + BUCKETS_INDEX4(0, 0)] + buckets[base + BUCKETS_INDEX4(0, 1)] +
+                                      buckets[base + BUCKETS_INDEX4(0, 2)] + buckets[base + BUCKETS_INDEX4(0, 3)];
+
+        if (freq != NULL) {
+            freq[symbol] = occurrences;
+        }
+
+        running += occurrences;
+        past_slot[symbol] = running;
+
+        base += BUCKETS_INDEX4(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Derives bucket boundaries from the four per-symbol counters held in the first
+ * 4 * k entries of buckets. For each symbol, in increasing order, the running total
+ * before the symbol is stored at buckets[4 * k + symbol] and the total after it at
+ * buckets[5 * k + symbol].
+ */
+static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    sa_sint_t * RESTRICT first_slot = &buckets[4 * k];
+    sa_sint_t * RESTRICT past_slot = &buckets[5 * k];
+
+    sa_sint_t running = 0;
+    fast_sint_t symbol = 0;
+    fast_sint_t base = BUCKETS_INDEX4(0, 0);
+
+    while (base <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0)) {
+        first_slot[symbol] = running;
+
+        running += buckets[base + BUCKETS_INDEX4(0, 0)] + buckets[base + BUCKETS_INDEX4(0, 1)] +
+                   buckets[base + BUCKETS_INDEX4(0, 2)] + buckets[base + BUCKETS_INDEX4(0, 3)];
+
+        past_slot[symbol] = running;
+
+        base += BUCKETS_INDEX4(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Derives bucket boundaries from the two per-symbol counters held in the first
+ * 2 * k entries of buckets. For each symbol, in increasing order, the running total
+ * before the symbol is stored at buckets[2 * k + symbol] and the total after it at
+ * buckets[3 * k + symbol].
+ */
+static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    sa_sint_t * RESTRICT first_slot = &buckets[2 * k];
+    sa_sint_t * RESTRICT past_slot = &buckets[3 * k];
+
+    sa_sint_t running = 0;
+    fast_sint_t symbol = 0;
+    fast_sint_t base = BUCKETS_INDEX2(0, 0);
+
+    while (base <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0)) {
+        first_slot[symbol] = running;
+
+        running += buckets[base + BUCKETS_INDEX2(0, 0)] + buckets[base + BUCKETS_INDEX2(0, 1)];
+
+        past_slot[symbol] = running;
+
+        base += BUCKETS_INDEX2(1, 0);
+        symbol += 1;
+    }
+}
+
+/*
+ * Replaces the first counter of every symbol's pair with the inclusive running total
+ * of both counters over all symbols up to and including that one. The second counter
+ * of each pair is read but left unchanged.
+ */
+static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t last_base = BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    sa_sint_t running = 0;
+
+    for (fast_sint_t base = BUCKETS_INDEX2(0, 0); base <= last_base; base += BUCKETS_INDEX2(1, 0)) {
+        running += buckets[base + BUCKETS_INDEX2(0, 0)] + buckets[base + BUCKETS_INDEX2(0, 1)];
+        buckets[base + BUCKETS_INDEX2(0, 0)] = running;
+    }
+}
+
+/*
+ * Repacks the 2k layout in place. First the leading entry of every symbol's pair is
+ * copied down so that buckets[0..k-1] holds one value per symbol. Then the second
+ * half is filled with the same values shifted by one symbol: buckets[k] becomes 0 and
+ * buckets[k + 1 .. 2k - 1] receives buckets[0 .. k - 2].
+ */
+static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t last_base = BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    fast_sint_t dst = 0;
+
+    for (fast_sint_t src = BUCKETS_INDEX2(0, 0); src <= last_base; src += BUCKETS_INDEX2(1, 0)) {
+        buckets[dst] = buckets[src];
+        dst += 1;
+    }
+
+    buckets[k] = 0;
+    memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
+}
+
+/*
+ * Converts the k counts in buckets into an exclusive prefix sum, in place: afterwards
+ * buckets[c] is the sum of the original counts of all symbols smaller than c.
+ */
+static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t symbols = (fast_sint_t)k;
+    sa_sint_t running = 0;
+
+    for (fast_sint_t c = 0; c < symbols; ++c) {
+        const sa_sint_t count = buckets[c];
+        buckets[c] = running;
+        running += count;
+    }
+}
+
+/*
+ * Converts the k counts in buckets into an inclusive prefix sum, in place: afterwards
+ * buckets[c] is the sum of the original counts of all symbols up to and including c.
+ */
+static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    const fast_sint_t symbols = (fast_sint_t)k;
+    sa_sint_t running = 0;
+
+    for (fast_sint_t c = 0; c < symbols; ++c) {
+        running += buckets[c];
+        buckets[c] = running;
+    }
+}
+
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix) { /*
+ * Prepares the bucket table for the radix sort of LMS suffixes of a byte text.
+ *
+ * Step 1 walks T from first_lms_suffix down to position 0 and decrements the
+ * BUCKETS_INDEX4(symbol, s & 3) counter of every position visited, removing the
+ * contribution of the prefix T[0..first_lms_suffix] from the counters.
+ *
+ * Step 2 fills the pairs stored from buckets[4 * ALPHABET_SIZE] on: for each symbol, the
+ * running total of counters 1 and 3 before the symbol goes to the second entry of the pair
+ * and the total including the symbol goes to the first entry.
+ *
+ * Returns the grand total accumulated in step 2.
+ */
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0;) { /* the parameter itself is used as the cursor */
+ c1 = c0;
+ c0 = T[first_lms_suffix];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; /* un-count the position to the right of the cursor */
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; /* position 0: a 0 bit is pushed, nothing precedes it */
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ fast_sint_t i, j;
+ sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+ i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; /* total before this symbol */
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ temp_bucket[j] = sum; /* total including this symbol */
+ }
+
+ return sum;
+ }
+}
+
+/*
+ * Prepares the 2k bucket table for the radix sort of LMS suffixes.
+ *
+ * The position first_lms_suffix is moved from the second counter of its symbol to
+ * the first one. Both counters of every symbol are then replaced by inclusive
+ * running totals: the first entry accumulates both counters, the second entry
+ * accumulates only the second counter.
+ */
+static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k,
+                                                                          sa_sint_t * RESTRICT buckets,
+                                                                          sa_sint_t first_lms_suffix) {
+    const sa_sint_t first_symbol = T[first_lms_suffix];
+
+    buckets[BUCKETS_INDEX2(first_symbol, 0)]++;
+    buckets[BUCKETS_INDEX2(first_symbol, 1)]--;
+
+    const fast_sint_t last_base = BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+    sa_sint_t total_all = 0;
+    sa_sint_t total_second = 0;
+
+    for (fast_sint_t base = BUCKETS_INDEX2(0, 0); base <= last_base; base += BUCKETS_INDEX2(1, 0)) {
+        total_all += buckets[base + BUCKETS_INDEX2(0, 0)] + buckets[base + BUCKETS_INDEX2(0, 1)];
+        total_second += buckets[base + BUCKETS_INDEX2(0, 1)];
+
+        buckets[base + BUCKETS_INDEX2(0, 0)] = total_all;
+        buckets[base + BUCKETS_INDEX2(0, 1)] = total_second;
+    }
+}
+
+static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T,
+ sa_sint_t k,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix) { /*
+ * Prepares the 6k bucket table for the radix sort of LMS suffixes of an integer text.
+ *
+ * Step 1 walks T from first_lms_suffix down to position 0 and decrements the
+ * BUCKETS_INDEX4(symbol, s & 3) counter of every position visited, removing the
+ * contribution of the prefix T[0..first_lms_suffix] from the counters.
+ *
+ * Step 2 writes, for each symbol c, the inclusive running total of counters 1 and 3 to
+ * buckets[4 * k + c].
+ *
+ * Returns the grand total accumulated in step 2.
+ */
+ {
+ fast_uint_t s = 0;
+ fast_sint_t c0 = T[first_lms_suffix];
+ fast_sint_t c1 = 0;
+
+ for (; --first_lms_suffix >= 0;) { /* the parameter itself is used as the cursor */
+ c1 = c0;
+ c0 = T[first_lms_suffix];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--; /* un-count the position to the right of the cursor */
+ }
+
+ buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--; /* position 0: a 0 bit is pushed, nothing precedes it */
+ }
+
+ {
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t i, j;
+ sa_sint_t sum = 0;
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+ i += BUCKETS_INDEX4(1, 0), j += 1) {
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
+ temp_bucket[j] = sum; /* total including symbol j */
+ }
+
+ return sum;
+ }
+}
+
+/*
+ * Prepares the 4k bucket table for radix sorting and the partial sorting that follows.
+ *
+ * The position first_lms_suffix is moved from the second counter of its symbol to
+ * the first one. Then, for each symbol c in increasing order:
+ *   - buckets[2 * k + c] receives the running total of both counters before c;
+ *   - the second counter of c is replaced by the inclusive running total of second
+ *     counters;
+ *   - buckets[3 * k + c] receives the running total of both counters including c.
+ * The first counter of each pair is read but not modified by the loop.
+ */
+static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k,
+                                                                            sa_sint_t * RESTRICT buckets,
+                                                                            sa_sint_t first_lms_suffix) {
+    sa_sint_t * RESTRICT first_slot = &buckets[2 * k];
+    sa_sint_t * RESTRICT past_slot = &buckets[3 * k];
+    const sa_sint_t first_symbol = T[first_lms_suffix];
+
+    buckets[BUCKETS_INDEX2(first_symbol, 0)]++;
+    buckets[BUCKETS_INDEX2(first_symbol, 1)]--;
+
+    sa_sint_t total_second = 0;
+    sa_sint_t total_all = 0;
+    fast_sint_t symbol = 0;
+    fast_sint_t base = BUCKETS_INDEX2(0, 0);
+
+    while (base <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0)) {
+        first_slot[symbol] = total_all;
+
+        total_second += buckets[base + BUCKETS_INDEX2(0, 1)];
+        total_all += buckets[base + BUCKETS_INDEX2(0, 0)] + buckets[base + BUCKETS_INDEX2(0, 1)];
+        buckets[base + BUCKETS_INDEX2(0, 1)] = total_second;
+
+        past_slot[symbol] = total_all;
+
+        base += BUCKETS_INDEX2(1, 0);
+        symbol += 1;
+    }
+}
+
+static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Distributes the suffix positions stored in SA[omp_block_start .. omp_block_start +
+ * omp_block_size - 1] into buckets keyed by their first character.
+ *
+ * The block is read from its last entry to its first. Each position p is written to
+ * SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]], i.e. the bucket cursor of character T[p]
+ * is pre-decremented and p is stored at the new cursor. Source and destination are the same
+ * array, so the statement order below must be preserved.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the SA lookahead reads inside the block */
+ prefetch(&SA[i - 2 * prefetch_distance]);
+
+ prefetch(&T[SA[i - prefetch_distance - 0]]);
+ prefetch(&T[SA[i - prefetch_distance - 1]]);
+ prefetch(&T[SA[i - prefetch_distance - 2]]);
+ prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; /* move the cursor down, then place p0 there */
+ sa_sint_t p1 = SA[i - 1];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ for (j -= prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ sa_sint_t p = SA[i];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+
+/*
+ * Driver for libsais_radix_sort_lms_suffixes_8u(). The thread count is fixed at one
+ * here, so the serial routine is run once over the m - 1 entries
+ * SA[n - m + 1 .. n - 1], using the cursors that start at buckets[4 * ALPHABET_SIZE].
+ * threads and thread_state are accepted but not used.
+ */
+static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    sa_sint_t * RESTRICT cursors = &buckets[4 * ALPHABET_SIZE];
+    const fast_sint_t first_entry = (fast_sint_t)n - (fast_sint_t)m + 1;
+    const fast_sint_t entry_count = (fast_sint_t)m - 1;
+
+    libsais_radix_sort_lms_suffixes_8u(T, SA, cursors, first_entry, entry_count);
+}
+
+static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Distributes the suffix positions stored in SA[omp_block_start .. omp_block_start +
+ * omp_block_size - 1] into buckets keyed by their first symbol (integer text, one cursor
+ * per symbol in induction_bucket).
+ *
+ * The block is read from its last entry to its first; each position p is written to
+ * SA[--induction_bucket[T[p]]]. Source and destination are the same array, so the statement
+ * order below must be preserved. Prefetching is two-staged: T entries are requested
+ * 2 * prefetch_distance ahead, bucket cursors prefetch_distance ahead.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the SA lookahead reads inside the block */
+ prefetch(&SA[i - 3 * prefetch_distance]);
+
+ prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
+ prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
+ prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
+ prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[--induction_bucket[T[p0]]] = p0; /* move the cursor down, then place p0 there */
+ sa_sint_t p1 = SA[i - 1];
+ SA[--induction_bucket[T[p1]]] = p1;
+ sa_sint_t p2 = SA[i - 2];
+ SA[--induction_bucket[T[p2]]] = p2;
+ sa_sint_t p3 = SA[i - 3];
+ SA[--induction_bucket[T[p3]]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ sa_sint_t p = SA[i];
+ SA[--induction_bucket[T[p]]] = p;
+ }
+}
+
+static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Distributes the suffix positions stored in SA[omp_block_start .. omp_block_start +
+ * omp_block_size - 1] into buckets keyed by their first symbol (integer text, cursors
+ * addressed through BUCKETS_INDEX2(symbol, 0)).
+ *
+ * The block is read from its last entry to its first; each position p is written to
+ * SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]]. Source and destination are the same
+ * array, so the statement order below must be preserved. Prefetching is two-staged: T
+ * entries are requested 2 * prefetch_distance ahead, bucket cursors prefetch_distance ahead.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { /* bound keeps the SA lookahead reads inside the block */
+ prefetch(&SA[i - 3 * prefetch_distance]);
+
+ prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
+
+ prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+ prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+ prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+ prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; /* move the cursor down, then place p0 there */
+ sa_sint_t p1 = SA[i - 1];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
+ sa_sint_t p2 = SA[i - 2];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
+ sa_sint_t p3 = SA[i - 3];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
+ }
+
+ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) { /* j is omp_block_start again: tail without prefetches */
+ sa_sint_t p = SA[i];
+ SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
+ }
+}
+/*
+ * Driver for libsais_radix_sort_lms_suffixes_32s_6k(): radix-sorts the m - 1 entries
+ * SA[n - m + 1 .. n - 1] using the cursors in induction_bucket.
+ *
+ * The serial routine is called unconditionally. This wrapper has no parallel code
+ * path, so gating the call on (threads == 1 || m < 65536) would return without
+ * sorting anything whenever threads > 1 and m >= 65536. For threads == 1, or for
+ * m < 65536, the behaviour is exactly that of the gated call.
+ *
+ * threads and thread_state are kept for signature compatibility and are not used.
+ */
+static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
+                                                       sa_sint_t threads,
+                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+                                           (fast_sint_t)m - 1);
+}
+
+/*
+ * Driver for libsais_radix_sort_lms_suffixes_32s_2k(): radix-sorts the m - 1 entries
+ * SA[n - m + 1 .. n - 1] using the cursors in induction_bucket.
+ *
+ * The serial routine is called unconditionally. This wrapper has no parallel code
+ * path, so gating the call on (threads == 1 || m < 65536) would return without
+ * sorting anything whenever threads > 1 and m >= 65536. For threads == 1, or for
+ * m < 65536, the behaviour is exactly that of the gated call.
+ *
+ * threads and thread_state are kept for signature compatibility and are not used.
+ */
+static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                       sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
+                                                       sa_sint_t threads,
+                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
+                                           (fast_sint_t)m - 1);
+}
+
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t * RESTRICT buckets) { /*
+ * Finds the LMS positions of the integer text T[0..n-1] and places them into their buckets
+ * in one right-to-left scan, with a single cursor per symbol in buckets.
+ *
+ * Every position that passes the (s & 3) == 1 test is stored at SA[--buckets[symbol]] and
+ * counted in m; c2 remembers the symbol of the most recently stored position. After the
+ * scan, if more than one position was stored, the slot of the last one stored (the leftmost
+ * in the text) is overwritten with 0.
+ *
+ * Returns m, the number of positions stored.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t i = n - 2;
+ sa_sint_t m = 0;
+ fast_uint_t s = 1;
+ fast_sint_t c0 = T[n - 1];
+ fast_sint_t c1 = 0;
+ fast_sint_t c2 = 0;
+
+ for (; i >= prefetch_distance + 3; i -= 4) { /* bound keeps the T[i - prefetch_distance - 3] lookahead read at index >= 0 */
+ prefetch(&T[i - 2 * prefetch_distance]);
+
+ prefetchw(&buckets[T[i - prefetch_distance - 0]]);
+ prefetchw(&buckets[T[i - prefetch_distance - 1]]);
+ prefetchw(&buckets[T[i - prefetch_distance - 2]]);
+ prefetchw(&buckets[T[i - prefetch_distance - 3]]);
+
+ c1 = T[i - 0];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); /* push 1 if T[i] > T[i+1], or equal with previous bit 1 */
+ if ((s & 3) == 1) {
+ SA[--buckets[c2 = c0]] = i + 1; /* place i + 1 in the bucket of its symbol c0 and remember that symbol */
+ m++;
+ }
+
+ c0 = T[i - 1];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) {
+ SA[--buckets[c2 = c1]] = i - 0;
+ m++;
+ }
+
+ c1 = T[i - 2];
+ s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) {
+ SA[--buckets[c2 = c0]] = i - 1;
+ m++;
+ }
+
+ c0 = T[i - 3];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) {
+ SA[--buckets[c2 = c1]] = i - 2;
+ m++;
+ }
+ }
+
+ for (; i >= 0; i -= 1) { /* tail without lookahead prefetches */
+ c1 = c0;
+ c0 = T[i];
+ s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
+ if ((s & 3) == 1) {
+ SA[--buckets[c2 = c1]] = i + 1;
+ m++;
+ }
+ }
+
+ if (m > 1) {
+ SA[buckets[c2]] = 0; /* buckets[c2] still points at the slot written last */
+ }
+
+ return m;
+}
+
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /* ORs SAINT_MIN into SA[induction_bucket[i]] for every i in [omp_block_start, omp_block_start + omp_block_size). */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop: four entries per iteration; j is pulled back by prefetch_distance + 3 so the look-ahead reads of induction_bucket stay inside the block. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetch(&induction_bucket[i + 2 * prefetch_distance]);
+
+ prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
+ prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
+ prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
+ prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);
+
+ SA[induction_bucket[i + 0]] |= SAINT_MIN;
+ SA[induction_bucket[i + 1]] |= SAINT_MIN;
+ SA[induction_bucket[i + 2]] |= SAINT_MIN;
+ SA[induction_bucket[i + 3]] |= SAINT_MIN;
+ }
+ /* Tail loop: j is restored to the true block end and the remaining entries are marked one at a time. */
+ for (j += prefetch_distance + 3; i < j; i += 1) {
+ SA[induction_bucket[i]] |= SAINT_MIN;
+ }
+}
+
+static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /* ORs SUFFIX_GROUP_MARKER into SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] for every i in [omp_block_start, omp_block_start + omp_block_size). */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop: four entries per iteration, with the bucket table and the targeted SA slots prefetched prefetch_distance entries ahead. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+ prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+ prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+ prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+ prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
+
+ SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
+ SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+ /* Tail loop: j is restored to the true block end and the remaining entries are marked one at a time. */
+ for (j += prefetch_distance + 3; i < j; i += 1) {
+ SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
+ }
+}
+
+/*
+ * Serial driver for libsais_radix_sort_set_markers_32s_6k: marks the entries
+ * addressed by induction_bucket indices 0 .. k - 2 in one call.
+ * `threads` is unused.
+ */
+static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                      sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
+    const fast_sint_t first_index = 0;
+    const fast_sint_t index_count = (fast_sint_t)k - 1;
+
+    (void)(threads);
+
+    libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, first_index, index_count);
+}
+
+/*
+ * Serial driver for libsais_radix_sort_set_markers_32s_4k: marks the entries
+ * addressed by bucket indices 0 .. k - 2 in one call.
+ * `threads` is unused.
+ */
+static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+                                                      sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
+    const fast_sint_t first_index = 0;
+    const fast_sint_t index_count = (fast_sint_t)k - 1;
+
+    (void)(threads);
+
+    libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, first_index, index_count);
+}
+
+/*
+ * Prepares the bucket tables for the partial-sorting passes (8-bit input).
+ *
+ * First bumps the counter at BUCKETS_INDEX4(T[first_lms_suffix], 1). Then, for
+ * each symbol in ascending order, it:
+ *   - stores the running total `running0` (seeded with left_suffixes_count + 1)
+ *     into the table at buckets[4 * ALPHABET_SIZE] before the symbol is added;
+ *   - adds the symbol's counters 0 and 2 to `running0` and counter 1 to
+ *     `running1`;
+ *   - writes both updated totals back into the 2-wide slots at the front of
+ *     `buckets`.
+ * The write cursor never runs ahead of the read cursor, so the in-place
+ * compaction only overwrites counters that were already consumed.
+ */
+static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t first_lms_suffix,
+                                                              sa_sint_t left_suffixes_count) {
+    sa_sint_t * RESTRICT start_table = &buckets[4 * ALPHABET_SIZE];
+    sa_sint_t running0 = left_suffixes_count + 1;
+    sa_sint_t running1 = 0;
+    fast_sint_t src = BUCKETS_INDEX4(0, 0);
+    fast_sint_t dst = BUCKETS_INDEX2(0, 0);
+
+    buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
+
+    while (src <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0)) {
+        start_table[dst + BUCKETS_INDEX2(0, 0)] = running0;
+
+        running0 += buckets[src + BUCKETS_INDEX4(0, 0)] + buckets[src + BUCKETS_INDEX4(0, 2)];
+        running1 += buckets[src + BUCKETS_INDEX4(0, 1)];
+
+        buckets[dst + BUCKETS_INDEX2(0, 0)] = running0;
+        buckets[dst + BUCKETS_INDEX2(0, 1)] = running1;
+
+        src += BUCKETS_INDEX4(1, 0);
+        dst += BUCKETS_INDEX2(1, 0);
+    }
+}
+
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix,
+ sa_sint_t left_suffixes_count) { /* Rewrites each symbol's four counters in place as running totals and records two running sums per symbol in the table at buckets[4 * k]. */
+ sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+ /* sum0 starts at left_suffixes_count + 1. The parameter first_lms_suffix is reused below to hold the symbol T[first_lms_suffix]. */
+ fast_sint_t i, j;
+ sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
+ for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
+ i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0);
+ i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { /* first pass: symbols below T[first_lms_suffix] */
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; /* all four counters are read before any slot is overwritten */
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; /* totals as they stood before this symbol was added */
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL;
+ sum1 += LS;
+ sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; /* totals including this symbol */
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+ /* Second pass: same per-symbol work for the remaining symbols up to k - 1, after sum1 is bumped by one. */
+ for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
+ sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
+ sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
+ sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
+ sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
+
+ buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
+ buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
+ buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
+ buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
+
+ sum0 += SS + SL;
+ sum1 += LS;
+ sum2 += LS + LL;
+
+ temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
+ temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
+ }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in ascending order, appending p - 1 for each entry p to the bucket chosen from T[p - 1] and T[p - 2]; returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Two views into buckets: per-bucket insertion cursors at offset 4 * ALPHABET_SIZE and the last d value seen per bucket at offset 2 * ALPHABET_SIZE. */
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+ /* Unrolled loop: two entries per iteration; j is pulled back so the look-ahead reads SA[i + prefetch_distance + 1] stay inside the block. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
+ prefetch(&SA[i + 2 * prefetch_distance]);
+
+ prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
+ prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i + 0];
+ d += (p0 < 0); /* a set sign bit on the entry advances d */
+ p0 &= SAINT_MAX; /* strip the sign bit to recover the position */
+ sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+ SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); /* sign bit set when this bucket last saw a different d */
+ distinct_names[v0] = d;
+
+ sa_sint_t p1 = SA[i + 1];
+ d += (p1 < 0);
+ p1 &= SAINT_MAX;
+ sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+ SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ distinct_names[v1] = d;
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time. */
+ for (j += prefetch_distance + 1; i < j; i += 1) {
+ sa_sint_t p = SA[i];
+ d += (p < 0);
+ p &= SAINT_MAX;
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+ distinct_names[v] = d;
+ }
+
+ return d;
+}
+/*
+ * Serial driver for the left-to-right partial-sorting scan (8-bit input).
+ *
+ * Seeds the scan by appending position n - 1 (with SAINT_MIN set) to the bucket
+ * selected from T[n - 1] and T[n - 2], records the pre-incremented d for that
+ * bucket, then scans SA[0 .. left_suffixes_count) and returns the updated d.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, with only the seed written, whenever
+ * threads != 1 and left_suffixes_count >= 65536. `threads` and `thread_state`
+ * are accepted only to keep the signature stable.
+ */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                   sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                   sa_sint_t left_suffixes_count, sa_sint_t d,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+    return libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in ascending order; each entry p appends p - 1 at the cursor buckets[v], where v is the BUCKETS_INDEX4 slot built from T[p - 1] and T[p - 2]. Returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop with a two-stage look-ahead: T is prefetched 2 * prefetch_distance entries ahead and the bucket slot prefetch_distance entries ahead. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
+ prefetch(&SA[i + 3 * prefetch_distance]);
+
+ prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
+ prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX;
+ sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); /* (p0 > 0) keeps the look-ahead read of T at a non-negative index when p0 is 0 */
+ prefetchw(&buckets[v0]);
+ sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX;
+ sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
+ prefetchw(&buckets[v1]);
+
+ sa_sint_t p2 = SA[i + 0];
+ d += (p2 < 0); /* a set sign bit on the entry advances d */
+ p2 &= SAINT_MAX;
+ sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
+ SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); /* buckets[2 + v2] holds the d last recorded for this slot */
+ buckets[2 + v2] = d;
+
+ sa_sint_t p3 = SA[i + 1];
+ d += (p3 < 0);
+ p3 &= SAINT_MAX;
+ sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
+ SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+ buckets[2 + v3] = d;
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time, without prefetching. */
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+ sa_sint_t p = SA[i];
+ d += (p < 0);
+ p &= SAINT_MAX;
+ sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
+ SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ buckets[2 + v] = d;
+ }
+
+ return d;
+}
+
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in ascending order; every positive entry p is consumed (its slot zeroed) and p - 1 is appended at the cursor for symbol T[p - 1]. Returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Two views into buckets: insertion cursors indexed by symbol at offset 2 * k, and the last d value per BUCKETS_INDEX2 slot at offset 0. */
+ sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+ /* Unrolled loop with a two-stage look-ahead; the NULL fallback and the s > 0 guards keep the look-ahead from touching T for non-positive entries. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+ const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+ const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+ if (s2 > 0) {
+ const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
+ prefetchw(&induction_bucket[Ts2]);
+ prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
+ }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+ if (s3 > 0) {
+ const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
+ prefetchw(&induction_bucket[Ts3]);
+ prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
+ }
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 & SAINT_MAX; /* non-positive entries stay in place with the sign bit cleared */
+ if (p0 > 0) {
+ SA[i + 0] = 0; /* positive entries are consumed */
+ d += (p0 >> (SUFFIX_GROUP_BIT - 1)); /* adds the bits of p0 from SUFFIX_GROUP_BIT - 1 upward — presumably just the group-marker bit; confirm against the SUFFIX_GROUP_* definitions */
+ p0 &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
+ SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); /* sign bit: T[p0 - 2] < T[p0 - 1]; group bit: slot v0 last saw a different d */
+ distinct_names[v0] = d;
+ }
+
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ SA[i + 1] = 0;
+ d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+ p1 &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
+ SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
+ distinct_names[v1] = d;
+ }
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time, without prefetching. */
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ SA[i] = 0;
+ d += (p >> (SUFFIX_GROUP_BIT - 1));
+ p &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
+ SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+ distinct_names[v] = d;
+ }
+ }
+
+ return d;
+}
+
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in ascending order; every positive entry p is consumed (its slot zeroed) and p - 1 is appended at induction_bucket[T[p - 1]], with the sign bit set when T[p - 2] < T[p - 1]. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop with a two-stage look-ahead; the NULL fallback and the s > 0 guards keep the look-ahead from touching T for non-positive entries. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+ const sa_sint_t * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+ const sa_sint_t * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+ if (s2 > 0) {
+ prefetchw(&induction_bucket[T[s2 - 1]]);
+ prefetch(&T[s2] - 2);
+ }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+ if (s3 > 0) {
+ prefetchw(&induction_bucket[T[s3 - 1]]);
+ prefetch(&T[s3] - 2);
+ }
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 & SAINT_MAX; /* non-positive entries stay in place with the sign bit cleared */
+ if (p0 > 0) {
+ SA[i + 0] = 0; /* positive entries are consumed */
+ SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
+ }
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ SA[i + 1] = 0;
+ SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
+ }
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time, without prefetching. */
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ SA[i] = 0;
+ SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Serial driver for the left-to-right partial-sorting scan (32-bit input,
+ * 6k-bucket layout).
+ *
+ * Seeds the scan by appending position n - 1 (with SAINT_MIN set) at the cursor
+ * of the BUCKETS_INDEX4 slot selected from T[n - 1] and T[n - 2], records the
+ * pre-incremented d two entries after that cursor, then scans
+ * SA[0 .. left_suffixes_count) and returns the updated d.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, with only the seed written, whenever
+ * threads != 1 and left_suffixes_count >= 65536. `threads` and `thread_state`
+ * are accepted only to keep the signature stable.
+ */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
+    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
+
+    return libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
+}
+
+/*
+ * Serial driver for the left-to-right partial-sorting scan (32-bit input,
+ * 4k-bucket layout).
+ *
+ * Seeds the scan by appending position n - 1 at the cursor for symbol T[n - 1]
+ * (sign bit set when T[n - 2] < T[n - 1], SUFFIX_GROUP_MARKER always set),
+ * records the pre-incremented d for the matching BUCKETS_INDEX2 slot, then
+ * scans SA[0 .. n) and returns the updated d.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, with only the seed written, whenever
+ * threads != 1 and n >= 65536. `threads` and `thread_state` are accepted only
+ * to keep the signature stable.
+ */
+static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t d, sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
+    sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[T[n - 1]]++] =
+        (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
+    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
+
+    return libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
+}
+
+/*
+ * Serial driver for the left-to-right partial-sorting scan (32-bit input,
+ * 1k-bucket layout).
+ *
+ * Seeds the scan by appending position n - 1 at the cursor for symbol T[n - 1]
+ * (sign bit set when T[n - 2] < T[n - 1]), then scans SA[0 .. n).
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, with only the seed written, whenever
+ * threads != 1 and n >= 65536. `threads` and `thread_state` are accepted only
+ * to keep the signature stable.
+ */
+static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+    libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
+}
+
+static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ const sa_sint_t * RESTRICT buckets, sa_sint_t threads) { /* For each bucket range, walked from high index to low, moves every entry's sign bit onto the next entry visited; the first entry visited receives a set bit. */
+ const fast_sint_t prefetch_distance = 32;
+ /* temp_bucket supplies the upper end of each range (exclusive); buckets supplies the lower end (inclusive). */
+ const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
+
+ fast_sint_t c;
+ (void)(threads);
+ (void)(n);
+ /* One range per BUCKETS_INDEX2 step, from the last symbol down to symbol 1; the carried bit s is reset to SAINT_MIN at the start of each range. */
+ for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) {
+ fast_sint_t i, j;
+ sa_sint_t s = SAINT_MIN;
+ for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j;
+ i -= 4) {
+ prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; /* q0 is non-zero only when the entry's sign bit differs from the carried bit */
+ s = s ^ q0; /* carry becomes this entry's original sign bit */
+ SA[i - 0] = p0 ^ q0; /* entry takes the previously carried bit */
+ sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+ s = s ^ q1;
+ SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+ s = s ^ q2;
+ SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+ s = s ^ q3;
+ SA[i - 3] = p3 ^ q3;
+ }
+ /* Tail loop: j is restored to the range's lower end and the remaining entries are handled one at a time. */
+ for (j -= 3; i >= j; i -= 1) {
+ sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+ s = s ^ q;
+ SA[i] = p ^ q;
+ }
+ }
+}
+
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+ const sa_sint_t * RESTRICT buckets, sa_sint_t threads) { /* For each symbol's range, walked from high index to low, moves every entry's sign bit onto the next entry visited; the first entry visited receives a set bit. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Here buckets supplies the upper end of each range (exclusive) and temp_bucket the lower end (inclusive). */
+ const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
+
+ fast_sint_t c;
+ (void)(threads);
+ /* One range per symbol, from k - 1 down to 1; the carried bit s is reset to SAINT_MIN at the start of each range. */
+ for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) {
+ fast_sint_t i, j;
+ sa_sint_t s = SAINT_MIN;
+ for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1,
+ j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3;
+ i >= j; i -= 4) {
+ prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; /* q0 is non-zero only when the entry's sign bit differs from the carried bit */
+ s = s ^ q0; /* carry becomes this entry's original sign bit */
+ SA[i - 0] = p0 ^ q0; /* entry takes the previously carried bit */
+ sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
+ s = s ^ q1;
+ SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
+ s = s ^ q2;
+ SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
+ s = s ^ q3;
+ SA[i - 3] = p3 ^ q3;
+ }
+ /* Tail loop: j is restored to the range's lower end and the remaining entries are handled one at a time. */
+ for (j -= 3; i >= j; i -= 1) {
+ sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
+ s = s ^ q;
+ SA[i] = p ^ q;
+ }
+ }
+}
+
+static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) { /* Walks SA[n - 1 .. 0] and, among the positive entries only, moves each entry's SUFFIX_GROUP_MARKER bit onto the next positive entry visited; the first positive entry visited receives a set bit. */
+ const fast_sint_t prefetch_distance = 32;
+ /* s carries the marker bit; the (p > 0) mask forces q to 0 for non-positive entries, so they neither change nor update the carry. */
+ fast_sint_t i;
+ sa_sint_t s = SUFFIX_GROUP_MARKER;
+ for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
+ prefetchw(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0],
+ q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ s = s ^ q0; /* carry becomes this entry's original marker bit (unchanged when q0 is 0) */
+ SA[i - 0] = p0 ^ q0; /* entry takes the previously carried bit */
+ sa_sint_t p1 = SA[i - 1],
+ q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ s = s ^ q1;
+ SA[i - 1] = p1 ^ q1;
+ sa_sint_t p2 = SA[i - 2],
+ q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ s = s ^ q2;
+ SA[i - 2] = p2 ^ q2;
+ sa_sint_t p3 = SA[i - 3],
+ q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ s = s ^ q3;
+ SA[i - 3] = p3 ^ q3;
+ }
+ /* Tail loop: the remaining low-index entries are handled one at a time. */
+ for (; i >= 0; i -= 1) {
+ sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ s = s ^ q;
+ SA[i] = p ^ q;
+ }
+}
+
+/*
+ * Copies the two saved values per symbol from the table at buckets[4 * k] back
+ * into slots 0 and 1 of that symbol's four-wide group at the front of
+ * `buckets`. Symbols are processed in ascending order, 0 .. k - 1.
+ */
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+    sa_sint_t * RESTRICT saved = &buckets[4 * k];
+    fast_sint_t pos = BUCKETS_INDEX2(0, 0);
+
+    while (pos <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0)) {
+        buckets[2 * pos + BUCKETS_INDEX4(0, 0)] = saved[pos + BUCKETS_INDEX2(0, 0)];
+        buckets[2 * pos + BUCKETS_INDEX4(0, 1)] = saved[pos + BUCKETS_INDEX2(0, 1)];
+        pos += BUCKETS_INDEX2(1, 0);
+    }
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in descending order, placing p - 1 for each entry p just below the cursor of the bucket chosen from T[p - 1] and T[p - 2]; returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Two views into buckets: per-bucket insertion cursors at offset 0 and the last d value seen per bucket at offset 2 * ALPHABET_SIZE. */
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
+ /* Unrolled loop: two entries per iteration; j is pushed up so the look-behind reads SA[i - prefetch_distance - 1] stay inside the block. */
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
+ prefetch(&SA[i - 2 * prefetch_distance]);
+
+ prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
+ prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i - 0];
+ d += (p0 < 0); /* a set sign bit on the entry advances d */
+ p0 &= SAINT_MAX; /* strip the sign bit to recover the position */
+ sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+ SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); /* sign bit set when this bucket last saw a different d */
+ distinct_names[v0] = d;
+
+ sa_sint_t p1 = SA[i - 1];
+ d += (p1 < 0);
+ p1 &= SAINT_MAX;
+ sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+ SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ distinct_names[v1] = d;
+ }
+ /* Tail loop: j is restored to the block start and the remaining entries are handled one at a time. */
+ for (j -= prefetch_distance + 1; i >= j; i -= 1) {
+ sa_sint_t p = SA[i];
+ d += (p < 0);
+ p &= SAINT_MAX;
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
+ distinct_names[v] = d;
+ }
+
+ return d;
+}
+/*
+ * Serial driver for the right-to-left partial-sorting scan (8-bit input).
+ *
+ * Scans the SA range [left_suffixes_count + 1, n - first_lms_suffix) with
+ * libsais_partial_sorting_scan_right_to_left_8u; the counter it returns is
+ * discarded.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped entirely whenever threads != 1 and
+ * the range held 65536 or more entries. `threads` and `thread_state` are
+ * accepted only to keep the signature stable.
+ */
+static void libsais_partial_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                              sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                              sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+                                                              sa_sint_t d, sa_sint_t threads,
+                                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in descending order; each entry p places p - 1 just below the cursor buckets[v], where v is the BUCKETS_INDEX4 slot built from T[p - 1] and T[p - 2]. Returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop with a two-stage look-behind: T is prefetched 2 * prefetch_distance entries back and the bucket slot prefetch_distance entries back. */
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
+ prefetch(&SA[i - 3 * prefetch_distance]);
+
+ prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
+ prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
+
+ sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX;
+ sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); /* (p0 > 0) keeps the look-behind read of T at a non-negative index when p0 is 0 */
+ prefetchw(&buckets[v0]);
+ sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX;
+ sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
+ prefetchw(&buckets[v1]);
+
+ sa_sint_t p2 = SA[i - 0];
+ d += (p2 < 0); /* a set sign bit on the entry advances d */
+ p2 &= SAINT_MAX;
+ sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
+ SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); /* buckets[2 + v2] holds the d last recorded for this slot */
+ buckets[2 + v2] = d;
+
+ sa_sint_t p3 = SA[i - 1];
+ d += (p3 < 0);
+ p3 &= SAINT_MAX;
+ sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
+ SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+ buckets[2 + v3] = d;
+ }
+ /* Tail loop: j is restored to the block start and the remaining entries are handled one at a time, without prefetching. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+ sa_sint_t p = SA[i];
+ d += (p < 0);
+ p &= SAINT_MAX;
+ sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ buckets[2 + v] = d;
+ }
+
+ return d;
+}
+
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in descending order; every positive entry p is consumed (its slot zeroed) and p - 1 is placed just below the cursor for symbol T[p - 1]. Non-positive entries are left untouched. Returns the updated counter d. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Two views into buckets: insertion cursors indexed by symbol at offset 3 * k, and the last d value per BUCKETS_INDEX2 slot at offset 0. */
+ sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
+ sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
+ /* Unrolled loop with a two-stage look-behind; the NULL fallback and the s > 0 guards keep the look-behind from touching T for non-positive entries. */
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
+ prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+ const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+ const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+ if (s2 > 0) {
+ const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
+ prefetchw(&induction_bucket[Ts2]);
+ prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
+ }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+ if (s3 > 0) {
+ const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
+ prefetchw(&induction_bucket[Ts3]);
+ prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
+ }
+
+ sa_sint_t p0 = SA[i - 0];
+ if (p0 > 0) {
+ SA[i - 0] = 0; /* positive entries are consumed */
+ d += (p0 >> (SUFFIX_GROUP_BIT - 1)); /* adds the bits of p0 from SUFFIX_GROUP_BIT - 1 upward — presumably just the group-marker bit; confirm against the SUFFIX_GROUP_* definitions */
+ p0 &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+ SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); /* sign bit: T[p0 - 2] > T[p0 - 1]; group bit: slot v0 last saw a different d */
+ distinct_names[v0] = d;
+ }
+
+ sa_sint_t p1 = SA[i - 1];
+ if (p1 > 0) {
+ SA[i - 1] = 0;
+ d += (p1 >> (SUFFIX_GROUP_BIT - 1));
+ p1 &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+ SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
+ distinct_names[v1] = d;
+ }
+ }
+ /* Tail loop: j is restored to the block start and the remaining entries are handled one at a time, without prefetching. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+ sa_sint_t p = SA[i];
+ if (p > 0) {
+ SA[i] = 0;
+ d += (p >> (SUFFIX_GROUP_BIT - 1));
+ p &= ~SUFFIX_GROUP_MARKER;
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+ distinct_names[v] = d;
+ }
+ }
+
+ return d;
+}
+
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /* Walks SA[omp_block_start .. omp_block_start + omp_block_size) in descending order; every positive entry p is consumed (its slot zeroed) and p - 1 is placed just below the cursor induction_bucket[T[p - 1]], with the sign bit set when T[p - 2] > T[p - 1]. Non-positive entries are left untouched. */
+ const fast_sint_t prefetch_distance = 32;
+ /* Unrolled loop with a two-stage look-behind; the NULL fallback and the s > 0 guards keep the look-behind from touching T for non-positive entries. */
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
+ prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+ const sa_sint_t * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+ const sa_sint_t * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+ if (s2 > 0) {
+ prefetchw(&induction_bucket[T[s2 - 1]]);
+ prefetch(&T[s2] - 2);
+ }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+ if (s3 > 0) {
+ prefetchw(&induction_bucket[T[s3 - 1]]);
+ prefetch(&T[s3] - 2);
+ }
+
+ sa_sint_t p0 = SA[i - 0];
+ if (p0 > 0) {
+ SA[i - 0] = 0; /* positive entries are consumed */
+ SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+ }
+ sa_sint_t p1 = SA[i - 1];
+ if (p1 > 0) {
+ SA[i - 1] = 0;
+ SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
+ }
+ }
+ /* Tail loop: j is restored to the block start and the remaining entries are handled one at a time, without prefetching. */
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
+ sa_sint_t p = SA[i];
+ if (p > 0) {
+ SA[i] = 0;
+ SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Serial driver for the right-to-left partial-sorting scan (32-bit input,
+ * 6k-bucket layout).
+ *
+ * Scans the SA range [left_suffixes_count + 1, n - first_lms_suffix) with
+ * libsais_partial_sorting_scan_right_to_left_32s_6k and returns the counter it
+ * produces.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, returning d unchanged, whenever
+ * threads != 1 and the range held 65536 or more entries. `threads` and
+ * `thread_state` are accepted only to keep the signature stable.
+ */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+    const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
+    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    return libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
+}
+
+/*
+ * Serial driver for the right-to-left partial-sorting scan (32-bit input,
+ * 4k-bucket layout).
+ *
+ * Scans SA[0 .. n) with libsais_partial_sorting_scan_right_to_left_32s_4k and
+ * returns the counter it produces.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped, returning d unchanged, whenever
+ * threads != 1 and n >= 65536. `threads` and `thread_state` are accepted only
+ * to keep the signature stable.
+ */
+static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T,
+                                                                       sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                       sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                                       sa_sint_t d, sa_sint_t threads,
+                                                                       LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    return libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
+}
+
+/*
+ * Serial driver for the right-to-left partial-sorting scan (32-bit input,
+ * 1k-bucket layout).
+ *
+ * Scans SA[0 .. n) with libsais_partial_sorting_scan_right_to_left_32s_1k.
+ *
+ * No parallel branch exists in this function, so the scan runs for every value
+ * of `threads`. Previously it was skipped entirely whenever threads != 1 and
+ * n >= 65536. `threads` and `thread_state` are accepted only to keep the
+ * signature stable.
+ */
+static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
+}
+
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Compacts the negative entries of SA[omp_block_start .. omp_block_start + omp_block_size) to the front of the block, storing each as (s - SUFFIX_GROUP_MARKER) & ~SUFFIX_GROUP_MARKER, and returns the index one past the last kept entry. */
+ const fast_sint_t prefetch_distance = 32;
+ /* i is the read cursor and l the write cursor (l <= i). Every entry is written to SA[l], but l only advances for negative entries, so the others are overwritten by the next write. */
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
+ prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0];
+ SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+ l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1];
+ SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+ l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2];
+ SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+ l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3];
+ SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+ l += (s3 < 0);
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time. */
+ for (j += 3; i < j; i += 1) {
+ sa_sint_t s = SA[i];
+ SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
+ l += (s < 0);
+ }
+
+ return l;
+}
+
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /* Compacts the negative entries of SA[omp_block_start .. omp_block_start + omp_block_size) to the front of the block with their sign bit cleared, and returns the index one past the last kept entry. */
+ const fast_sint_t prefetch_distance = 32;
+ /* i is the read cursor and l the write cursor (l <= i). Every entry is written to SA[l], but l only advances for negative entries, so the others are overwritten by the next write. */
+ fast_sint_t i, j, l;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
+ prefetch(&SA[i + prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 0];
+ SA[l] = s0 & SAINT_MAX;
+ l += (s0 < 0);
+ sa_sint_t s1 = SA[i + 1];
+ SA[l] = s1 & SAINT_MAX;
+ l += (s1 < 0);
+ sa_sint_t s2 = SA[i + 2];
+ SA[l] = s2 & SAINT_MAX;
+ l += (s2 < 0);
+ sa_sint_t s3 = SA[i + 3];
+ SA[l] = s3 & SAINT_MAX;
+ l += (s3 < 0);
+ }
+ /* Tail loop: j is restored to the block end and the remaining entries are handled one at a time. */
+ for (j += 3; i < j; i += 1) {
+ sa_sint_t s = SA[i];
+ SA[l] = s & SAINT_MAX;
+ l += (s < 0);
+ }
+
+ return l;
+}
+
+/*
+ * Serial driver for libsais_partial_sorting_gather_lms_suffixes_32s_4k.
+ *
+ * With a single worker the block covers the whole array, so the gather runs
+ * once over SA[0 .. n). The worker's return value is not used. `threads` and
+ * `thread_state` are unused.
+ */
+static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = (fast_sint_t)n - block_start;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, block_start, block_size);
+}
+
+/*
+ * Single-block driver for libsais_partial_sorting_gather_lms_suffixes_32s_1k:
+ * the whole range SA[0 .. n) is handed to the worker as one block and the
+ * worker's return value is discarded. `threads` and `thread_state` are
+ * accepted but not used.
+ */
+static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)n;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, block_begin, block_length);
+}
+
+/*
+ * Partial-order induction for 8-bit input. In order:
+ *   1. zero buckets[2 * ALPHABET_SIZE .. 4 * ALPHABET_SIZE),
+ *   2. run the left-to-right partial sorting scan,
+ *   3. shift the markers,
+ *   4. run the right-to-left partial sorting scan, which receives the value
+ *      returned by step 2.
+ */
+static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
+                                                sa_sint_t left_suffixes_count, sa_sint_t threads,
+                                                LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t left_scan_result;
+
+    memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+
+    left_scan_result = libsais_partial_sorting_scan_left_to_right_8u_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+
+    libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
+
+    libsais_partial_sorting_scan_right_to_left_8u_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, left_scan_result, threads, thread_state);
+}
+
+/*
+ * Partial-order induction for 32-bit input, 6k bucket layout. In order:
+ * left-to-right scan, marker shift, bucket shift, then the right-to-left
+ * scan, which receives the value returned by the left-to-right scan.
+ */
+static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets,
+                                                    sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+                                                    sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t left_scan_result;
+
+    left_scan_result = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
+        T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
+
+    libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
+    libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
+
+    libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
+        T, SA, n, buckets, first_lms_suffix, left_suffixes_count, left_scan_result, threads, thread_state);
+}
+
+/*
+ * Partial-order induction for 32-bit input, 4k bucket layout. In order:
+ * zero the first 2 * k bucket entries, left-to-right scan, marker shift,
+ * right-to-left scan (fed with the left-to-right scan's return value), and
+ * finally the in-place gather over SA[0 .. n).
+ */
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t left_scan_result;
+
+    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
+
+    left_scan_result =
+        libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
+
+    libsais_partial_sorting_shift_markers_32s_4k(SA, n);
+
+    libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, left_scan_result, threads,
+                                                          thread_state);
+
+    libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+/*
+ * Partial-order induction for 32-bit input, 2k bucket layout. The
+ * left-to-right scan uses the bucket entries starting at index k, the
+ * right-to-left scan uses the entries starting at index 0, and the result
+ * is then gathered in place over SA[0 .. n).
+ */
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * const second_bucket_group = buckets + k;
+    sa_sint_t * const first_bucket_group = buckets;
+
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, second_bucket_group, threads, thread_state);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, first_bucket_group, threads, thread_state);
+
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+/*
+ * Partial-order induction for 32-bit input with a single k-entry bucket
+ * array. Because only one bucket array is available, the suffix counts are
+ * recomputed before each pass: once to derive bucket starts for the
+ * left-to-right scan and once to derive bucket ends for the right-to-left
+ * scan. The result is then gathered in place over SA[0 .. n).
+ */
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                    sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    /* Pass 1: bucket starts, then scan left to right. */
+    libsais_count_suffixes_32s(T, n, k, buckets);
+    libsais_initialize_buckets_start_32s_1k(k, buckets);
+    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+    /* Pass 2: bucket ends, then scan right to left. */
+    libsais_count_suffixes_32s(T, n, k, buckets);
+    libsais_initialize_buckets_end_32s_1k(k, buckets);
+    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
+
+    /* Compact the marked entries. */
+    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+/*
+ * Walks SA[omp_block_start .. omp_block_start + omp_block_size) and, for
+ * each entry p, stores (name | SAINT_MIN) into SAm[(p & SAINT_MAX) >> 1],
+ * where SAm = &SA[m]. After each store, `name` is incremented when p is
+ * negative.
+ *
+ * The first loop handles four entries per iteration and prefetches the SAm
+ * slots addressed by the entries prefetch_distance ahead; it stops early
+ * enough that those look-ahead reads stay inside the block. The second loop
+ * handles whatever remains, without prefetching.
+ *
+ * Returns the updated value of `name`.
+ */
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetch(&SA[i + 2 * prefetch_distance]);
+
+ prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ sa_sint_t p0 = SA[i + 0];
+ SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+ name += p0 < 0; /* the next name starts after a negative entry */
+ sa_sint_t p1 = SA[i + 1];
+ SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+ name += p1 < 0;
+ sa_sint_t p2 = SA[i + 2];
+ SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+ name += p2 < 0;
+ sa_sint_t p3 = SA[i + 3];
+ SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN;
+ name += p3 < 0;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1) { /* j is now the end of the block */
+ sa_sint_t p = SA[i];
+ SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN;
+ name += p < 0;
+ }
+
+ return name;
+}
+/*
+ * Scans SA[m + omp_block_start .. m + omp_block_start + omp_block_size)
+ * from the highest index down and compacts the negative entries, with their
+ * top bit cleared (s & SAINT_MAX), into the slots that end just below index
+ * `l` (the value passed in).
+ *
+ * Every entry is stored at the write cursor, but the cursor only moves down
+ * when the entry was negative, so values stored for non-negative entries
+ * are replaced by the next store.
+ *
+ * The first loop handles four entries per iteration; the second loop
+ * handles the 0-3 entries that remain.
+ *
+ * Returns the index of the lowest slot that holds a kept entry (equal to
+ * the incoming `l` when nothing was kept).
+ */
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ l -= 1; /* l now names the next slot to write */
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
+ i -= 4) {
+ prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 0];
+ SA[l] = s0 & SAINT_MAX;
+ l -= s0 < 0;
+ sa_sint_t s1 = SA[i - 1];
+ SA[l] = s1 & SAINT_MAX;
+ l -= s1 < 0;
+ sa_sint_t s2 = SA[i - 2];
+ SA[l] = s2 & SAINT_MAX;
+ l -= s2 < 0;
+ sa_sint_t s3 = SA[i - 3];
+ SA[l] = s3 & SAINT_MAX;
+ l -= s3 < 0;
+ }
+
+ for (j -= 3; i >= j; i -= 1) { /* j is now the first index of the block */
+ sa_sint_t s = SA[i];
+ SA[l] = s & SAINT_MAX;
+ l -= s < 0;
+ }
+
+ l += 1; /* back to the lowest slot actually kept */
+
+ return l;
+}
+
+/*
+ * Single-block driver for libsais_renumber_lms_suffixes_8u: processes
+ * SA[0 .. m) as one block with an initial name of 0 and returns the final
+ * name reported by the worker. `threads` and `thread_state` are accepted
+ * but not used.
+ */
+static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)m;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    return libsais_renumber_lms_suffixes_8u(SA, m, 0, block_begin, block_length);
+}
+
+/*
+ * Single-block driver for libsais_gather_marked_suffixes_8u: scans the
+ * n / 2 entries that start at SA[m] and compacts the marked ones into the
+ * slots ending just below index n + fs. The worker's return value is
+ * discarded. `threads` and `thread_state` are accepted but not used.
+ */
+static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+                                                      sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)n >> 1;
+    const fast_sint_t destination_end = (fast_sint_t)n + (fast_sint_t)fs;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_gather_marked_suffixes_8u(SA, m, destination_end, block_begin, block_length);
+}
+
+/*
+ * Zeroes the n / 2 entries starting at SA[m], renumbers the first m entries
+ * of SA, and returns the resulting name count.
+ *
+ * When the count is smaller than m, the marked entries are gathered;
+ * otherwise the top bit is simply cleared on SA[0 .. m).
+ */
+static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                                 sa_sint_t fs, sa_sint_t threads,
+                                                                 LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t name;
+
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+    name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+
+    if (name >= m) {
+        /* Nothing to gather: just strip the top bit from the first m entries. */
+        fast_sint_t pos;
+        for (pos = 0; pos < m; ++pos) {
+            SA[pos] = SA[pos] & SAINT_MAX;
+        }
+        return name;
+    }
+
+    libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+    return name;
+}
+/*
+ * Walks SA[omp_block_start .. omp_block_start + omp_block_size). For each
+ * entry p it
+ *   - clears the top bit of the entry in place (SA[i] = p & SAINT_MAX),
+ *   - stores name | (p & prev & SAINT_MIN) into SAm[(p & SAINT_MAX) >> 1],
+ *     where SAm = &SA[m] and prev is the original value of the previous
+ *     entry (0 before the first one), so the stored value carries the top
+ *     bit only when both p and prev were negative,
+ *   - increments `name` when p was negative.
+ *
+ * p3 carries the previous entry from one unrolled iteration to the next and
+ * into the tail loop, so the statement order must not be changed.
+ *
+ * Returns the updated value of `name`.
+ */
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ fast_sint_t i, j;
+ sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+
+ p0 = SA[i + 0];
+ SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); /* p3 is the previous entry here */
+ name += p0 < 0;
+ p1 = SA[i + 1];
+ SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN);
+ name += p1 < 0;
+ p2 = SA[i + 2];
+ SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN);
+ name += p2 < 0;
+ p3 = SA[i + 3];
+ SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+ name += p3 < 0;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1) { /* j is now the end of the block */
+ p2 = p3;
+ p3 = SA[i];
+ SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
+ name += p3 < 0;
+ }
+
+ return name;
+}
+/*
+ * Walks SA[m + omp_block_start .. m + omp_block_start + omp_block_size) and
+ * rewrites each entry p as p & (prev | SAINT_MAX): the top bit of p is
+ * cleared unless prev also has its top bit set.
+ *
+ * prev is the most recent non-zero entry seen so far (0 before the first
+ * one); entries equal to 0 leave prev unchanged. p3 carries prev from one
+ * unrolled iteration to the next and into the tail loop, so the statement
+ * order must not be changed.
+ */
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ sa_sint_t p0, p1, p2, p3 = 0;
+ for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j;
+ i += 4) {
+ prefetchw(&SA[i + prefetch_distance]);
+
+ p0 = SA[i + 0];
+ SA[i + 0] = p0 & (p3 | SAINT_MAX);
+ p0 = (p0 == 0) ? p3 : p0; /* a zero entry does not replace the remembered value */
+ p1 = SA[i + 1];
+ SA[i + 1] = p1 & (p0 | SAINT_MAX);
+ p1 = (p1 == 0) ? p0 : p1;
+ p2 = SA[i + 2];
+ SA[i + 2] = p2 & (p1 | SAINT_MAX);
+ p2 = (p2 == 0) ? p1 : p2;
+ p3 = SA[i + 3];
+ SA[i + 3] = p3 & (p2 | SAINT_MAX);
+ p3 = (p3 == 0) ? p2 : p3;
+ }
+
+ for (j += 3; i < j; i += 1) { /* j is now the end of the block */
+ p2 = p3;
+ p3 = SA[i];
+ SA[i] = p3 & (p2 | SAINT_MAX);
+ p3 = (p3 == 0) ? p2 : p3;
+ }
+}
+
+/*
+ * Rewrites every entry of the block that starts at SA[m + omp_block_start]
+ * and spans omp_block_size entries: a negative entry keeps its value with
+ * the top bit cleared, any other entry becomes 0.
+ *
+ * Entries are processed in groups of four with a write prefetch
+ * prefetch_distance ahead; the last 0-3 entries are handled one at a time.
+ */
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
+                                                  fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
+
+    sa_sint_t * RESTRICT slots = &SA[m];
+
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    const fast_sint_t grouped_end = block_end - 3;
+    fast_sint_t pos = omp_block_start;
+
+    while (pos < grouped_end) {
+        fast_sint_t lane;
+
+        prefetchw(&slots[pos + prefetch_distance]);
+
+        for (lane = 0; lane < 4; ++lane) {
+            const sa_sint_t value = slots[pos + lane];
+            slots[pos + lane] = (value < 0 ? value : 0) & SAINT_MAX;
+        }
+
+        pos += 4;
+    }
+
+    for (; pos < block_end; ++pos) {
+        const sa_sint_t value = slots[pos];
+        slots[pos] = (value < 0 ? value : 0) & SAINT_MAX;
+    }
+}
+
+/*
+ * Single-block driver for libsais_renumber_distinct_lms_suffixes_32s_4k:
+ * processes SA[0 .. m) as one block with an initial name of 1 and returns
+ * the worker's final name minus 1. `threads` and `thread_state` are
+ * accepted but not used.
+ */
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m,
+                                                                   sa_sint_t threads,
+                                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)m;
+    sa_sint_t next_name;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    next_name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, block_begin, block_length);
+
+    return next_name - 1;
+}
+
+/*
+ * Single-block driver for libsais_mark_distinct_lms_suffixes_32s: processes
+ * the n / 2 entries that start at SA[m] as one block. `threads` is accepted
+ * but not used.
+ */
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                       sa_sint_t threads) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)n >> 1;
+
+    (void)(threads);
+
+    libsais_mark_distinct_lms_suffixes_32s(SA, m, block_begin, block_length);
+}
+
+/*
+ * Single-block driver for libsais_clamp_lms_suffixes_length_32s: processes
+ * the n / 2 entries that start at SA[m] as one block. `threads` is accepted
+ * but not used.
+ */
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                      sa_sint_t threads) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = (fast_sint_t)n >> 1;
+
+    (void)(threads);
+
+    libsais_clamp_lms_suffixes_length_32s(SA, m, block_begin, block_length);
+}
+
+/*
+ * Zeroes the n / 2 entries starting at SA[m], renumbers the first m entries
+ * of SA, and returns the resulting name count. The marking pass over the
+ * entries at SA[m] only runs when that count is smaller than m.
+ */
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+    sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t name;
+
+    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
+
+    name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
+    if (name >= m) {
+        return name;
+    }
+
+    libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+    return name;
+}
+/*
+ * Assigns names to the m entries of SA[0 .. m) by comparing the substrings
+ * of T they refer to, using the area that starts at SAm = &SA[m] as a
+ * per-position side table (indexed by position >> 1).
+ *
+ * Steps, as performed by the body:
+ *   1. libsais_gather_lms_suffixes_32s(T, SA, n) is called and
+ *      SA[m .. n - m) is zeroed.
+ *      NOTE(review): presumably the gather leaves the m positions in
+ *      SA[n - m .. n) in increasing order; confirm against that function.
+ *   2. For each position in SA[n - m .. n - 1), SAm[pos >> 1] receives
+ *      (next position - pos + 1) + SAINT_MIN; the final position receives
+ *      1 + SAINT_MIN.
+ *   3. libsais_clamp_lms_suffixes_length_32s_omp post-processes the table.
+ *   4. SA[0 .. m) is walked pairwise. Two neighbours are equal when their
+ *      table lengths match and T agrees over that length. The table slot
+ *      of each entry is overwritten with its name; the top bit is added
+ *      only when the entry differs from both its predecessor and its
+ *      successor (pdiff & qdiff). `name` advances whenever an entry
+ *      differs from its predecessor.
+ *   5. If name <= m after the walk, the marking pass is run.
+ *
+ * `threads` is only forwarded to the helper passes.
+ *
+ * Returns name - 1, i.e. the number of names assigned.
+ */
+static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m, sa_sint_t threads) {
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m];
+
+ {
+ libsais_gather_lms_suffixes_32s(T, SA, n);
+
+ memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+
+ fast_sint_t i, j;
+ for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) {
+ prefetch(&SA[i + 2 * prefetch_distance]);
+
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+
+ SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; /* distance to the next position, plus one, top bit set */
+ SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+ }
+
+ for (j += prefetch_distance + 3; i < j; i += 1) { /* j is now n - 1 */
+ SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
+ }
+
+ SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; /* the last position has no successor */
+ }
+
+ { libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); }
+
+ sa_sint_t name = 1;
+
+ {
+ fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1];
+ sa_sint_t pdiff = SAINT_MIN; /* the first entry has no predecessor: treated as different */
+ for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
+ prefetch(&SA[i + 2 * prefetch_distance]);
+
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
+
+ fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
+ sa_sint_t qdiff = SAINT_MIN; /* different unless the comparison below proves equality */
+ if (plen == qlen) {
+ fast_sint_t l = 0;
+ do {
+ if (T[p + l] != T[q + l]) {
+ break;
+ }
+ } while (++l < qlen);
+ qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; /* 0 when all qlen symbols matched, SAINT_MIN otherwise */
+ }
+ SAm[p >> 1] = name | (pdiff & qdiff);
+ name += (qdiff < 0);
+
+ p = SA[i + 1];
+ plen = SAm[p >> 1];
+ pdiff = SAINT_MIN;
+ if (qlen == plen) {
+ fast_sint_t l = 0;
+ do {
+ if (T[q + l] != T[p + l]) {
+ break;
+ }
+ } while (++l < plen);
+ pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
+ }
+ SAm[q >> 1] = name | (qdiff & pdiff);
+ name += (pdiff < 0);
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1) { /* j is now m */
+ fast_sint_t q = SA[i], qlen = SAm[q >> 1];
+ sa_sint_t qdiff = SAINT_MIN;
+ if (plen == qlen) {
+ fast_sint_t l = 0;
+ do {
+ if (T[p + l] != T[q + l]) {
+ break;
+ }
+ } while (++l < plen);
+ qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
+ }
+ SAm[p >> 1] = name | (pdiff & qdiff);
+ name += (qdiff < 0);
+
+ p = q;
+ plen = qlen;
+ pdiff = qdiff;
+ }
+
+ SAm[p >> 1] = name | pdiff; /* the last entry has no successor to compare with */
+ name++;
+ }
+
+ if (name <= m) {
+ libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
+ }
+
+ return name - 1;
+}
+
+/*
+ * Replaces every entry of SA[omp_block_start .. omp_block_start +
+ * omp_block_size) by the value found at that index in the table that starts
+ * at SA[n - m], i.e. SA[i] = SA[n - m + SA[i]].
+ *
+ * Entries are processed in groups of four while the look-ahead used for
+ * prefetching stays inside the block; the rest are handled one at a time.
+ */
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                             fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+    const fast_sint_t prefetch_distance = 32;
+
+    const sa_sint_t * RESTRICT lookup = &SA[n - m];
+
+    const fast_sint_t block_end = omp_block_start + omp_block_size;
+    const fast_sint_t grouped_end = block_end - prefetch_distance - 3;
+    fast_sint_t pos = omp_block_start;
+
+    while (pos < grouped_end) {
+        prefetchw(&SA[pos + 2 * prefetch_distance]);
+
+        prefetch(&lookup[SA[pos + prefetch_distance + 0]]);
+        prefetch(&lookup[SA[pos + prefetch_distance + 1]]);
+        prefetch(&lookup[SA[pos + prefetch_distance + 2]]);
+        prefetch(&lookup[SA[pos + prefetch_distance + 3]]);
+
+        const sa_sint_t key0 = SA[pos + 0];
+        SA[pos + 0] = lookup[key0];
+        const sa_sint_t key1 = SA[pos + 1];
+        SA[pos + 1] = lookup[key1];
+        const sa_sint_t key2 = SA[pos + 2];
+        SA[pos + 2] = lookup[key2];
+        const sa_sint_t key3 = SA[pos + 3];
+        SA[pos + 3] = lookup[key3];
+
+        pos += 4;
+    }
+
+    for (; pos < block_end; ++pos) {
+        const sa_sint_t key = SA[pos];
+        SA[pos] = lookup[key];
+    }
+}
+
+/*
+ * Single-block driver for libsais_reconstruct_lms_suffixes: processes
+ * SA[0 .. m) as one block. `threads` is accepted but not used.
+ */
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
+    const fast_sint_t block_begin = 0;
+    const fast_sint_t block_length = m;
+
+    (void)(threads);
+
+    libsais_reconstruct_lms_suffixes(SA, n, m, block_begin, block_length);
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, working from symbol ALPHABET_SIZE - 2 down to 0.
+ *
+ * For each symbol, the number of entries is the difference between two
+ * neighbouring bucket counters and the bucket end comes from
+ * buckets[7 * ALPHABET_SIZE + symbol]. The gap between a bucket end and the
+ * previously placed group is zeroed, and so is everything below the lowest
+ * placed group.
+ */
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                   const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
+
+    fast_sint_t lowest_placed = n;
+    fast_sint_t symbol;
+
+    for (symbol = ALPHABET_SIZE - 2; symbol >= 0; --symbol) {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1) + BUCKETS_INDEX2(1, 0)] -
+                                  (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        if (count <= 0) {
+            continue;
+        }
+
+        const fast_sint_t end = bucket_end[symbol];
+        const fast_sint_t gap = lowest_placed - end;
+        if (gap > 0) {
+            memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+        }
+
+        m -= (sa_sint_t)count;
+        lowest_placed = end - count;
+        memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, working from symbol k - 2 down to 0.
+ *
+ * For each symbol, the number of entries is the difference between two
+ * neighbouring bucket counters and the bucket end comes from
+ * buckets[3 * k + symbol]. The gap between a bucket end and the previously
+ * placed group is zeroed, and so is everything below the lowest placed
+ * group.
+ */
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+    fast_sint_t lowest_placed = n;
+    fast_sint_t symbol;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; --symbol) {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1) + BUCKETS_INDEX2(1, 0)] -
+                                  (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        if (count <= 0) {
+            continue;
+        }
+
+        const fast_sint_t end = bucket_end[symbol];
+        const fast_sint_t gap = lowest_placed - end;
+        if (gap > 0) {
+            memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+        }
+
+        m -= (sa_sint_t)count;
+        lowest_placed = end - count;
+        memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, for the 2k bucket layout.
+ *
+ * The bucket index runs from BUCKETS_INDEX2(k - 2, 0) down to
+ * BUCKETS_INDEX2(0, 0). For each one, the number of entries is the
+ * difference between two bucket counters and the bucket end is
+ * buckets[index]. Gaps between placed groups are zeroed, as is everything
+ * below the lowest placed group. With k <= 1 nothing is moved and
+ * SA[0 .. n) is zeroed.
+ */
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+                                                       const sa_sint_t * RESTRICT buckets) {
+    fast_sint_t lowest_placed = n;
+
+    if (k > 1) {
+        fast_sint_t slot;
+        for (slot = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); slot >= BUCKETS_INDEX2(0, 0);
+             slot -= BUCKETS_INDEX2(1, 0)) {
+            const fast_sint_t count = (fast_sint_t)buckets[slot + BUCKETS_INDEX2(1, 1)] -
+                                      (fast_sint_t)buckets[slot + BUCKETS_INDEX2(0, 1)];
+            if (count <= 0) {
+                continue;
+            }
+
+            const fast_sint_t end = buckets[slot];
+            const fast_sint_t gap = lowest_placed - end;
+            if (gap > 0) {
+                memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+            }
+
+            m -= (sa_sint_t)count;
+            lowest_placed = end - count;
+            memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+/*
+ * Spreads the m entries packed in SA[0 .. m) into their buckets, walking
+ * them from index m - 1 down to 0.
+ *
+ * c is the symbol of the bucket currently being filled (initially k - 1)
+ * and l is the write cursor (initially buckets[k - 1]). Whenever the symbol
+ * T[p] of the next entry differs from c, the range SA[buckets[T[p]] .. l)
+ * is zeroed and l restarts at buckets[T[p]]; the entry is then stored at
+ * SA[--l]. After the walk, SA[0 .. l) is zeroed.
+ *
+ * The first loop handles four entries per iteration and prefetches the T
+ * symbols of the entries prefetch_distance ahead; the second loop handles
+ * the remaining entries. Each entry is read before anything is written for
+ * it, so the statement order must not be changed.
+ *
+ * NOTE(review): correctness relies on buckets[] holding bucket end
+ * positions that never fall below the read index; confirm with the caller.
+ */
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) {
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t c = k - 1;
+ fast_sint_t i, l = buckets[c];
+ for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) {
+ prefetch(&SA[i - 2 * prefetch_distance]);
+
+ prefetch(&T[SA[i - prefetch_distance - 0]]);
+ prefetch(&T[SA[i - prefetch_distance - 1]]);
+ prefetch(&T[SA[i - prefetch_distance - 2]]);
+ prefetch(&T[SA[i - prefetch_distance - 3]]);
+
+ sa_sint_t p0 = SA[i - 0];
+ if (T[p0] != c) { /* switching to another bucket */
+ c = T[p0];
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ l = buckets[c];
+ }
+ SA[--l] = p0;
+ sa_sint_t p1 = SA[i - 1];
+ if (T[p1] != c) {
+ c = T[p1];
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ l = buckets[c];
+ }
+ SA[--l] = p1;
+ sa_sint_t p2 = SA[i - 2];
+ if (T[p2] != c) {
+ c = T[p2];
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ l = buckets[c];
+ }
+ SA[--l] = p2;
+ sa_sint_t p3 = SA[i - 3];
+ if (T[p3] != c) {
+ c = T[p3];
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ l = buckets[c];
+ }
+ SA[--l] = p3;
+ }
+
+ for (; i >= 0; i -= 1) { /* remaining entries, no prefetching */
+ sa_sint_t p = SA[i];
+ if (T[p] != c) {
+ c = T[p];
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ l = buckets[c];
+ }
+ SA[--l] = p;
+ }
+
+ memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, working from symbol k - 2 down to 0.
+ *
+ * The number of entries for a symbol is read directly from
+ * buckets[BUCKETS_INDEX4(symbol, 1)] and the bucket end from
+ * buckets[5 * k + symbol]. Gaps between placed groups are zeroed, as is
+ * everything below the lowest placed group.
+ */
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
+
+    fast_sint_t lowest_placed = n;
+    fast_sint_t symbol;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; --symbol) {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX4(symbol, 1)];
+        if (count <= 0) {
+            continue;
+        }
+
+        const fast_sint_t end = bucket_end[symbol];
+        const fast_sint_t gap = lowest_placed - end;
+        if (gap > 0) {
+            memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+        }
+
+        m -= (sa_sint_t)count;
+        lowest_placed = end - count;
+        memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, working from symbol k - 2 down to 0.
+ *
+ * The number of entries for a symbol is read directly from
+ * buckets[BUCKETS_INDEX2(symbol, 1)] and the bucket end from
+ * buckets[3 * k + symbol]. Gaps between placed groups are zeroed, as is
+ * everything below the lowest placed group.
+ */
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
+
+    fast_sint_t lowest_placed = n;
+    fast_sint_t symbol;
+
+    for (symbol = (fast_sint_t)k - 2; symbol >= 0; --symbol) {
+        const fast_sint_t count = (fast_sint_t)buckets[BUCKETS_INDEX2(symbol, 1)];
+        if (count <= 0) {
+            continue;
+        }
+
+        const fast_sint_t end = bucket_end[symbol];
+        const fast_sint_t gap = lowest_placed - end;
+        if (gap > 0) {
+            memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+        }
+
+        m -= (sa_sint_t)count;
+        lowest_placed = end - count;
+        memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+
+/*
+ * Moves the entries packed at the front of SA (ending at index m) to the
+ * ends of their buckets, for the 2k bucket layout.
+ *
+ * The bucket index runs from BUCKETS_INDEX2(k - 2, 0) down to
+ * BUCKETS_INDEX2(0, 0). The number of entries is read directly from
+ * buckets[index + BUCKETS_INDEX2(0, 1)] and the bucket end from
+ * buckets[index]. Gaps between placed groups are zeroed, as is everything
+ * below the lowest placed group. With k <= 1 nothing is moved and
+ * SA[0 .. n) is zeroed.
+ */
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+                                                        const sa_sint_t * RESTRICT buckets) {
+    fast_sint_t lowest_placed = n;
+
+    if (k > 1) {
+        fast_sint_t slot;
+        for (slot = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); slot >= BUCKETS_INDEX2(0, 0);
+             slot -= BUCKETS_INDEX2(1, 0)) {
+            const fast_sint_t count = (fast_sint_t)buckets[slot + BUCKETS_INDEX2(0, 1)];
+            if (count <= 0) {
+                continue;
+            }
+
+            const fast_sint_t end = buckets[slot];
+            const fast_sint_t gap = lowest_placed - end;
+            if (gap > 0) {
+                memset(&SA[end], 0, (size_t)gap * sizeof(sa_sint_t));
+            }
+
+            m -= (sa_sint_t)count;
+            lowest_placed = end - count;
+            memmove(&SA[lowest_placed], &SA[m], (size_t)count * sizeof(sa_sint_t));
+        }
+    }
+
+    memset(&SA[0], 0, (size_t)lowest_placed * sizeof(sa_sint_t));
+}
+/*
+ * Left-to-right induction pass over SA[omp_block_start .. omp_block_start +
+ * omp_block_size) that also leaves output symbols behind.
+ *
+ * For each entry p the top bit is cleared in place. When p > 0, the entry
+ * is instead replaced by the symbol T[p - 1] with the top bit set, and
+ * position p - 1 is appended to the bucket of that symbol
+ * (SA[induction_bucket[T[p - 1]]++]). The appended value carries the top
+ * bit when the symbol before it is smaller than T[p - 1]; the
+ * `p - (p > 0)` index avoids reading T[-1] when the position is 0.
+ *
+ * The scan appends to the same array it is reading, so entries written
+ * ahead of the read cursor are processed later in the same pass; the
+ * statement order must not be changed.
+ *
+ * The first loop handles two entries per iteration and prefetches the T
+ * bytes needed by the entries prefetch_distance ahead; the second loop
+ * handles the remaining entries.
+ */
+static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0];
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--; /* p0 now names the preceding position */
+ SA[i + 0] = T[p0] | SAINT_MIN;
+ SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ }
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ SA[i + 1] = T[p1] | SAINT_MIN;
+ SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1) { /* j is now the end of the block */
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ SA[i] = T[p] | SAINT_MIN;
+ SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Left-to-right induction pass over SA[omp_block_start .. omp_block_start +
+ * omp_block_size) that leaves output symbols behind and records auxiliary
+ * indexes.
+ *
+ * For each entry p the top bit is cleared in place. When p > 0, the entry
+ * is instead replaced by the symbol T[p - 1] with the top bit set, and
+ * position p - 1 is appended to the bucket of that symbol. The appended
+ * value carries the top bit when the symbol before it is smaller than
+ * T[p - 1]. Additionally, when ((p - 1) & rm) == 0, the bucket cursor
+ * value after the append is stored in I[(p - 1) / (rm + 1)].
+ * NOTE(review): this sampling only makes sense if rm + 1 is a power of
+ * two; confirm with the caller.
+ *
+ * The scan appends to the same array it is reading, so the statement order
+ * must not be changed.
+ *
+ * The first loop handles two entries per iteration and prefetches the T
+ * bytes needed by the entries prefetch_distance ahead; the second loop
+ * handles the remaining entries.
+ */
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+ sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0];
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--; /* p0 now names the preceding position */
+ SA[i + 0] = T[p0] | SAINT_MIN;
+ SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ if ((p0 & rm) == 0) {
+ I[p0 / (rm + 1)] = induction_bucket[T[p0]]; /* cursor value after the append above */
+ }
+ }
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ SA[i + 1] = T[p1] | SAINT_MIN;
+ SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ if ((p1 & rm) == 0) {
+ I[p1 / (rm + 1)] = induction_bucket[T[p1]];
+ }
+ }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1) { /* j is now the end of the block */
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ SA[i] = T[p] | SAINT_MIN;
+ SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ if ((p & rm) == 0) {
+ I[p / (rm + 1)] = induction_bucket[T[p]];
+ }
+ }
+ }
+}
+/*
+ * Left-to-right induction pass over SA[omp_block_start .. omp_block_start +
+ * omp_block_size) for 8-bit input.
+ *
+ * For each entry p the top bit is flipped in place (p ^ SAINT_MIN). When
+ * p > 0, position p - 1 is appended to the bucket of its symbol
+ * (SA[induction_bucket[T[p - 1]]++]). The appended value carries the top
+ * bit when the symbol before it is smaller than T[p - 1]; the
+ * `p - (p > 0)` index avoids reading T[-1] when the position is 0.
+ *
+ * The scan appends to the same array it is reading, so the statement order
+ * must not be changed.
+ *
+ * The first loop handles two entries per iteration and prefetches the T
+ * bytes needed by the entries prefetch_distance ahead; the second loop
+ * handles the remaining entries.
+ */
+static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + prefetch_distance + 0];
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + prefetch_distance + 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 ^ SAINT_MIN;
+ if (p0 > 0) {
+ p0--; /* p0 now names the preceding position */
+ SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ }
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 ^ SAINT_MIN;
+ if (p1 > 0) {
+ p1--;
+ SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ }
+ }
+
+ for (j += prefetch_distance + 1; i < j; i += 1) { /* j is now the end of the block */
+ sa_sint_t p = SA[i];
+ SA[i] = p ^ SAINT_MIN;
+ if (p > 0) {
+ p--;
+ SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Left-to-right induction pass over SA[omp_block_start .. omp_block_start +
+ * omp_block_size) for 32-bit input.
+ *
+ * For each entry p the top bit is flipped in place (p ^ SAINT_MIN). When
+ * p > 0, position p - 1 is appended to the bucket of its symbol
+ * (SA[induction_bucket[T[p - 1]]++]). The appended value carries the top
+ * bit when the symbol before it is smaller than T[p - 1]; the
+ * `p - (p > 0)` index avoids reading T[-1] when the position is 0.
+ *
+ * The scan appends to the same array it is reading, so the statement order
+ * must not be changed.
+ *
+ * Prefetching is two-staged: T is prefetched for the entries
+ * 2 * prefetch_distance ahead, and the induction_bucket slot (which needs
+ * that T value) for the entries prefetch_distance ahead. The first loop
+ * handles two entries per iteration; the second loop handles the remaining
+ * entries without prefetching.
+ */
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
+ prefetchw(&SA[i + 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
+ const sa_sint_t * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
+ const sa_sint_t * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
+ if (s2 > 0) {
+ prefetchw(&induction_bucket[T[s2 - 1]]);
+ prefetch(&T[s2] - 2);
+ }
+ sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
+ if (s3 > 0) {
+ prefetchw(&induction_bucket[T[s3 - 1]]);
+ prefetch(&T[s3] - 2);
+ }
+
+ sa_sint_t p0 = SA[i + 0];
+ SA[i + 0] = p0 ^ SAINT_MIN;
+ if (p0 > 0) {
+ p0--; /* p0 now names the preceding position */
+ SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ }
+ sa_sint_t p1 = SA[i + 1];
+ SA[i + 1] = p1 ^ SAINT_MIN;
+ if (p1 > 0) {
+ p1--;
+ SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ }
+ }
+
+ for (j += 2 * prefetch_distance + 1; i < j; i += 1) { /* j is now the end of the block */
+ sa_sint_t p = SA[i];
+ SA[i] = p ^ SAINT_MIN;
+ if (p > 0) {
+ p--;
+ SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Driver for the left-to-right final BWT induction pass on 8-bit input.
+ *
+ * Seeds the pass by appending position n - 1 to the bucket of T[n - 1]
+ * (with the top bit set when T[n - 2] < T[n - 1]) and then scans
+ * SA[0 .. n) with libsais_final_bwt_scan_left_to_right_8u.
+ *
+ * This function has no parallel code path, so the sequential scan is run
+ * unconditionally. It used to be guarded by `threads == 1 || n < 65536`,
+ * which silently skipped the whole pass for any other argument
+ * combination. `threads` and `thread_state` are accepted for signature
+ * compatibility and ignored.
+ *
+ * The caller must guarantee n >= 2, since T[n - 2] is read.
+ */
+static void libsais_final_bwt_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
+                                                        sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                        LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the left-to-right final BWT induction pass on 8-bit input
+ * that also records auxiliary indexes in I.
+ *
+ * Seeds the pass by appending position n - 1 to the bucket of T[n - 1]
+ * (with the top bit set when T[n - 2] < T[n - 1]). When
+ * ((n - 1) & rm) == 0, the bucket cursor after that append is stored in
+ * I[(n - 1) / (rm + 1)]. It then scans SA[0 .. n) with
+ * libsais_final_bwt_aux_scan_left_to_right_8u.
+ *
+ * This function has no parallel code path, so the sequential scan is run
+ * unconditionally. It used to be guarded by `threads == 1 || n < 65536`,
+ * which silently skipped the whole pass for any other argument
+ * combination. `threads` and `thread_state` are accepted for signature
+ * compatibility and ignored.
+ *
+ * The caller must guarantee n >= 2, since T[n - 2] is read.
+ */
+static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                            fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    if ((((sa_sint_t)n - 1) & rm) == 0) {
+        I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
+    }
+
+    libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the left-to-right final sorting induction pass on 8-bit
+ * input.
+ *
+ * Seeds the pass by appending position n - 1 to the bucket of T[n - 1]
+ * (with the top bit set when T[n - 2] < T[n - 1]) and then scans
+ * SA[0 .. n) with libsais_final_sorting_scan_left_to_right_8u.
+ *
+ * This function has no parallel code path, so the sequential scan is run
+ * unconditionally. It used to be guarded by `threads == 1 || n < 65536`,
+ * which silently skipped the whole pass for any other argument
+ * combination. `threads` and `thread_state` are accepted for signature
+ * compatibility and ignored.
+ *
+ * The caller must guarantee n >= 2, since T[n - 2] is read.
+ */
+static void libsais_final_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                            fast_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                            sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
+        ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
+
+    libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the left-to-right final sorting induction pass on 32-bit
+ * input.
+ *
+ * Seeds the pass by appending position n - 1 to the bucket of T[n - 1]
+ * (with the top bit set when T[n - 2] < T[n - 1]) and then scans
+ * SA[0 .. n) with libsais_final_sorting_scan_left_to_right_32s.
+ *
+ * This function has no parallel code path, so the sequential scan is run
+ * unconditionally. It used to be guarded by `threads == 1 || n < 65536`,
+ * which silently skipped the whole pass for any other argument
+ * combination. `threads` and `thread_state` are accepted for signature
+ * compatibility and ignored.
+ *
+ * The caller must guarantee n >= 2, since T[n - 2] is read.
+ */
+static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(threads);
+    (void)(thread_state);
+
+    SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+
+    libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
+}
+/*
+ * Right-to-left induction pass over SA[omp_block_start .. omp_block_start +
+ * omp_block_size), scanned from the highest index down, that also leaves
+ * output symbols behind.
+ *
+ * For each entry p:
+ *   - if p == 0, its index is remembered in `index`,
+ *   - the top bit is cleared in place,
+ *   - if p > 0, let c1 = T[p - 1] and c0 be the symbol before it (the
+ *     `p - (p > 0)` index avoids reading T[-1]). The entry is replaced by
+ *     c1, and the slot SA[--induction_bucket[c1]] receives position p - 1
+ *     when c0 <= c1, or c0 with the top bit set otherwise.
+ *
+ * The scan writes into the same array it is reading, so the statement
+ * order must not be changed.
+ *
+ * The first loop handles two entries per iteration and prefetches the T
+ * bytes needed by the entries prefetch_distance further down; the second
+ * loop handles the remaining entries.
+ *
+ * Returns the last index at which an entry equal to 0 was read (the lowest
+ * such index, as the scan runs downwards), or -1 if there was none.
+ */
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ sa_sint_t index = -1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
+ prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0];
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0];
+ index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
+ SA[i - 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--; /* p0 now names the preceding position */
+ u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
+ SA[i - 0] = c1;
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+ }
+
+ sa_sint_t p1 = SA[i - 1];
+ index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
+ SA[i - 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+ SA[i - 1] = c1;
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+ }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1) { /* j is now the first index of the block */
+ sa_sint_t p = SA[i];
+ index = (p == 0) ? (sa_sint_t)i : index;
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ u8 c0 = T[p - (p > 0)], c1 = T[p];
+ SA[i] = c1;
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+ }
+ }
+
+ return index;
+}
+
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
+ sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Right-to-left pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1] that
+ * also fills the auxiliary array I.
+ *
+ * Each entry p is masked in place with SAINT_MAX. When p > 0 the slot is overwritten with
+ * the byte c1 = T[p - 1] and a new entry is written at SA[--induction_bucket[c1]]: the
+ * position p - 1 when c0 <= c1, otherwise c0 | SAINT_MIN (c0 is the byte before c1, or c1
+ * itself when p - 1 is 0). In addition, whenever ((p - 1) & rm) == 0, the value
+ * induction_bucket[c1] + 1 (one past the slot just written) is stored at I[(p - 1) / (rm + 1)].
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { /* two entries per iteration; the bound keeps SA[i - prefetch_distance - 1] inside the block */
+ prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; /* entry that will be processed prefetch_distance slots later */
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL); /* NULL is passed when the entry is not positive */
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[i - 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--;
+ u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0]; /* (p0 > 0) avoids reading T[-1] when p0 is 0 */
+ SA[i - 0] = c1; /* the slot now holds a text byte instead of a position */
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
+ if ((p0 & rm) == 0) { /* sampled position: record one past the slot just written */
+ I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1;
+ }
+ }
+
+ sa_sint_t p1 = SA[i - 1];
+ SA[i - 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
+ SA[i - 1] = c1;
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
+ if ((p1 & rm) == 0) {
+ I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1;
+ }
+ }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1) { /* tail: j is reset to omp_block_start; same step without prefetching */
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ u8 c0 = T[p - (p > 0)], c1 = T[p];
+ SA[i] = c1;
+ sa_sint_t t = c0 | SAINT_MIN;
+ SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
+ if ((p & rm) == 0) {
+ I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
+ }
+ }
+ }
+}
+
+static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Right-to-left pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1].
+ *
+ * Each entry p is masked in place with SAINT_MAX. When p > 0, the position q = p - 1 is
+ * written at SA[--induction_bucket[T[q]]], with bit SAINT_BIT - 1 set when the byte before
+ * T[q] is greater than T[q] (for q == 0 the byte is compared with itself, so the bit stays
+ * clear).
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { /* two entries per iteration; the bound keeps SA[i - prefetch_distance - 1] inside the block */
+ prefetchw(&SA[i - 2 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - prefetch_distance - 0]; /* entry that will be processed prefetch_distance slots later */
+ const u8 * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL); /* NULL is passed when the entry is not positive */
+ Ts0--;
+ prefetch(s0 > 0 ? Ts0 : NULL);
+ sa_sint_t s1 = SA[i - prefetch_distance - 1];
+ const u8 * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ Ts1--;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[i - 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--;
+ SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); /* (p0 > 0) avoids reading T[-1] */
+ }
+ sa_sint_t p1 = SA[i - 1];
+ SA[i - 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+ }
+ }
+
+ for (j -= prefetch_distance + 1; i >= j; i -= 1) { /* tail: j is reset to omp_block_start; same step without prefetching */
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+ }
+ }
+}
+
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Right-to-left pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1] for
+ * 32-bit integer text.
+ *
+ * Each entry p is masked in place with SAINT_MAX. When p > 0, the position q = p - 1 is
+ * written at SA[--induction_bucket[T[q]]], with bit SAINT_BIT - 1 set when the symbol before
+ * T[q] is greater than T[q] (for q == 0 the symbol is compared with itself).
+ *
+ * The unrolled loop looks ahead at two distances: entries 2 * prefetch_distance ahead have
+ * their text neighbourhood prefetched, entries 1 * prefetch_distance ahead have their bucket
+ * slot prefetched for writing.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ fast_sint_t i, j;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { /* the bound keeps SA[i - 2 * prefetch_distance - 1] inside the block */
+ prefetchw(&SA[i - 3 * prefetch_distance]);
+
+ sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
+ const sa_sint_t * Ts0 = &T[s0] - 1;
+ prefetch(s0 > 0 ? Ts0 : NULL); /* NULL is passed when the entry is not positive */
+ sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
+ const sa_sint_t * Ts1 = &T[s1] - 1;
+ prefetch(s1 > 0 ? Ts1 : NULL);
+ sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
+ if (s2 > 0) { /* T[s2 - 1] is actually read here to locate the bucket slot to prefetch */
+ prefetchw(&induction_bucket[T[s2 - 1]]);
+ prefetch(&T[s2] - 2);
+ }
+ sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
+ if (s3 > 0) {
+ prefetchw(&induction_bucket[T[s3 - 1]]);
+ prefetch(&T[s3] - 2);
+ }
+
+ sa_sint_t p0 = SA[i - 0];
+ SA[i - 0] = p0 & SAINT_MAX;
+ if (p0 > 0) {
+ p0--;
+ SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); /* (p0 > 0) avoids reading T[-1] */
+ }
+ sa_sint_t p1 = SA[i - 1];
+ SA[i - 1] = p1 & SAINT_MAX;
+ if (p1 > 0) {
+ p1--;
+ SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+ }
+ }
+
+ for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) { /* tail: j is reset to omp_block_start; same step without prefetching */
+ sa_sint_t p = SA[i];
+ SA[i] = p & SAINT_MAX;
+ if (p > 0) {
+ p--;
+ SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+ }
+ }
+}
+/*
+ * Driver for the right-to-left BWT scan over 8-bit text.
+ *
+ * Runs libsais_final_bwt_scan_right_to_left_8u over the whole range [0, n) and returns its
+ * result.
+ *
+ * NOTE(review): when threads != 1 and n >= 65536 no scan is run and -1 is returned;
+ * thread_state is never used in this function.
+ */
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    if (threads != 1 && n >= 65536) {
+        return -1;
+    }
+    return libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the right-to-left BWT scan that also fills the auxiliary array I (8-bit text).
+ *
+ * Forwards to libsais_final_bwt_aux_scan_right_to_left_8u over the whole range [0, n).
+ *
+ * NOTE(review): nothing is done when threads != 1 and n >= 65536; thread_state is never used
+ * in this function.
+ */
+static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                            sa_sint_t rm, sa_sint_t * RESTRICT I,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    if (threads != 1 && n >= 65536) {
+        return;
+    }
+    libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the right-to-left final sorting scan over 8-bit text.
+ *
+ * Forwards to libsais_final_sorting_scan_right_to_left_8u over the whole range [0, n).
+ *
+ * NOTE(review): nothing is done when threads != 1 and n >= 65536; thread_state is never used
+ * in this function.
+ */
+static void libsais_final_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                            sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+                                                            LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    if (threads != 1 && n >= 65536) {
+        return;
+    }
+    libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
+}
+
+/*
+ * Driver for the right-to-left final sorting scan over 32-bit integer text.
+ *
+ * Forwards to libsais_final_sorting_scan_right_to_left_32s over the whole range [0, n).
+ *
+ * NOTE(review): nothing is done when threads != 1 and n >= 65536; thread_state is never used
+ * in this function.
+ */
+static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                             sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
+                                                             sa_sint_t threads,
+                                                             LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    (void)(thread_state);
+
+    if (threads != 1 && n >= 65536) {
+        return;
+    }
+    libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
+}
+
+/*
+ * Zero-fills, for every symbol c in [0, k), the SA slots in the half-open range
+ * [bucket_start[c], bucket_end[c]). Symbols whose range is empty or inverted are skipped.
+ *
+ * n and threads are accepted but not used.
+ */
+static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+                                           sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end,
+                                           sa_sint_t threads) {
+    fast_sint_t symbol = 0;
+
+    (void)(n);
+    (void)(threads);
+
+    while (symbol < k) {
+        const sa_sint_t first = bucket_start[symbol];
+        const sa_sint_t limit = bucket_end[symbol];
+
+        if (limit > first) {
+            memset(&SA[first], 0, ((size_t)limit - (size_t)first) * sizeof(sa_sint_t));
+        }
+        symbol += 1;
+    }
+}
+
+/*
+ * Runs the final left-to-right and right-to-left induction passes for 8-bit text and picks
+ * the scan flavour from the arguments:
+ *   - bwt == 0:            plain sorting scans; returns 0.
+ *   - bwt != 0, I != NULL: BWT scans that also fill I, called with rm = r - 1; returns 0.
+ *   - bwt != 0, I == NULL: BWT scans; returns the value produced by
+ *                          libsais_final_bwt_scan_right_to_left_8u_omp.
+ *
+ * The left-to-right pass uses the bucket table at buckets[6 * ALPHABET_SIZE], the
+ * right-to-left pass the one at buckets[7 * ALPHABET_SIZE]. Between the two passes, when
+ * threads > 1 and n >= 65536, the SA ranges delimited by those two tables are zeroed via
+ * libsais_clear_lms_suffixes_omp.
+ */
+static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                   sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I,
+                                                   sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                                   LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * const front = &buckets[6 * ALPHABET_SIZE];
+    sa_sint_t * const back = &buckets[7 * ALPHABET_SIZE];
+    const int wipe_between_passes = threads > 1 && n >= 65536;
+
+    if (bwt && I == NULL) {
+        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, front, threads, thread_state);
+        if (wipe_between_passes) {
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, front, back, threads);
+        }
+        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, back, threads, thread_state);
+    }
+
+    if (bwt) {
+        libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, front, threads, thread_state);
+        if (wipe_between_passes) {
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, front, back, threads);
+        }
+        libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, back, threads, thread_state);
+    } else {
+        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, front, threads, thread_state);
+        if (wipe_between_passes) {
+            libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, front, back, threads);
+        }
+        libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, back, threads, thread_state);
+    }
+
+    return 0;
+}
+
+/*
+ * Final induction for 32-bit text with the 6k bucket layout: the left-to-right scan uses the
+ * table at buckets[4 * k], the right-to-left scan the table at buckets[5 * k].
+ */
+static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * const forward_table = &buckets[4 * k];
+    sa_sint_t * const backward_table = &buckets[5 * k];
+
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, forward_table, threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, backward_table, threads, thread_state);
+}
+
+/*
+ * Final induction for 32-bit text with the 4k bucket layout: the left-to-right scan uses the
+ * table at buckets[2 * k], the right-to-left scan the table at buckets[3 * k].
+ */
+static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * const forward_table = &buckets[2 * k];
+    sa_sint_t * const backward_table = &buckets[3 * k];
+
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, forward_table, threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, backward_table, threads, thread_state);
+}
+
+/*
+ * Final induction for 32-bit text with the 2k bucket layout: the left-to-right scan uses the
+ * table at buckets[1 * k], the right-to-left scan the table at buckets[0 * k].
+ */
+static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    sa_sint_t * const forward_table = &buckets[1 * k];
+    sa_sint_t * const backward_table = &buckets[0 * k];
+
+    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, forward_table, threads, thread_state);
+    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, backward_table, threads, thread_state);
+}
+
+/*
+ * Final induction for 32-bit text with a single k-entry bucket table. Because the same table
+ * serves both passes, it is rebuilt before each one: libsais_count_suffixes_32s is called,
+ * then the start-initialiser before the left-to-right scan and the end-initialiser before
+ * the right-to-left scan.
+ */
+static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                              sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+                                              LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    { /* pass 1: left to right */
+        libsais_count_suffixes_32s(T, n, k, buckets);
+        libsais_initialize_buckets_start_32s_1k(k, buckets);
+        libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
+    }
+    { /* pass 2: right to left */
+        libsais_count_suffixes_32s(T, n, k, buckets);
+        libsais_initialize_buckets_end_32s_1k(k, buckets);
+        libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
+    }
+}
+
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t m, sa_sint_t f,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Forward pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1].
+ *
+ * For each entry p (read as unsigned) the value s stored at SA[m + (p >> 1)] is examined:
+ *   - s < 0:  T[p] gets SAINT_MIN OR-ed in, the running counter f is incremented, and the
+ *             slot is rewritten with i + SAINT_MIN (i being the current SA index);
+ *   - s >= 0: the slot is rewritten with s - f, f being the counter value at that moment.
+ *
+ * Returns the final value of f, i.e. the incoming f plus the number of negative slots met.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAm = &SA[m]; /* the per-position slots live right after the first m entries */
+
+ sa_sint_t i, j;
+ for (i = (sa_sint_t)omp_block_start,
+ j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3;
+ i < j; i += 4) { /* four entries per iteration; the bound keeps SA[i + 2 * prefetch_distance + 3] inside the block */
+ prefetch(&SA[i + 3 * prefetch_distance]);
+
+ prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+ prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+
+ sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0];
+ const sa_sint_t * Tq0 = &T[q0];
+ prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); /* T[q0] is only written later when its slot is negative */
+ sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1];
+ const sa_sint_t * Tq1 = &T[q1];
+ prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
+ sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2];
+ const sa_sint_t * Tq2 = &T[q2];
+ prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
+ sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3];
+ const sa_sint_t * Tq3 = &T[q3];
+ prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
+
+ sa_uint_t p0 = (sa_uint_t)SA[i + 0];
+ sa_sint_t s0 = SAm[p0 >> 1];
+ if (s0 < 0) {
+ T[p0] |= SAINT_MIN; /* mark this text position */
+ f++;
+ s0 = i + 0 + SAINT_MIN + f; /* the + f cancels against the - f below, leaving i + SAINT_MIN */
+ }
+ SAm[p0 >> 1] = s0 - f;
+ sa_uint_t p1 = (sa_uint_t)SA[i + 1];
+ sa_sint_t s1 = SAm[p1 >> 1];
+ if (s1 < 0) {
+ T[p1] |= SAINT_MIN;
+ f++;
+ s1 = i + 1 + SAINT_MIN + f;
+ }
+ SAm[p1 >> 1] = s1 - f;
+ sa_uint_t p2 = (sa_uint_t)SA[i + 2];
+ sa_sint_t s2 = SAm[p2 >> 1];
+ if (s2 < 0) {
+ T[p2] |= SAINT_MIN;
+ f++;
+ s2 = i + 2 + SAINT_MIN + f;
+ }
+ SAm[p2 >> 1] = s2 - f;
+ sa_uint_t p3 = (sa_uint_t)SA[i + 3];
+ sa_sint_t s3 = SAm[p3 >> 1];
+ if (s3 < 0) {
+ T[p3] |= SAINT_MIN;
+ f++;
+ s3 = i + 3 + SAINT_MIN + f;
+ }
+ SAm[p3 >> 1] = s3 - f;
+ }
+
+ for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) { /* tail: j is reset to the block end; same step without prefetching */
+ sa_uint_t p = (sa_uint_t)SA[i];
+ sa_sint_t s = SAm[p >> 1];
+ if (s < 0) {
+ T[p] |= SAINT_MIN;
+ f++;
+ s = i + SAINT_MIN + f;
+ }
+ SAm[p >> 1] = s - f;
+ }
+
+ return f;
+}
+
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
+ fast_sint_t * pl, fast_sint_t * pr,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Backward pass over SA[m + omp_block_start .. m + omp_block_start + omp_block_size - 1]
+ * that splits the entries into two packed runs, both growing downwards:
+ *   - negative entries are stored, masked with SAINT_MAX, below index *pl;
+ *   - positive entries are stored, decremented by one, below index *pr.
+ * Zero entries are dropped. Every entry is written to both candidate slots unconditionally
+ * and only the matching cursor moves, so the loop body has no branches.
+ *
+ * On return *pl and *pr hold the lowest index written by the respective run (unchanged if
+ * the run received nothing).
+ *
+ * NOTE(review): SAl and SAr are both RESTRICT-qualified aliases of SA; confirm that the
+ * ranges written through them never coincide with elements accessed through the other
+ * names.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ sa_sint_t * RESTRICT SAl = &SA[0];
+ sa_sint_t * RESTRICT SAr = &SA[0];
+
+ fast_sint_t i, j, l = *pl - 1, r = *pr - 1; /* cursors point at the next free slot of each run */
+ for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
+ i -= 4) { /* four entries per iteration */
+ prefetch(&SA[i - prefetch_distance]);
+
+ sa_sint_t p0 = SA[i - 0];
+ SAl[l] = p0 & SAINT_MAX; /* speculative store; kept only if the cursor moves */
+ l -= p0 < 0;
+ SAr[r] = p0 - 1; /* speculative store; kept only if the cursor moves */
+ r -= p0 > 0;
+ sa_sint_t p1 = SA[i - 1];
+ SAl[l] = p1 & SAINT_MAX;
+ l -= p1 < 0;
+ SAr[r] = p1 - 1;
+ r -= p1 > 0;
+ sa_sint_t p2 = SA[i - 2];
+ SAl[l] = p2 & SAINT_MAX;
+ l -= p2 < 0;
+ SAr[r] = p2 - 1;
+ r -= p2 > 0;
+ sa_sint_t p3 = SA[i - 3];
+ SAl[l] = p3 & SAINT_MAX;
+ l -= p3 < 0;
+ SAr[r] = p3 - 1;
+ r -= p3 > 0;
+ }
+
+ for (j -= 3; i >= j; i -= 1) { /* tail: j is reset to m + omp_block_start */
+ sa_sint_t p = SA[i];
+ SAl[l] = p & SAINT_MAX;
+ l -= p < 0;
+ SAr[r] = p - 1;
+ r -= p > 0;
+ }
+
+ *pl = l + 1;
+ *pr = r + 1;
+}
+/*
+ * Single-block driver: runs libsais_renumber_unique_and_nonunique_lms_suffixes_32s over the
+ * whole range [0, m) with an initial counter of 0 and returns the counter it produces.
+ *
+ * threads and thread_state are accepted but not used.
+ */
+static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
+    sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = m - block_start;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    return libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, block_start, block_size);
+}
+
+/*
+ * Single-block driver for libsais_compact_unique_and_nonunique_lms_suffixes_32s.
+ *
+ * Compacts the n >> 1 entries that follow SA[m], with the left run ending at index m and the
+ * right run ending at index n + fs. Afterwards the f entries at SA[m - f .. m - 1] are copied
+ * to SA[n + fs - m ..].
+ *
+ * threads and thread_state are accepted but not used.
+ */
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+                                                                      sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
+                                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = ((fast_sint_t)n >> 1) - block_start;
+    const fast_sint_t top = (fast_sint_t)n + (fast_sint_t)fs;
+
+    fast_sint_t left_cursor = m;
+    fast_sint_t right_cursor = top;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &left_cursor, &right_cursor, block_start,
+                                                          block_size);
+
+    memcpy(&SA[top - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
+}
+
+/*
+ * Renumbers and then compacts the LMS suffixes held in SA. Returns the counter produced by
+ * the renumbering step, which is also handed to the compaction step as f.
+ */
+static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const sa_sint_t renumbered =
+        libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+
+    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, renumbered, threads, thread_state);
+    return renumbered;
+}
+
+static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) { /*
+ * Forward pass over T[omp_block_start .. omp_block_start + omp_block_size - 1].
+ *
+ * Destination indices are consumed one at a time from the list starting at
+ * SA[n - m - 1 + l]. Every text symbol found negative has its mark removed (masked with
+ * SAINT_MAX) and its text index is stored at SA[next destination]; the symbol right after
+ * it is then skipped.
+ *
+ * One destination is always read ahead, including once after the last match.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+ sa_sint_t i, j;
+ fast_sint_t tmp = *SAnm++; /* destination for the next marked symbol */
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j;
+ i += 4) { /* the - 6 margin covers T[i + 3] after up to three extra increments of i inside the body */
+ prefetch(&T[i + prefetch_distance]);
+
+ sa_sint_t c0 = T[i + 0];
+ if (c0 < 0) {
+ T[i + 0] = c0 & SAINT_MAX;
+ SA[tmp] = i + 0;
+ i++; /* skip the symbol that follows a match */
+ tmp = *SAnm++;
+ }
+ sa_sint_t c1 = T[i + 1];
+ if (c1 < 0) {
+ T[i + 1] = c1 & SAINT_MAX;
+ SA[tmp] = i + 1;
+ i++;
+ tmp = *SAnm++;
+ }
+ sa_sint_t c2 = T[i + 2];
+ if (c2 < 0) {
+ T[i + 2] = c2 & SAINT_MAX;
+ SA[tmp] = i + 2;
+ i++;
+ tmp = *SAnm++;
+ }
+ sa_sint_t c3 = T[i + 3];
+ if (c3 < 0) {
+ T[i + 3] = c3 & SAINT_MAX;
+ SA[tmp] = i + 3;
+ i++;
+ tmp = *SAnm++;
+ }
+ }
+
+ for (j += 6; i < j; i += 1) { /* tail: j is reset to the block end */
+ sa_sint_t c = T[i];
+ if (c < 0) {
+ T[i] = c & SAINT_MAX;
+ SA[tmp] = i;
+ i++;
+ tmp = *SAnm++;
+ }
+ }
+}
+
+static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) { /*
+ * Forward pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1] that fills
+ * every slot currently equal to 0 with the next value taken, in order, from the list
+ * starting at SA[n - m - 1 + l]. Non-zero slots are left untouched.
+ *
+ * One value is always read ahead, including once after the last fill.
+ */
+ const fast_sint_t prefetch_distance = 32;
+
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+
+ fast_sint_t i, j;
+ sa_sint_t tmp = *SAnm++; /* value for the next empty slot */
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) { /* four slots per iteration */
+ prefetch(&SA[i + prefetch_distance]);
+
+ if (SA[i + 0] == 0) {
+ SA[i + 0] = tmp;
+ tmp = *SAnm++;
+ }
+ if (SA[i + 1] == 0) {
+ SA[i + 1] = tmp;
+ tmp = *SAnm++;
+ }
+ if (SA[i + 2] == 0) {
+ SA[i + 2] = tmp;
+ tmp = *SAnm++;
+ }
+ if (SA[i + 3] == 0) {
+ SA[i + 3] = tmp;
+ tmp = *SAnm++;
+ }
+ }
+
+ for (j += 3; i < j; i += 1) { /* tail: j is reset to the block end */
+ if (SA[i] == 0) {
+ SA[i] = tmp;
+ tmp = *SAnm++;
+ }
+ }
+}
+
+/*
+ * Single-block driver: runs libsais_merge_unique_lms_suffixes_32s over the whole text range
+ * [0, n) with a list offset of 0.
+ *
+ * threads and thread_state are accepted but not used.
+ */
+static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                      sa_sint_t m, sa_sint_t threads,
+                                                      LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = n - block_start;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, block_start, block_size);
+}
+
+/*
+ * Single-block driver: runs libsais_merge_nonunique_lms_suffixes_32s over SA[0 .. m - 1],
+ * passing f as the list offset.
+ *
+ * threads and thread_state are accepted but not used.
+ */
+static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
+                                                         sa_sint_t threads,
+                                                         LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    const fast_sint_t block_start = 0;
+    const fast_sint_t block_size = m - block_start;
+
+    (void)(threads);
+    (void)(thread_state);
+
+    libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, block_start, block_size);
+}
+
+/*
+ * Merges the compacted LMS suffixes back into SA in two steps: first the entries located
+ * through marks in T (unique merge), then the remaining zero slots of SA (non-unique merge,
+ * with f as its list offset).
+ */
+static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+                                                         sa_sint_t m, sa_sint_t f, sa_sint_t threads,
+                                                         LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    /* Step 1: entries identified by marked text symbols. */
+    libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+
+    /* Step 2: everything else, dropped into the slots still equal to 0. */
+    libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
+}
+
+/*
+ * Rebuilds the LMS suffix list in SA after the recursive step (2k bucket layout).
+ *
+ * f <= 0: nothing was compacted; the LMS suffixes are counted/gathered again over [0, n) and
+ *         reconstructed directly.
+ * f > 0:  the f saved entries at SA[n + fs - m ..] are moved to SA[n - m - 1 ..], the
+ *         remaining m - f suffixes are gathered and reconstructed, copied behind them, the
+ *         first m slots of SA are zeroed, and the two groups are merged.
+ */
+static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs,
+                                                                  sa_sint_t f, sa_sint_t * RESTRICT buckets,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (f <= 0) {
+        libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        return;
+    }
+
+    const size_t saved_bytes = (size_t)f * sizeof(sa_sint_t);
+    const size_t rest_bytes = ((size_t)m - (size_t)f) * sizeof(sa_sint_t);
+    const size_t head_bytes = (size_t)m * sizeof(sa_sint_t);
+
+    memmove(&SA[n - m - 1], &SA[n + fs - m], saved_bytes);
+
+    libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+    libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+    memcpy(&SA[n - m - 1 + f], &SA[0], rest_bytes);
+    memset(&SA[0], 0, head_bytes);
+
+    libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+}
+
+/*
+ * Rebuilds the LMS suffix list in SA after the recursive step (1k bucket layout, no bucket
+ * table involved).
+ *
+ * f <= 0: nothing was compacted; the LMS suffixes are gathered again and reconstructed
+ *         directly.
+ * f > 0:  the f saved entries at SA[n + fs - m ..] are moved to SA[n - m - 1 ..], the
+ *         remaining m - f suffixes are gathered and reconstructed, copied behind them, the
+ *         first m slots of SA are zeroed, and the two groups are merged.
+ */
+static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+                                                                  sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f,
+                                                                  sa_sint_t threads,
+                                                                  LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+    if (f <= 0) {
+        libsais_gather_lms_suffixes_32s(T, SA, n);
+        libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+        return;
+    }
+
+    const size_t saved_bytes = (size_t)f * sizeof(sa_sint_t);
+    const size_t rest_bytes = ((size_t)m - (size_t)f) * sizeof(sa_sint_t);
+    const size_t head_bytes = (size_t)m * sizeof(sa_sint_t);
+
+    memmove(&SA[n - m - 1], &SA[n + fs - m], saved_bytes);
+
+    libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
+    libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
+
+    memcpy(&SA[n - m - 1 + f], &SA[0], rest_bytes);
+    memset(&SA[0], 0, head_bytes);
+
+    libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
+}
+
+static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { /*
+ * Recursive driver for 32-bit integer text T of length n; k is passed to the helpers as the
+ * symbol count and fs is the number of spare sa_sint_t slots available after SA[n - 1].
+ * The result is left in SA (presumably the suffix array of T -- confirm against the public
+ * wrappers).
+ *
+ * The bucket layout is chosen from how many k-sized tables fit into the spare space:
+ * fs / k >= 6, >= 4, >= 2, or otherwise a single table that is heap-allocated when fs < k.
+ * In the first three cases the tables are carved out of the tail of SA itself.
+ *
+ * Each variant follows the same outline: gather and radix-sort the LMS suffixes, induce a
+ * partial order, renumber them, recurse on the reduced text when fewer names than LMS
+ * suffixes were produced, then place the LMS suffixes and run the final induction.
+ *
+ * Returns 0 on success, -2 when a bucket allocation fails or a recursive call fails.
+ */
+ fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); /* clamp so that n + fs cannot exceed SAINT_MAX */
+
+ if (k > 0 && fs / k >= 6) { /* spare space holds six k-sized tables */
+ sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; /* prefer a 1024-entry alignment if it still fits */
+ sa_sint_t * RESTRICT buckets =
+ (fs - alignment) / k >= 6
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
+ : &SA[n + fs - 6 * k];
+
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1) {
+ memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); /* clear everything below the last m slots */
+
+ sa_sint_t first_lms_suffix = SA[n - m];
+ sa_sint_t left_suffixes_count =
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
+
+ libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
+ libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
+
+ if (threads > 1 && n >= 65536) {
+ memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
+ }
+
+ libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
+ libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
+ threads, thread_state);
+
+ sa_sint_t names =
+ libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+ if (names < m) { /* fewer names than LMS suffixes: recurse on the reduced text */
+ sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+ thread_state) != 0) { /* reduced text of length m - f lives at SA + n + fs - m + f */
+ return -2;
+ }
+
+ libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+ thread_state);
+ } else {
+ libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+
+ libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
+ libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+ libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); /* note: the 4k helpers finish this path */
+ } else { /* m <= 1 */
+ SA[0] = SA[n - 1];
+
+ libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
+ libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
+ libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
+ }
+
+ return 0;
+ } else if (k > 0 && fs / k >= 4) { /* spare space holds four k-sized tables */
+ sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets =
+ (fs - alignment) / k >= 4
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
+ : &SA[n + fs - 4 * k];
+
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1) {
+ libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
+
+ libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+ libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
+
+ libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+ libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ sa_sint_t names =
+ libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
+ if (names < m) { /* fewer names than LMS suffixes: recurse on the reduced text */
+ sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+ thread_state) != 0) {
+ return -2;
+ }
+
+ libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+ thread_state);
+ } else {
+ libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ } else { /* m <= 1 */
+ SA[0] = SA[n - 1];
+ }
+
+ libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
+ libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
+ libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
+
+ return 0;
+ } else if (k > 0 && fs / k >= 2) { /* spare space holds two k-sized tables */
+ sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets =
+ (fs - alignment) / k >= 2
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
+ : &SA[n + fs - 2 * k];
+
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+ if (m > 1) {
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
+
+ libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
+ libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
+
+ libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
+ libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+ if (names < m) { /* fewer names than LMS suffixes: recurse on the reduced text */
+ sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+ thread_state) != 0) {
+ return -2;
+ }
+
+ libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
+ thread_state);
+ } else {
+ libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
+ }
+ } else { /* m <= 1 */
+ SA[0] = SA[n - 1];
+ }
+
+ libsais_initialize_buckets_end_32s_2k(k, buckets);
+ libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
+
+ libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
+ libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
+
+ return 0;
+ } else { /* single k-sized table, taken from the spare space if it fits, otherwise from the heap */
+ sa_sint_t * buffer =
+ fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
+
+ sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
+ sa_sint_t * RESTRICT buckets =
+ fs - alignment >= k
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t))
+ : fs >= k ? &SA[n + fs - k]
+ : buffer;
+
+ if (buckets == NULL) { /* only reachable through the heap path, i.e. the allocation failed */
+ return -2;
+ }
+
+ memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_end_32s_1k(k, buckets);
+
+ sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
+ if (m > 1) {
+ libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
+
+ sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
+ if (names < m) { /* fewer names than LMS suffixes: recurse on the reduced text */
+ if (buffer != NULL) { /* release the heap table for the duration of the recursion */
+ libsais_free_aligned(buffer);
+ buckets = NULL; /* marks that the table must be allocated again below */
+ }
+
+ sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
+
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
+ thread_state) != 0) { /* buffer, if any, was already freed above */
+ return -2;
+ }
+
+ libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
+
+ if (buckets == NULL) { /* re-acquire the heap table released before the recursion */
+ buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
+ }
+ if (buckets == NULL) {
+ return -2;
+ }
+ }
+
+ libsais_count_suffixes_32s(T, n, k, buckets);
+ libsais_initialize_buckets_end_32s_1k(k, buckets);
+ libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
+ }
+
+ libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
+ libsais_free_aligned(buffer); /* buffer is NULL when the table lived inside SA */
+
+ return 0;
+ }
+}
+
+static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
+ sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) { /*
+ * Top-level driver for 8-bit text T of length n, using the caller-supplied bucket storage.
+ *
+ * Steps: gather the LMS suffixes, radix-sort them, induce a partial order, renumber them,
+ * recurse through libsais_main_32s when fewer names than LMS suffixes were produced, place
+ * the LMS suffixes, and finish with libsais_induce_final_order_8u_omp, to which bwt, r and I
+ * are forwarded. freq is handed to libsais_initialize_buckets_start_and_end_8u.
+ *
+ * Returns -2 when the recursive call fails, otherwise the value returned by
+ * libsais_induce_final_order_8u_omp.
+ */
+ fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); /* clamp so that n + fs cannot exceed SAINT_MAX */
+
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
+
+ libsais_initialize_buckets_start_and_end_8u(buckets, freq);
+
+ if (m > 0) {
+ sa_sint_t first_lms_suffix = SA[n - m];
+ sa_sint_t left_suffixes_count =
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
+
+ if (threads > 1 && n >= 65536) {
+ memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); /* clear everything below the last m slots */
+ }
+ libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
+ if (threads > 1 && n >= 65536) {
+ memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); /* clear the last m slots */
+ }
+
+ libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
+ libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads,
+ thread_state);
+
+ sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
+ if (names < m) { /* fewer names than LMS suffixes: recurse on the reduced text stored at SA + n + fs - m */
+ if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) {
+ return -2;
+ }
+
+ libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state);
+ libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
+ }
+
+ libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets);
+ } else { /* no LMS suffixes: start the final induction from an all-zero SA */
+ memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
+ }
+
+ return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
+}
+
+/*
+ * Allocates the temporary bucket table (and a thread state when threads > 1),
+ * runs libsais_main_8u() and releases both again.
+ *
+ * Returns the value of libsais_main_8u(), or -2 when a required allocation
+ * came back NULL.
+ */
+static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
+                              sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = NULL;
+    if (threads > 1) {
+        thread_state = libsais_alloc_thread_state(threads);
+    }
+
+    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+
+    sa_sint_t status = -2;
+    if (buckets != NULL && (thread_state != NULL || threads == 1)) {
+        status = libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state);
+    }
+
+    libsais_free_aligned(buckets);
+    libsais_free_thread_state(thread_state);
+
+    return status;
+}
+
+/*
+ * Integer-alphabet counterpart of libsais_main(): obtains a thread state when
+ * threads > 1, runs libsais_main_32s() and frees the thread state.
+ *
+ * Returns the value of libsais_main_32s(), or -2 when the thread state could
+ * not be obtained and threads != 1.
+ */
+static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
+    LIBSAIS_THREAD_STATE * RESTRICT thread_state = NULL;
+    if (threads > 1) {
+        thread_state = libsais_alloc_thread_state(threads);
+    }
+
+    sa_sint_t status = -2;
+    if (thread_state != NULL || threads == 1) {
+        status = libsais_main_32s(T, SA, n, k, fs, threads, thread_state);
+    }
+
+    libsais_free_thread_state(thread_state);
+
+    return status;
+}
+
+/*
+ * Runs libsais_main_8u() with the buckets and thread state stored in ctx.
+ *
+ * Returns -2 without doing any work when ctx is NULL, has no bucket table, or
+ * has no thread state while ctx->threads is not 1.
+ */
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt,
+                                  sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) {
+    if (ctx == NULL || ctx->buckets == NULL) {
+        return -2;
+    }
+    if (ctx->thread_state == NULL && ctx->threads != 1) {
+        return -2;
+    }
+
+    return libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state);
+}
+
+/*
+ * Narrows the first n entries of A to bytes and stores them in U.
+ * Entries are handled eight at a time (with a prefetch 32 entries ahead)
+ * while at least eight remain, then one at a time. Does nothing for n <= 0.
+ */
+static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) {
+    const fast_sint_t prefetch_distance = 32;
+    const fast_sint_t total = (fast_sint_t)n;
+    const fast_sint_t batched_end = total - 7;
+
+    fast_sint_t pos = 0;
+    while (pos < batched_end) {
+        fast_sint_t lane;
+
+        prefetch(&A[pos + prefetch_distance]);
+
+        for (lane = 0; lane < 8; ++lane) {
+            U[pos + lane] = (u8)A[pos + lane];
+        }
+        pos += 8;
+    }
+
+    while (pos < total) {
+        U[pos] = (u8)A[pos];
+        pos += 1;
+    }
+}
+/* Returns the result of libsais_create_ctx_main(1) as an opaque pointer. */
+static void * libsais_create_ctx(void) {
+    void * const handle = (void *)libsais_create_ctx_main(1);
+    return handle;
+}
+
+/* Hands an opaque context pointer back to libsais_free_ctx_main(). */
+static void libsais_free_ctx(void * ctx) {
+    LIBSAIS_CONTEXT * const typed_ctx = (LIBSAIS_CONTEXT *)ctx;
+    libsais_free_ctx_main(typed_ctx);
+}
+
+/*
+ * Entry point for 8-bit input, single-threaded (libsais_main is called with
+ * bwt = 0 and threads = 1).
+ *
+ * Returns -1 for a NULL T or SA or a negative n or fs. Inputs shorter than two
+ * symbols are answered directly: freq (if given) is zeroed, and for n == 1
+ * SA[0] is set to 0 and the single symbol is counted; the result is then 0.
+ * Otherwise returns the value of libsais_main().
+ */
+static s32 libsais(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
+    if (T == NULL || SA == NULL || n < 0 || fs < 0) {
+        return -1;
+    }
+
+    if (n >= 2) {
+        return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
+    }
+
+    if (freq != NULL) {
+        memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+    }
+    if (n == 1) {
+        SA[0] = 0;
+        if (freq != NULL) {
+            freq[T[0]] += 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Entry point for integer input, single-threaded (libsais_main_int is called
+ * with threads = 1).
+ *
+ * Returns -1 for a NULL T or SA or a negative n or fs; returns 0 for n < 2
+ * (writing SA[0] = 0 when n == 1); otherwise returns the value of
+ * libsais_main_int().
+ */
+static s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs) {
+    if (T == NULL || SA == NULL || n < 0 || fs < 0) {
+        return -1;
+    }
+
+    if (n >= 2) {
+        return libsais_main_int(T, SA, n, k, fs, 1);
+    }
+
+    if (n == 1) {
+        SA[0] = 0;
+    }
+    return 0;
+}
+
+/*
+ * Same contract as libsais(), but the work is done by libsais_main_ctx() using
+ * a caller-supplied context.
+ *
+ * Returns -1 for a NULL ctx, T or SA or a negative n or fs. For n < 2 the
+ * answer is produced directly (freq zeroed if given; for n == 1, SA[0] = 0 and
+ * the symbol counted) and 0 is returned.
+ */
+static s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
+    if (ctx == NULL || T == NULL || SA == NULL || n < 0 || fs < 0) {
+        return -1;
+    }
+
+    if (n >= 2) {
+        return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
+    }
+
+    if (freq != NULL) {
+        memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+    }
+    if (n == 1) {
+        SA[0] = 0;
+        if (freq != NULL) {
+            freq[T[0]] += 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Writes the transform of T[0..n) into U, using A as working storage, and
+ * returns the index reported by libsais_main() plus one.
+ *
+ * Returns -1 for a NULL T, U or A or a negative n or fs. For n <= 1 the input
+ * is copied through (freq zeroed/updated if given) and n is returned. A
+ * negative result from libsais_main() is returned unchanged and U is left
+ * untouched.
+ */
+static s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+    if (T == NULL || U == NULL || A == NULL || n < 0 || fs < 0) {
+        return -1;
+    }
+
+    if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]] += 1;
+            }
+        }
+        return n;
+    }
+
+    const sa_sint_t raw_index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
+    if (raw_index < 0) {
+        return raw_index;
+    }
+
+    const sa_sint_t primary = raw_index + 1;
+
+    /* U[0] is the last input symbol; the rest is taken from A, skipping slot primary - 1. */
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u(U + 1, A, primary - 1);
+    libsais_bwt_copy_8u(U + primary, A + primary, n - primary);
+
+    return primary;
+}
+
+/*
+ * Variant of libsais_bwt() that also fills the auxiliary index array I
+ * (libsais_main is called with sampling rate r and I).
+ *
+ * Returns -1 for a NULL T, U, A or I, a negative n or fs, or an r that is
+ * smaller than 2 or not a power of two. For n <= 1 the input is copied
+ * through, I[0] is set to n and 0 is returned. Returns -2 when libsais_main()
+ * reports a non-zero status, 0 otherwise.
+ */
+static s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
+    const int bad_pointers = (T == NULL) || (U == NULL) || (A == NULL) || (I == NULL);
+    const int bad_sizes = (n < 0) || (fs < 0);
+    const int bad_rate = (r < 2) || ((r & (r - 1)) != 0);
+
+    if (bad_pointers || bad_sizes || bad_rate) {
+        return -1;
+    }
+
+    if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]] += 1;
+            }
+        }
+        I[0] = n;
+        return 0;
+    }
+
+    if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) {
+        return -2;
+    }
+
+    const s32 primary = I[0];
+
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u(U + 1, A, primary - 1);
+    libsais_bwt_copy_8u(U + primary, A + primary, n - primary);
+
+    return 0;
+}
+
+/*
+ * Same contract as libsais_bwt(), but the work is done by libsais_main_ctx()
+ * using a caller-supplied context.
+ *
+ * Returns -1 for a NULL ctx, T, U or A or a negative n or fs; n itself for
+ * n <= 1; otherwise the index from libsais_main_ctx() plus one, or its
+ * negative result unchanged.
+ */
+static s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+    if (ctx == NULL || T == NULL || U == NULL || A == NULL || n < 0 || fs < 0) {
+        return -1;
+    }
+
+    if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]] += 1;
+            }
+        }
+        return n;
+    }
+
+    const sa_sint_t raw_index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
+    if (raw_index < 0) {
+        return raw_index;
+    }
+
+    const sa_sint_t primary = raw_index + 1;
+
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u(U + 1, A, primary - 1);
+    libsais_bwt_copy_8u(U + primary, A + primary, n - primary);
+
+    return primary;
+}
+
+/*
+ * Same contract as libsais_bwt_aux(), but the work is done by
+ * libsais_main_ctx() using a caller-supplied context.
+ *
+ * Returns -1 on invalid arguments (NULL ctx/T/U/A/I, negative n or fs, r < 2
+ * or r not a power of two), -2 when libsais_main_ctx() reports a non-zero
+ * status, and 0 otherwise. For n <= 1, I[0] is set to n.
+ */
+static s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r,
+                               s32 * I) {
+    const int bad_pointers = (ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (I == NULL);
+    const int bad_sizes = (n < 0) || (fs < 0);
+    const int bad_rate = (r < 2) || ((r & (r - 1)) != 0);
+
+    if (bad_pointers || bad_sizes || bad_rate) {
+        return -1;
+    }
+
+    if (n <= 1) {
+        if (freq != NULL) {
+            memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
+        }
+        if (n == 1) {
+            U[0] = T[0];
+            if (freq != NULL) {
+                freq[T[0]] += 1;
+            }
+        }
+        I[0] = n;
+        return 0;
+    }
+
+    if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
+        return -2;
+    }
+
+    const s32 primary = I[0];
+
+    U[0] = T[n - 1];
+    libsais_bwt_copy_8u(U + 1, A, primary - 1);
+    libsais_bwt_copy_8u(U + primary, A + primary, n - primary);
+    return 0;
+}
+/*
+ * Allocates an inverse-transform context together with its bucket2 and
+ * fastbits tables, plus a per-thread bucket area when threads > 1.
+ *
+ * Returns the populated context, or NULL after releasing every partial
+ * allocation when any required piece could not be obtained.
+ */
+static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) {
+    LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
+        (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+    sa_uint_t * RESTRICT bucket2 =
+        (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
+
+    sa_uint_t * RESTRICT buckets = NULL;
+    if (threads > 1) {
+        buckets = (sa_uint_t *)libsais_alloc_aligned(
+            (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096);
+    }
+
+    /* The per-thread area is optional only in the single-thread configuration. */
+    const int per_thread_ok = (buckets != NULL) || (threads == 1);
+
+    if (ctx == NULL || bucket2 == NULL || fastbits == NULL || !per_thread_ok) {
+        libsais_free_aligned(buckets);
+        libsais_free_aligned(fastbits);
+        libsais_free_aligned(bucket2);
+        libsais_free_aligned(ctx);
+        return NULL;
+    }
+
+    ctx->bucket2 = bucket2;
+    ctx->fastbits = fastbits;
+    ctx->buckets = buckets;
+    ctx->threads = threads;
+
+    return ctx;
+}
+
+/* Releases a context's three tables and then the context itself; a NULL ctx is ignored. */
+static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    libsais_free_aligned(ctx->buckets);
+    libsais_free_aligned(ctx->fastbits);
+    libsais_free_aligned(ctx->bucket2);
+    libsais_free_aligned(ctx);
+}
+
+/*
+ * Adds the byte frequencies of T[0..n) to count (count is accumulated into,
+ * not reset). A non-positive n leaves count untouched.
+ *
+ * For n >= 1024 the bulk of the input is counted through four private
+ * sub-tables, one per byte lane of a 32-bit word, which are summed into count
+ * afterwards: bytes up to the next 64-byte boundary are counted singly, then
+ * the data is read as 32-bit words, always loading the next pair of words
+ * before counting the current pair. Whatever remains is counted byte by byte.
+ */
+static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) {
+    const fast_sint_t prefetch_distance = 256;
+
+    const u8 * RESTRICT cursor = T;
+
+    if (n >= 1024) {
+        sa_uint_t partial[4 * (ALPHABET_SIZE + 16)];
+
+        memset(partial, 0, sizeof(partial));
+
+        sa_uint_t * RESTRICT lane0 = partial + 0 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT lane1 = partial + 1 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT lane2 = partial + 2 * (ALPHABET_SIZE + 16);
+        sa_uint_t * RESTRICT lane3 = partial + 3 * (ALPHABET_SIZE + 16);
+
+        /* First 64-byte boundary at or after T, and the boundary at or below T + n - 8. */
+        const u8 * const aligned_start = (const u8 *)((ptrdiff_t)(T + 63) & (-64));
+        const u8 * const bulk_limit = (const u8 *)((ptrdiff_t)(T + n - 8) & (-64));
+
+        while (cursor < aligned_start) {
+            lane0[cursor[0]]++;
+            cursor += 1;
+        }
+
+        fast_uint_t cur_a = ((const u32 *)(const void *)cursor)[0];
+        fast_uint_t cur_b = ((const u32 *)(const void *)cursor)[1];
+
+        for (; cursor < bulk_limit; cursor += 64) {
+            const u32 * const words = (const u32 *)(const void *)cursor;
+            fast_sint_t w;
+
+            prefetch(&cursor[prefetch_distance]);
+
+            /* Eight steps per 64 bytes: load words w and w + 1, then count the pair loaded before. */
+            for (w = 2; w <= 16; w += 2) {
+                const fast_uint_t next_a = words[w];
+                const fast_uint_t next_b = words[w + 1];
+
+                lane0[(u8)cur_a]++;
+                lane1[(u8)(cur_a >> 8)]++;
+                lane2[(u8)(cur_a >> 16)]++;
+                lane3[cur_a >> 24]++;
+
+                lane0[(u8)cur_b]++;
+                lane1[(u8)(cur_b >> 8)]++;
+                lane2[(u8)(cur_b >> 16)]++;
+                lane3[cur_b >> 24]++;
+
+                cur_a = next_a;
+                cur_b = next_b;
+            }
+        }
+
+        /* One pair of words is still pending; it covers the 8 bytes at cursor. */
+        lane0[(u8)cur_a]++;
+        lane1[(u8)(cur_a >> 8)]++;
+        lane2[(u8)(cur_a >> 16)]++;
+        lane3[cur_a >> 24]++;
+
+        lane0[(u8)cur_b]++;
+        lane1[(u8)(cur_b >> 8)]++;
+        lane2[(u8)(cur_b >> 16)]++;
+        lane3[cur_b >> 24]++;
+
+        cursor += 8;
+
+        fast_uint_t sym;
+        for (sym = 0; sym < ALPHABET_SIZE; sym++) {
+            count[sym] += lane0[sym] + lane1[sym] + lane2[sym] + lane3[sym];
+        }
+    }
+
+    while (cursor < T + n) {
+        count[cursor[0]]++;
+        cursor += 1;
+    }
+}
+
+/*
+ * Transposes bucket2 in place, treating it as a square table with 256-entry
+ * rows (entry (a, b) lives at (a << 8) + b). The table is processed in 16x16
+ * tiles: each diagonal tile is transposed within itself, and each tile below
+ * the diagonal is exchanged with its mirror above it.
+ */
+static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
+    fast_uint_t tile, other, row, col, k;
+
+    for (tile = 0; tile != ALPHABET_SIZE; tile += 16) {
+        const fast_uint_t tile_end = tile + 16;
+
+        /* Diagonal tile: swap each entry above the diagonal with its mirror. */
+        for (row = tile; row < tile_end; ++row) {
+            for (col = row + 1; col < tile_end; ++col) {
+                const sa_uint_t held = bucket2[(col << 8) + row];
+                bucket2[(col << 8) + row] = bucket2[(row << 8) + col];
+                bucket2[(row << 8) + col] = held;
+            }
+        }
+
+        /* Off-diagonal tiles: exchange 16 entries of a column with 16 entries of a row. */
+        for (other = tile_end; other != ALPHABET_SIZE; other += 16) {
+            for (row = tile; row < tile_end; ++row) {
+                sa_uint_t * const column_run = &bucket2[(other << 8) + row];
+                sa_uint_t * const row_run = &bucket2[(row << 8) + other];
+
+                for (k = 0; k < 16; ++k) {
+                    const sa_uint_t held = column_run[k * 256];
+                    column_run[k * 256] = row_run[k];
+                    row_run[k] = held;
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Turns bucket1 from per-symbol counts into running start offsets (beginning
+ * at 1) and, for every symbol that occurs, accumulates into that symbol's
+ * 256-entry row of bucket2 the byte histogram of the part of T covered by the
+ * symbol's range. The range is split around index: positions below index are
+ * read directly, positions above it are read shifted down by one. bucket2 is
+ * transposed at the end.
+ */
+static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT bucket1,
+                                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
+    fast_uint_t running = 1;
+    fast_uint_t sym;
+
+    for (sym = 0; sym < ALPHABET_SIZE; ++sym) {
+        const fast_uint_t begin = running;
+
+        running += bucket1[sym];
+        bucket1[sym] = (sa_uint_t)begin;
+
+        if (running == begin) {
+            continue;
+        }
+
+        sa_uint_t * RESTRICT row = &bucket2[sym << 8];
+
+        /* Part of [begin, running) that lies below index (the length may come out negative). */
+        const fast_uint_t upper = (running < index) ? running : index;
+        libsais_unbwt_compute_histogram(&T[begin], (fast_sint_t)(upper - begin), row);
+
+        /* Part of [begin, running) that lies above index, read one position lower. */
+        const fast_uint_t lower = (begin > index + 1) ? begin : index + 1;
+        libsais_unbwt_compute_histogram(&T[lower - 1], (fast_sint_t)(running - lower), row);
+    }
+
+    libsais_unbwt_transpose_bucket2(bucket2);
+}
+
+/*
+ * Replaces every bucket2 entry by the running total that precedes it (totals
+ * start at 1, and one extra position is inserted just before the entries of
+ * first symbol lastc). For every entry with a non-zero count, fastbits is
+ * filled so that each not-yet-assigned slot up to (end - 1) >> shift holds
+ * the index of that entry.
+ */
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, fast_uint_t lastc,
+                                             fast_uint_t shift) {
+    fast_uint_t running = 1;
+    fast_uint_t slot = 0;
+    fast_uint_t pair = 0;
+    fast_uint_t first;
+
+    for (first = 0; first < ALPHABET_SIZE; ++first) {
+        if (first == lastc) {
+            running += 1;
+        }
+
+        const fast_uint_t row_end = pair + ALPHABET_SIZE;
+        for (; pair < row_end; ++pair) {
+            const fast_uint_t start = running;
+
+            running += bucket2[pair];
+            bucket2[pair] = (sa_uint_t)start;
+
+            if (running == start) {
+                continue;
+            }
+
+            const fast_uint_t last_slot = (running - 1) >> shift;
+            while (slot <= last_slot) {
+                fastbits[slot] = (u16)pair;
+                ++slot;
+            }
+        }
+    }
+}
+
+/*
+ * Fills P for the positions in [omp_block_start, omp_block_end], in two
+ * passes split at index: the first pass visits positions i < index and reads
+ * T[i]; the second visits positions i > index and reads T[i - 1].
+ *
+ * For each visited position the symbol's bucket1 counter is taken and
+ * advanced; unless that counter value equals index, a second symbol is read
+ * from T at the counter value (one lower when the counter is above index),
+ * and the position is stored in P at the slot given by the matching bucket2
+ * counter, which is advanced as well.
+ */
+static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1,
+                                          sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start,
+                                          fast_sint_t omp_block_end) {
+    /* Shifting a fast_sint_t right by this amount yields 0 for non-negative values and -1 for negative ones. */
+    const unsigned sign_shift = (unsigned)((sizeof(fast_sint_t) * 8) - 1);
+    const fast_sint_t split = (fast_sint_t)index;
+    fast_sint_t pos;
+
+    /* Pass 1: positions strictly below index. */
+    const fast_sint_t first_end = (omp_block_end < split) ? omp_block_end : split;
+    for (pos = omp_block_start; pos < first_end; ++pos) {
+        const fast_uint_t sym = T[pos];
+        const fast_uint_t rank = bucket1[sym]++;
+        const fast_sint_t delta = (fast_sint_t)(index - rank);
+
+        if (delta == 0) {
+            continue;
+        }
+
+        const fast_uint_t pair = (((fast_uint_t)T[rank + (fast_uint_t)(delta >> sign_shift)]) << 8) + sym;
+        P[bucket2[pair]++] = (sa_uint_t)pos;
+    }
+
+    /* Pass 2: positions strictly above index, reading the symbol one slot lower. */
+    const fast_sint_t second_begin = ((omp_block_start > split) ? omp_block_start : split) + 1;
+    for (pos = second_begin; pos <= omp_block_end; ++pos) {
+        const fast_uint_t sym = T[pos - 1];
+        const fast_uint_t rank = bucket1[sym]++;
+        const fast_sint_t delta = (fast_sint_t)(index - rank);
+
+        if (delta == 0) {
+            continue;
+        }
+
+        const fast_uint_t pair = (((fast_uint_t)T[rank + (fast_uint_t)(delta >> sign_shift)]) << 8) + sym;
+        P[bucket2[pair]++] = (sa_uint_t)pos;
+    }
+}
+
+/*
+ * Single-threaded setup for the inverse transform: obtains the symbol counts
+ * (copied from freq when given, otherwise computed from T), then builds
+ * bucket2, fastbits and P through the bigram-histogram, fastbits and biPSI
+ * helpers, using I[0] as the index.
+ */
+static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                      const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+                                      sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits) {
+    sa_uint_t bucket1[ALPHABET_SIZE];
+
+    const fast_uint_t index = I[0];
+    const fast_uint_t lastc = T[0];
+
+    /* Smallest shift for which n >> shift does not exceed 1 << UNBWT_FASTBITS. */
+    fast_uint_t shift;
+    for (shift = 0; (n >> shift) > (1 << UNBWT_FASTBITS); ++shift) {
+    }
+
+    if (freq == NULL) {
+        memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
+        libsais_unbwt_compute_histogram(T, n, bucket1);
+    } else {
+        memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
+    }
+
+    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+    libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, index);
+
+    libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
+    libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
+}
+/*
+ * Decodes k 16-bit units into U from a single stream. Per unit: take the
+ * starting code from fastbits for the current position, advance it while
+ * bucket2[code] <= position, follow P to the next position and store
+ * bswap16(code). The final position is written back through i0.
+ */
+static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
+    u16 * RESTRICT out = (u16 *)(void *)U;
+    fast_uint_t at = *i0;
+    fast_uint_t step;
+
+    for (step = 0; step != k; ++step) {
+        u16 code = fastbits[at >> shift];
+        while (bucket2[code] <= at) {
+            code++;
+        }
+        at = P[at];
+        out[step] = bswap16(code);
+    }
+
+    *i0 = at;
+}
+
+/*
+ * Decodes k 16-bit units into each of 2 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0, 1, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0 and i1.
+ */
+static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t k) {
+    enum { STREAMS = 2 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 3 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..2, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i2.
+ */
+static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) {
+    enum { STREAMS = 3 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 4 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..3, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i3.
+ */
+static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) {
+    enum { STREAMS = 4 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2, i3 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 5 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..4, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i4.
+ */
+static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t k) {
+    enum { STREAMS = 5 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2, i3, i4 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 6 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..5, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i5.
+ */
+static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t k) {
+    enum { STREAMS = 6 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2, i3, i4, i5 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 7 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..6, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i6.
+ */
+static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
+    enum { STREAMS = 7 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2, i3, i4, i5, i6 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes k 16-bit units into each of 8 output streams; stream s starts
+ * s * r bytes after U. Per unit the streams are advanced in order 0..7, each
+ * with the same lookup/scan/follow step as libsais_unbwt_decode_1(). The
+ * final positions are written back through i0..i7.
+ */
+static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
+                                   u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+                                   fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
+                                   fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) {
+    enum { STREAMS = 8 };
+
+    fast_uint_t * const cursor[STREAMS] = { i0, i1, i2, i3, i4, i5, i6, i7 };
+    u16 * out[STREAMS];
+    fast_uint_t pos[STREAMS];
+    fast_uint_t s, step;
+
+    out[0] = (u16 *)(void *)U;
+    for (s = 1; s < STREAMS; ++s) {
+        out[s] = (u16 *)(void *)(((u8 *)out[s - 1]) + r);
+    }
+    for (s = 0; s < STREAMS; ++s) {
+        pos[s] = *cursor[s];
+    }
+
+    for (step = 0; step != k; ++step) {
+        for (s = 0; s < STREAMS; ++s) {
+            const fast_uint_t at = pos[s];
+            u16 code = fastbits[at >> shift];
+            while (bucket2[code] <= at) {
+                code++;
+            }
+            pos[s] = P[at];
+            out[s][step] = bswap16(code);
+        }
+    }
+
+    for (s = 0; s < STREAMS; ++s) {
+        *cursor[s] = pos[s];
+    }
+}
+
+/*
+ * Decodes `blocks` consecutive blocks of r bytes each (the last one holding
+ * `reminder` bytes) into U, starting each block from its entry in I.
+ *
+ * While more than eight blocks remain, eight full blocks are decoded together.
+ * The remaining 1..8 blocks are decoded together for reminder >> 1 units;
+ * after that every block except the last continues for the units it still
+ * lacks, i.e. (r >> 1) - (reminder >> 1).
+ */
+static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r,
+                                 const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+                                 fast_sint_t blocks, fast_uint_t reminder) {
+    fast_uint_t shift;
+    for (shift = 0; (n >> shift) > (1 << UNBWT_FASTBITS); ++shift) {
+    }
+
+    const fast_uint_t stride = (fast_uint_t)r;
+    const fast_uint_t full_units = stride >> 1;
+    fast_uint_t offset = 0;
+    fast_uint_t cur[8];
+    fast_uint_t s;
+
+    for (; blocks > 8; blocks -= 8, I += 8, offset += 8 * stride) {
+        for (s = 0; s < 8; ++s) {
+            cur[s] = I[s];
+        }
+        libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                               &cur[4], &cur[5], &cur[6], &cur[7], full_units);
+    }
+
+    /* Any count outside 1..7 is handled as a group of eight. */
+    const fast_uint_t lanes = (blocks >= 1 && blocks <= 7) ? (fast_uint_t)blocks : 8;
+    const fast_uint_t short_units = reminder >> 1;
+    const fast_uint_t extra_units = full_units - short_units;
+    u8 * const head = U + offset;
+    u8 * const tail = head + 2 * short_units;
+
+    for (s = 0; s < lanes; ++s) {
+        cur[s] = I[s];
+    }
+
+    switch (lanes) {
+        case 1:
+            libsais_unbwt_decode_1(head, P, bucket2, fastbits, shift, &cur[0], short_units);
+            break;
+        case 2:
+            libsais_unbwt_decode_2(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], short_units);
+            libsais_unbwt_decode_1(tail, P, bucket2, fastbits, shift, &cur[0], extra_units);
+            break;
+        case 3:
+            libsais_unbwt_decode_3(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], short_units);
+            libsais_unbwt_decode_2(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], extra_units);
+            break;
+        case 4:
+            libsais_unbwt_decode_4(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   short_units);
+            libsais_unbwt_decode_3(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], extra_units);
+            break;
+        case 5:
+            libsais_unbwt_decode_5(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], short_units);
+            libsais_unbwt_decode_4(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   extra_units);
+            break;
+        case 6:
+            libsais_unbwt_decode_6(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], &cur[5], short_units);
+            libsais_unbwt_decode_5(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], extra_units);
+            break;
+        case 7:
+            libsais_unbwt_decode_7(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], &cur[5], &cur[6], short_units);
+            libsais_unbwt_decode_6(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], &cur[5], extra_units);
+            break;
+        default:
+            libsais_unbwt_decode_8(head, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], &cur[5], &cur[6], &cur[7], short_units);
+            libsais_unbwt_decode_7(tail, P, bucket2, fastbits, shift, stride, &cur[0], &cur[1], &cur[2], &cur[3],
+                                   &cur[4], &cur[5], &cur[6], extra_units);
+            break;
+    }
+}
+
+/*
+ * Splits the n output bytes into blocks of r bytes (the last block holds the
+ * remainder), decodes all of them with libsais_unbwt_decode(), and finally
+ * stores T[0] as the last output byte. The threads argument is unused: this
+ * version always decodes every block in one call.
+ */
+static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                     sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
+                                     u16 * RESTRICT fastbits, sa_sint_t threads) {
+    const fast_uint_t first_symbol = T[0];
+    const fast_sint_t block_count = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
+    const fast_uint_t last_block_len = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)block_count - 1));
+
+    (void)(threads);
+
+    /* With a single worker the partition is the whole range: start 0, all blocks, short last block. */
+    libsais_unbwt_decode(U, P, n, r, I, bucket2, fastbits, block_count, last_block_len);
+
+    U[n - 1] = (u8)first_symbol;
+}
+
+/*
+ * Runs the two phases of the inverse transform with caller-provided tables:
+ * libsais_unbwt_init_single() followed by libsais_unbwt_decode_omp().
+ * The buckets argument is not used here. Always returns 0.
+ */
+static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
+                                    const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I,
+                                    sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
+                                    sa_sint_t threads) {
+    (void)(buckets);
+
+    libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
+    libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads);
+
+    return 0;
+}
+
+/* Allocates the scratch tables for one inverse BWT, runs libsais_unbwt_core on them and frees them again.
+   Returns the core's result, or -2 if a table that the run needs could not be allocated. */
+static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
+                                    sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) {
+    fast_uint_t shift = 0;
+    while ((n >> shift) > (1 << UNBWT_FASTBITS)) shift++;
+    const size_t fastbits_size = ((size_t)1 + (size_t)(n >> shift)) * sizeof(u16);
+
+    sa_uint_t * RESTRICT bucket2 =
+        (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+    u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(fastbits_size, 4096);
+    sa_uint_t * RESTRICT buckets =
+        threads > 1 && n >= 262144
+            ? (sa_uint_t *)libsais_alloc_aligned(
+                  (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096)
+            : NULL;
+    /* The allocation may have failed: clear fastbits only once it is known to be non-NULL. */
+    if (fastbits != NULL) memset(fastbits, 0, fastbits_size);
+
+    sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+                          ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+                          : -2;
+    libsais_free_aligned(buckets);
+    libsais_free_aligned(fastbits);
+    libsais_free_aligned(bucket2);
+    return index;
+}
+
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U, sa_uint_t * P,
+                                        sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) {
+    if (ctx == NULL || ctx->bucket2 == NULL || ctx->fastbits == NULL) return -2; /* no usable tables */
+    if (ctx->buckets == NULL && ctx->threads != 1) return -2; /* buckets may be absent for one thread only */
+    return libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets,
+                              (sa_sint_t)ctx->threads);
+}
+
+static void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); } /* opaque handle */
+
+static void libsais_unbwt_free_ctx(void * ctx) { libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); } /* casts the opaque handle back */
+
+/* Inverse BWT with auxiliary indexes. Checks the arguments and the index samples in I (one per r symbols), then
+   hands over to libsais_unbwt_main with A as the work array. Returns 0, -1 on invalid input, or -2 from main. */
+static s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I) {
+    /* r has to be n itself or a power of two that is at least 2. */
+    const int bad_rate = (r != n) && ((r < 2) || ((r & (r - 1)) != 0));
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (I == NULL)) return -1;
+    if ((n < 0) || bad_rate) return -1;
+
+    if (n <= 1) {
+        /* Degenerate input: the only index must equal n, and a lone symbol is copied straight across. */
+        if (I[0] != n) return -1;
+        if (n == 1) U[0] = T[0];
+        return 0;
+    }
+
+    /* Every one of the (n - 1) / r + 1 samples has to lie in 1..n. */
+    const fast_sint_t last_sample = (n - 1) / r;
+    for (fast_sint_t t = 0; t <= last_sample; ++t) {
+        if (I[t] <= 0 || I[t] > n) return -1;
+    }
+
+    return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
+}
+
+/* Context flavour of the inverse BWT with auxiliary indexes: same argument and index checks, but the decode
+   runs on the tables held by ctx (via libsais_unbwt_main_ctx). Returns 0, -1 on invalid input, or -2. */
+static s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+                                 const s32 * I) {
+    /* r has to be n itself or a power of two that is at least 2. */
+    const int bad_rate = (r != n) && ((r < 2) || ((r & (r - 1)) != 0));
+    if ((T == NULL) || (U == NULL) || (A == NULL) || (I == NULL)) return -1;
+    if ((n < 0) || bad_rate) return -1;
+
+    if (n <= 1) {
+        /* Degenerate input: the only index must equal n, and a lone symbol is copied straight across. */
+        if (I[0] != n) return -1;
+        if (n == 1) U[0] = T[0];
+        return 0;
+    }
+
+    /* Every one of the (n - 1) / r + 1 samples has to lie in 1..n. */
+    const fast_sint_t last_sample = (n - 1) / r;
+    for (fast_sint_t t = 0; t <= last_sample; ++t) {
+        if (I[t] <= 0 || I[t] > n) return -1;
+    }
+
+    return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r,
+                                  (const sa_uint_t *)I);
+}
+
+static s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
+ return libsais_unbwt_aux(T, U, A, n, freq, n, &i); /* r == n: the one index i serves as the whole I array */
+}
+
+static s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
+ return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); /* r == n: the one index i serves as the whole I array */
+}
+
+static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+ /* For each i in the block: PLCP[SA[i]] = SA[i - 1]; the first entry of a block starting at 0 gets n instead. */
+ fast_sint_t i, j;
+ sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
+ prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
+
+ PLCP[SA[i + 0]] = k;
+ k = SA[i + 0];
+ PLCP[SA[i + 1]] = k;
+ k = SA[i + 1];
+
+ prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
+ prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);
+
+ PLCP[SA[i + 2]] = k;
+ k = SA[i + 2];
+ PLCP[SA[i + 3]] = k;
+ k = SA[i + 3];
+ }
+ /* j is moved back to the end of the block; the leftover entries are handled one at a time, no prefetch. */
+ for (j += prefetch_distance + 3; i < j; i += 1) {
+ PLCP[SA[i]] = k;
+ k = SA[i];
+ }
+}
+
+/* Single-threaded driver for libsais_compute_phi. With one worker the partitioning degenerates: worker 0
+   starts at index 0 and, being the last worker, covers everything up to n. */
+static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+                                    sa_sint_t threads) {
+    (void)(threads);
+
+    const fast_sint_t worker = 0, workers = 1;
+
+    /* The stride is rounded down to a multiple of 16; only workers before the last one use it as their size. */
+    const fast_sint_t stride = (n / workers) & (-16);
+    const fast_sint_t first = worker * stride;
+    const fast_sint_t count = (worker < workers - 1) ? stride : n - first;
+
+    libsais_compute_phi(SA, PLCP, n, first, count);
+}
+
+static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+ /* Overwrites each PLCP[i] = k (a position in T) with the match length of T[i..] and T[k..]; l carries over. */
+ fast_sint_t i, j, l = 0;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) {
+ prefetch(&T[PLCP[i + prefetch_distance] + l]);
+
+ fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
+ while (l < m && T[i + l] == T[k + l]) {
+ l++;
+ }
+
+ PLCP[i] = (sa_sint_t)l;
+ l -= (l != 0);
+ }
+ /* Same step for the last prefetch_distance entries, without looking ahead past the end of the block. */
+ for (j += prefetch_distance; i < j; i += 1) {
+ fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
+ while (l < m && T[i + l] == T[k + l]) {
+ l++;
+ }
+
+ PLCP[i] = (sa_sint_t)l;
+ l -= (l != 0);
+ }
+}
+
+/* Single-threaded driver for libsais_compute_plcp. With one worker the partitioning degenerates: worker 0
+   starts at index 0 and, being the last worker, covers everything up to n. */
+static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) {
+    (void)(threads);
+
+    const fast_sint_t worker = 0, workers = 1;
+
+    /* The stride is rounded down to a multiple of 16; only workers before the last one use it as their size. */
+    const fast_sint_t stride = (n / workers) & (-16);
+    const fast_sint_t first = worker * stride;
+    const fast_sint_t count = (worker < workers - 1) ? stride : n - first;
+
+    libsais_compute_plcp(T, PLCP, n, first, count);
+}
+
+static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const fast_sint_t prefetch_distance = 32;
+ /* LCP[i] = PLCP[SA[i]] for every i in the block; the main loop does four per pass and prefetches 32 ahead. */
+ fast_sint_t i, j;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
+ prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
+ prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
+
+ LCP[i + 0] = PLCP[SA[i + 0]];
+ LCP[i + 1] = PLCP[SA[i + 1]];
+
+ prefetch(&PLCP[SA[i + prefetch_distance + 2]]);
+ prefetch(&PLCP[SA[i + prefetch_distance + 3]]);
+
+ LCP[i + 2] = PLCP[SA[i + 2]];
+ LCP[i + 3] = PLCP[SA[i + 3]];
+ }
+ /* j is moved back to the end of the block; the leftover entries are copied one at a time, no prefetch. */
+ for (j += prefetch_distance + 3; i < j; i += 1) {
+ LCP[i] = PLCP[SA[i]];
+ }
+}
+
+/* Single-threaded driver for libsais_compute_lcp. With one worker the partitioning degenerates: worker 0
+   starts at index 0 and, being the last worker, covers everything up to n. */
+static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+                                    sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) {
+    (void)(threads);
+
+    const fast_sint_t worker = 0, workers = 1;
+
+    /* The stride is rounded down to a multiple of 16; only workers before the last one use it as their size. */
+    const fast_sint_t stride = (n / workers) & (-16);
+    const fast_sint_t first = worker * stride;
+    const fast_sint_t count = (worker < workers - 1) ? stride : n - first;
+
+    libsais_compute_lcp(PLCP, SA, LCP, first, count);
+}
+
+/* Fills PLCP for the text T of length n from its suffix array SA, in two passes over PLCP.
+   Returns 0 on success and -1 if a pointer is NULL or n is negative. */
+static s32 libsais_plcp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n) {
+    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) return -1;
+    if (n <= 1) {
+        /* No second suffix to compare with: a single-symbol text gets PLCP[0] = 0. */
+        if (n == 1) PLCP[0] = 0;
+        return 0;
+    }
+
+    /* Pass 1 stores, per suffix, the suffix that precedes it in SA; pass 2 replaces that by a match length. */
+    libsais_compute_phi_omp(SA, PLCP, n, 1);
+    libsais_compute_plcp_omp(T, PLCP, n, 1);
+    return 0;
+}
+
+/* Permutes PLCP into suffix-array order: LCP[i] = PLCP[SA[i]] for i in 0..n-1.
+   Returns 0 on success and -1 if a pointer is NULL or n is negative. */
+static s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n) {
+    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) return -1;
+
+    if (n <= 1) {
+        if (n == 1) LCP[0] = PLCP[SA[0]];
+        return 0;
+    }
+
+    /* Longer inputs go through the (single-threaded) block driver. */
+    libsais_compute_lcp_omp(PLCP, SA, LCP, n, 1);
+    return 0;
+}
#endif
diff --git a/include/lzp.h b/include/lzp.h
deleted file mode 100644
index 5ea5e06..0000000
--- a/include/lzp.h
+++ /dev/null
@@ -1,11 +0,0 @@
-
-#ifndef _LZP_H
-#define _LZP_H
-
-#include "common.h"
-
-s32 lzp_compress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min, s32 * lut);
-
-s32 lzp_decompress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min, s32 * lut);
-
-#endif
diff --git a/include/rle.h b/include/rle.h
deleted file mode 100644
index e62d8ca..0000000
--- a/include/rle.h
+++ /dev/null
@@ -1,31 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU Lesser General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _RLE_H
-#define _RLE_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "common.h"
-
-s32 mrlec(u8 * in, s32 inlen, u8 * out);
-void mrled(u8 * in, u8 * out, s32 outlen);
-
-#endif
diff --git a/src/cm.c b/src/cm.c
deleted file mode 100644
index 8a30874..0000000
--- a/src/cm.c
+++ /dev/null
@@ -1,198 +0,0 @@
-
-#include "cm.h"
-
-#include "common.h"
-
-#if defined(__has_builtin)
- #if __has_builtin(__builtin_prefetch)
- #define HAS_BUILTIN_PREFECTCH
- #endif
-#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
- #define HAS_BUILTIN_PREFECTCH
-#endif
-
-#if defined(HAS_BUILTIN_PREFECTCH)
- #define prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
-#elif defined(_M_IX86) || defined(_M_AMD64)
- #include <intrin.h>
- #define prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
-#elif defined(_M_ARM)
- #include <intrin.h>
- #define prefetch(address) __prefetch((const void *)(address))
-#elif defined(_M_ARM64)
- #include <intrin.h>
- #define prefetch(address) __prefetch2((const void *)(address), 1)
-#else
- #define prefetch(address)
-#endif
-
-// Uses an arithmetic coder implementation outlined in:
-// http://mattmahoney.net/dc/dce.html#Section_31
-
-#define write_out(s, c) (s)->out_queue[(s)->output_ptr++] = (c)
-#define read_in(s) ((s)->input_ptr < (s)->input_max ? (s)->in_queue[(s)->input_ptr++] : -1)
-
-#define update0(p, x) (p) = ((p) - ((p) >> x))
-#define update1(p, x) (p) = ((p) + (((p) ^ 65535) >> x))
-
-void begin(state * s) {
- prefetch(s);
- s->c1 = s->c2 = 0;
- s->run = 0;
- s->low = 0;
- s->high = 0xFFFFFFFF;
- s->code = 0;
- for (int i = 0; i < 256; i++) s->C0[i] = 1 << 15;
- for (int i = 0; i < 256; i++)
- for (int j = 0; j < 256; j++) s->C1[i][j] = 1 << 15;
- for (int i = 0; i < 2; i++)
- for (int j = 0; j < 256; j++)
- for (int k = 0; k < 17; k++) s->C2[2 * j + i][k] = (k << 12) - (k == 16); // Firm difference from stdpack.
-}
-
-void encode_bytes(state * s, u8 * buf, s32 size) {
- u32 high = s->high, low = s->low, c1 = s->c1, c2 = s->c2, run = s->run;
- for (s32 i = 0; i < size; i++) {
- u8 c = buf[i];
-
- if (c1 == c2)
- ++run;
- else
- run = 0;
-
- const int f = run > 2;
-
- int ctx = 1;
-
- while (ctx < 256) {
- const int p0 = s->C0[ctx];
- const int p1 = s->C1[c1][ctx];
- const int p2 = s->C1[c2][ctx];
- const int p = ((p0 + p1) * 7 + p2 + p2) >> 4;
-
- const int j = p >> 12;
- const int x1 = s->C2[2 * ctx + f][j];
- const int x2 = s->C2[2 * ctx + f][j + 1];
- const int ssep = x1 + (((x2 - x1) * (p & 4095)) >> 12);
-
- if (c & 128) {
- high = low + (((u64)(high - low) * (ssep * 3 + p)) >> 18);
-
- while ((low ^ high) < (1 << 24)) {
- write_out(s, low >> 24);
- low <<= 8;
- high = (high << 8) + 0xFF;
- }
-
- update1(s->C0[ctx], 2);
- update1(s->C1[c1][ctx], 4);
- update1(s->C2[2 * ctx + f][j], 6);
- update1(s->C2[2 * ctx + f][j + 1], 6);
- ctx += ctx + 1;
- } else {
- low += (((u64)(high - low) * (ssep * 3 + p)) >> 18) + 1;
-
- // Write identical bits.
- while ((low ^ high) < (1 << 24)) {
- write_out(s, low >> 24); // Same as high >> 24
- low <<= 8;
- high = (high << 8) + 0xFF;
- }
-
- update0(s->C0[ctx], 2);
- update0(s->C1[c1][ctx], 4);
- update0(s->C2[2 * ctx + f][j], 6);
- update0(s->C2[2 * ctx + f][j + 1], 6);
- ctx += ctx;
- }
-
- c <<= 1;
- }
-
- c2 = c1;
- c1 = ctx & 255;
- }
-
- write_out(s, low >> 24);
- low <<= 8;
- write_out(s, low >> 24);
- low <<= 8;
- write_out(s, low >> 24);
- low <<= 8;
- write_out(s, low >> 24);
- low <<= 8;
-
- s->high = high;
- s->low = low;
- s->c1 = c1;
- s->c2 = c2;
- s->run = run;
-}
-
-void decode_bytes(state * s, u8 * c, s32 size) {
- u32 high = s->high, low = s->low, c1 = s->c1, c2 = s->c2, run = s->run, code = s->code;
-
- code = (code << 8) + read_in(s);
- code = (code << 8) + read_in(s);
- code = (code << 8) + read_in(s);
- code = (code << 8) + read_in(s);
-
- for (s32 i = 0; i < size; i++) {
- if (c1 == c2)
- ++run;
- else
- run = 0;
-
- const int f = run > 2;
-
- int ctx = 1;
-
- while (ctx < 256) {
- const int p0 = s->C0[ctx];
- const int p1 = s->C1[c1][ctx];
- const int p2 = s->C1[c2][ctx];
- const int p = ((p0 + p1) * 7 + p2 + p2) >> 4;
-
- const int j = p >> 12;
- const int x1 = s->C2[2 * ctx + f][j];
- const int x2 = s->C2[2 * ctx + f][j + 1];
- const int ssep = x1 + (((x2 - x1) * (p & 4095)) >> 12);
-
- const u32 mid = low + (((u64)(high - low) * (ssep * 3 + p)) >> 18);
- const u8 bit = code <= mid;
- if (bit)
- high = mid;
- else
- low = mid + 1;
- while ((low ^ high) < (1 << 24)) {
- low <<= 8;
- high = (high << 8) + 255;
- code = (code << 8) + read_in(s);
- }
-
- if (bit) {
- update1(s->C0[ctx], 2);
- update1(s->C1[c1][ctx], 4);
- update1(s->C2[2 * ctx + f][j], 6);
- update1(s->C2[2 * ctx + f][j + 1], 6);
- ctx += ctx + 1;
- } else {
- update0(s->C0[ctx], 2);
- update0(s->C1[c1][ctx], 4);
- update0(s->C2[2 * ctx + f][j], 6);
- update0(s->C2[2 * ctx + f][j + 1], 6);
- ctx += ctx;
- }
- }
-
- c2 = c1;
- c[i] = c1 = ctx & 255;
- }
-
- s->high = high;
- s->low = low;
- s->c1 = c1;
- s->c2 = c2;
- s->run = run;
- s->code = code;
-}
diff --git a/src/crc32.c b/src/crc32.c
deleted file mode 100644
index 851cd6c..0000000
--- a/src/crc32.c
+++ /dev/null
@@ -1,59 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU Lesser General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "crc32.h"
-
-#include "common.h"
-
-static const u32 crc32Table[256] = {
- 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL,
- 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL,
- 0xF165B798L, 0x030E349BL, 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L,
- 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
- 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL,
- 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL,
- 0x1642AE59L, 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
- 0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
- 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L,
- 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL,
- 0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L,
- 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
- 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L,
- 0x44694011L, 0x5739B3E5L, 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
- 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L,
- 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
- 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L,
- 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L, 0x502036A5L,
- 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL,
- 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
- 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
- 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L,
- 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L,
- 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
- 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL,
- 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L,
- 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L,
- 0xE03E9C81L, 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
- 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
-};
-
-u32 crc32sum(u32 crc, u8 * RESTRICT buf, size_t size) {
- while (size--) crc = crc32Table[(crc ^ *(buf++)) & 0xff] ^ (crc >> 8);
- return crc;
-}
diff --git a/src/libbz3.c b/src/libbz3.c
index 487274d..58108f2 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -22,16 +22,429 @@
#include <stdlib.h>
#include <string.h>
-#include "cm.h"
-#include "common.h"
-#include "crc32.h"
#include "libsais.h"
-#include "lzp.h"
-#include "rle.h"
+
+/* CRC32 implementation. Since CRC32 generally takes less than 1% of the runtime on real-world data (e.g. the
+ Silesia corpus), I decided against using hardware CRC32. This implementation is simple, fast, fool-proof and
+ good enough to be used with bzip3. */
+
+static const u32 crc32Table[256] = { /* reflected CRC-32C (Castagnoli) byte table: entry 128 is the polynomial 0x82F63B78 */
+ 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL,
+ 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL,
+ 0xF165B798L, 0x030E349BL, 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L,
+ 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+ 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL,
+ 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL,
+ 0x1642AE59L, 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
+ 0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+ 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L,
+ 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL,
+ 0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L,
+ 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+ 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L,
+ 0x44694011L, 0x5739B3E5L, 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
+ 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L,
+ 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+ 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L,
+ 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L, 0x502036A5L,
+ 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL,
+ 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+ 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
+ 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L,
+ 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L,
+ 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+ 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL,
+ 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L,
+ 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L,
+ 0xE03E9C81L, 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+ 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+static u32 crc32sum(u32 crc, u8 * RESTRICT buf, size_t size) { /* folds size bytes of buf into the running crc */
+    for (size_t i = 0; i < size; ++i) crc = crc32Table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);
+    return crc;
+}
+
+/* LZP code. These constants were manually tuned to give the best compression ratio while using relatively
+ few resources. The LZP dictionary is only around 1MiB in size and the minimum match length was chosen
+ so that LZP would not interfere too much with the Burrows-Wheeler transform and the arithmetic coder, and
+ would just collapse long redundant data instead (for a major speed-up at a low compression ratio cost - in
+ fact, LZP preprocessing even improves compression in some cases). */
+
+/* A heavily modified version of libbsc's LZP predictor w/ unaligned accesses follows. This one has better
+ single-thread performance and provides a better compression ratio. It is also mostly UB-free and less
+ brittle during AFL fuzzing. */
#define LZP_DICTIONARY 18
#define LZP_MIN_MATCH 40
+#define MATCH 0xf2
+
+/* LZP encoder. Copies the first 4 bytes verbatim, then predicts each position from a hash of the 4 preceding
+   bytes (lut maps hash -> last input offset; 0 means "no entry"). A prediction that holds for at least
+   LZP_MIN_MATCH bytes is emitted as MATCH plus a length. Returns the output size, or -1 if out filled up. */
+static s32 lzp_encode_block(const u8 * RESTRICT in, const u8 * in_end, u8 * RESTRICT out, u8 * out_end,
+                            s32 * RESTRICT lut) {
+    const u8 * ins = in;
+    const u8 * outs = out;
+    const u8 * out_eob = out_end - 8; /* loops stop 8 bytes early, leaving room for what one step may write */
+    const u8 * heur = in;            /* end of the most recent too-short match, used as a quick reject below */
+    u32 ctx;
+
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;
+
+    ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
+
+    while (in < in_end - LZP_MIN_MATCH - 32 && out < out_eob) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & ((s32)(1 << LZP_DICTIONARY) - 1);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+        if (val > 0) {
+            const u8 * RESTRICT ref = ins + val;
+            if (memcmp(in + LZP_MIN_MATCH - 4, ref + LZP_MIN_MATCH - 4, sizeof(u32)) == 0 &&
+                memcmp(in, ref, sizeof(u32)) == 0) {
+                /* memcmp instead of a u32 load through a cast pointer: no alignment or aliasing assumptions. */
+                if (heur > in && memcmp(heur, ref + (heur - in), sizeof(u32)) != 0) goto not_found;
+
+                s32 len = 4;
+                for (; in + len < in_end - LZP_MIN_MATCH - 32; len += sizeof(u32)) {
+                    if (memcmp(in + len, ref + len, sizeof(u32)) != 0) break;
+                }
+
+                if (len < LZP_MIN_MATCH) {
+                    if (heur < in + len) heur = in + len;
+                    goto not_found;
+                }
+
+                /* Extend by up to three single bytes past the last whole word. */
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+
+                in += len;
+                ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
+                /* MATCH, then the excess over LZP_MIN_MATCH as a run of 254s closed by a byte below 254. */
+                *out++ = MATCH;
+                len -= LZP_MIN_MATCH;
+                while (len >= 254) {
+                    len -= 254;
+                    *out++ = 254;
+                    if (out >= out_eob) break;
+                }
+                *out++ = len;
+            } else {
+            not_found:;
+                u8 next = *out++ = *in++;
+                ctx = ctx << 8 | next;
+                if (next == MATCH) *out++ = 255; /* escape: a literal MATCH byte is followed by 255 */
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+
+    ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
+
+    /* Tail: too close to in_end to start a match, so every byte is a literal (the table is still updated). */
+    while (in < in_end && out < out_eob) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & ((s32)(1 << LZP_DICTIONARY) - 1);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+        u8 next = *out++ = *in++;
+        ctx = ctx << 8 | next;
+        if (next == MATCH && val > 0) *out++ = 255;
+    }
+    return out >= out_eob ? -1 : (s32)(out - outs);
+}
+
+/* LZP decoder, the inverse of lzp_encode_block: rebuilds the same hash table while writing `out`. Returns the
+   number of bytes written, or -1 if the input ends inside a MATCH sequence. The size of `out` is not checked. */
+static s32 lzp_decode_block(const u8 * RESTRICT in, const u8 * in_end, s32 * RESTRICT lut, u8 * RESTRICT out) {
+    const u8 * outs = out;
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;
+    u32 ctx = ((u32)out[-1]) | (((u32)out[-2]) << 8) | (((u32)out[-3]) << 16) | (((u32)out[-4]) << 24);
+
+    while (in < in_end) {
+        u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & ((s32)(1 << LZP_DICTIONARY) - 1);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(out - outs);
+        if (*in == MATCH && val > 0) {
+            if (++in == in_end) return -1; /* truncated: MATCH with nothing after it */
+            if (*in != 255) {
+                s32 len = LZP_MIN_MATCH;
+                u8 piece;
+                do {
+                    if (in == in_end) return -1; /* truncated: the length never reached its closing byte */
+                    piece = *in++;
+                    len += piece;
+                } while (piece == 254);
+                /* Byte-wise copy: the source range may run into bytes produced by this very copy. */
+                const u8 * ref = outs + val;
+                u8 * out_end = out + len;
+                while (out < out_end) *out++ = *ref++;
+                ctx = ((u32)out[-1]) | (((u32)out[-2]) << 8) | (((u32)out[-3]) << 16) | (((u32)out[-4]) << 24);
+            } else {
+                in++;
+                ctx = (ctx << 8) | (*out++ = MATCH);
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+
+    return (s32)(out - outs);
+}
+
+/* LZP-compresses n bytes of in into out (treated as n bytes long); -1 when n is below LZP_MIN_MATCH + 32. */
+static s32 lzp_compress(const u8 * RESTRICT in, u8 * RESTRICT out, s32 n, s32 * RESTRICT lut) {
+    const size_t lut_bytes = sizeof(s32) * (1 << LZP_DICTIONARY);
+    if (n < LZP_MIN_MATCH + 32) return -1;
+    memset(lut, 0, lut_bytes); /* every block starts from an empty table */
+    return lzp_encode_block(in, in + n, out, out + n, lut);
+}
+
+/* Undoes lzp_compress for n input bytes; -1 when n is below 4, the size of the verbatim block header. */
+static s32 lzp_decompress(const u8 * RESTRICT in, u8 * RESTRICT out, s32 n, s32 * RESTRICT lut) {
+    const size_t lut_bytes = sizeof(s32) * (1 << LZP_DICTIONARY);
+    if (n < 4) return -1;
+    memset(lut, 0, lut_bytes); /* the decoder rebuilds the table from empty, as the encoder did */
+    return lzp_decode_block(in, in + n, lut, out);
+}
+
+/* RLE code. Unlike RLE in other compressors, we collapse all runs if they yield a net gain
+ for a given character and encode this as a set bit in the RLE metadata. This improves the
+ performance and reduces the amount of collapsing done in normal blocks (so that BWT+AC can
+ be more efficient) while we still filter out all the pathological data. */
+
+/* Selective RLE. Pass 1 scores each byte value (+1 per repeated byte, except every 255th; -1 per new run); the
+   values with a positive score go out as a 32-byte bitmap and are run-coded in pass 2. Returns the output size. */
+static s32 mrlec(u8 * in, s32 inlen, u8 * out) {
+    const u8 * const in_end = in + inlen;
+    s32 gain[256] = { 0 };
+    s32 op = 0, run = 0, pc = -1;
+    for (const u8 * ip = in; ip < in_end; ++ip) {
+        const s32 c = *ip;
+        if (c == pc) gain[c] += (++run % 255) != 0;
+        else { --gain[c]; run = 0; }
+        pc = c;
+    }
+    for (s32 i = 0; i < 32; ++i) {
+        s32 bits = 0;
+        for (s32 j = 0; j < 8; ++j) bits += (gain[i * 8 + j] > 0) << j;
+        out[op++] = bits;
+    }
+    const u8 * ip = in;
+    s32 c;
+    pc = -1;
+    run = 0;
+    do {
+        c = ip < in_end ? *ip++ : -1; /* -1 past the end flushes the final run */
+        if (c == pc) {
+            ++run;
+        } else if (run > 0 && gain[pc] > 0) {
+            out[op++] = pc;
+            for (; run > 255; run -= 255) out[op++] = 255;
+            out[op++] = run - 1;
+            run = 1;
+        } else {
+            for (++run; run > 1; --run) out[op++] = pc;
+        }
+        pc = c;
+    } while (c != -1);
+
+    return op;
+}
+
+/* Inverse of mrlec. The first 32 bytes of `in` are a bitmap of the byte values that were run-coded. Such a value
+   is followed by its run length: every 255 adds 255 and continues, the closing byte b adds b + 1. Any other
+   value is a literal. Stops after outlen bytes; a run that would pass outlen (corrupt input) is cut short. */
+static void mrled(u8 * RESTRICT in, u8 * RESTRICT out, s32 outlen) {
+    s32 op = 0, ip = 0;
+    s32 run_coded[256] = { 0 };
+    for (s32 i = 0; i < 32; ++i) {
+        const s32 bits = in[ip++];
+        for (s32 j = 0; j < 8; ++j) run_coded[i * 8 + j] = (bits >> j) & 1;
+    }
+
+    while (op < outlen) {
+        const s32 c = in[ip++];
+        if (!run_coded[c]) {
+            out[op++] = c;
+            continue;
+        }
+        s32 run = 0, piece;
+        while ((piece = in[ip++]) == 255) run += 255;
+        run += piece + 1;
+        for (; run > 0 && op < outlen; --run) out[op++] = c; /* the bound keeps a bad length inside out */
+    }
+}
+
+/* The entropy coder. Uses an arithmetic coder implementation outlined in Matt Mahoney's DCE. */
+
+typedef struct {
+ /* Input/output. */
+ u8 *in_queue, *out_queue; /* read_in() consumes in_queue, write_out() appends to out_queue */
+ s32 input_ptr, output_ptr, input_max; /* next read index, next write index, read limit (reads at or past it give -1) */
+
+ /* C0, C1 - used for making the initial prediction, C2 used for an APM with a slightly low
+ learning rate (6) and 512 contexts. kanzi merges C0 and C1, uses slightly different
+ counter initialisation code and prediction code which from my tests tends to be suboptimal. */
+ u16 C0[256], C1[256][256], C2[512][17];
+} state;
+
+#define write_out(s, c) ((s)->out_queue[(s)->output_ptr++] = (c))
+#define read_in(s) ((s)->input_ptr < (s)->input_max ? (s)->in_queue[(s)->input_ptr++] : -1)
+/* Counter updates: move p towards 0 (update0) or towards 65535 (update1) by 1/2^x of the remaining distance. */
+#define update0(p, x) ((p) = ((p) - ((p) >> (x))))
+#define update1(p, x) ((p) = ((p) + (((p) ^ 65535) >> (x))))
+
+/* Resets the model: C0 and C1 start at 1 << 15; each of the 512 C2 rows is the ramp k << 12 (65535 at k = 16). */
+static void begin(state * s) {
+    prefetch(s);
+    for (int ctx = 0; ctx < 256; ctx++) s->C0[ctx] = 1 << 15;
+    for (int prev = 0; prev < 256; prev++)
+        for (int ctx = 0; ctx < 256; ctx++) s->C1[prev][ctx] = 1 << 15;
+    for (int row = 0; row < 512; row++)
+        for (int k = 0; k < 17; k++) s->C2[row][k] = (k << 12) - (k == 16); // Firm difference from stdpack.
+}
+
+static void encode_bytes(state * s, u8 * buf, s32 size) {
+ /* Arithmetic coding, detecting runs of characters in the file */
+ u32 high = 0xFFFFFFFF, low = 0, c1 = 0, c2 = 0, run = 0;
+ /* [low, high] is the coder interval; c1 is the previously coded byte and c2 the one before it. */
+ for (s32 i = 0; i < size; i++) {
+ u8 c = buf[i];
+
+ if (c1 == c2)
+ ++run;
+ else
+ run = 0;
+ /* f selects one of the two C2 rows per node: set once c1 == c2 has held for more than two bytes in a row. */
+ const int f = run > 2;
+
+ int ctx = 1;
+ /* Bits are coded MSB first; ctx becomes 2 * ctx + bit each step, so eight steps take it from 1 past 255. */
+ while (ctx < 256) {
+ const int p0 = s->C0[ctx];
+ const int p1 = s->C1[c1][ctx];
+ const int p2 = s->C1[c2][ctx];
+ const int p = ((p0 + p1) * 7 + p2 + p2) >> 4;
+ /* p mixes the counters with weights 7/16, 7/16, 2/16; ssep interpolates C2 between buckets j and j + 1. */
+ const int j = p >> 12;
+ const int x1 = s->C2[2 * ctx + f][j];
+ const int x2 = s->C2[2 * ctx + f][j + 1];
+ const int ssep = x1 + (((x2 - x1) * (p & 4095)) >> 12);
+
+ if (c & 128) {
+ high = low + (((u64)(high - low) * (ssep * 3 + p)) >> 18);
+ /* A 1 bit keeps the lower (ssep * 3 + p) / 2^18 share of the interval; equal top bytes are shifted out. */
+ while ((low ^ high) < (1 << 24)) {
+ write_out(s, low >> 24);
+ low <<= 8;
+ high = (high << 8) + 0xFF;
+ }
+
+ update1(s->C0[ctx], 2);
+ update1(s->C1[c1][ctx], 4);
+ update1(s->C2[2 * ctx + f][j], 6);
+ update1(s->C2[2 * ctx + f][j + 1], 6);
+ ctx += ctx + 1;
+ } else {
+ low += (((u64)(high - low) * (ssep * 3 + p)) >> 18) + 1;
+
+ // Write identical bits.
+ while ((low ^ high) < (1 << 24)) {
+ write_out(s, low >> 24); // Same as high >> 24
+ low <<= 8;
+ high = (high << 8) + 0xFF;
+ }
+
+ update0(s->C0[ctx], 2);
+ update0(s->C1[c1][ctx], 4);
+ update0(s->C2[2 * ctx + f][j], 6);
+ update0(s->C2[2 * ctx + f][j + 1], 6);
+ ctx += ctx;
+ }
+
+ c <<= 1;
+ }
+
+ c2 = c1;
+ c1 = ctx & 255;
+ }
+ /* Flush: emit all four bytes of low, most significant first. */
+ write_out(s, low >> 24);
+ low <<= 8;
+ write_out(s, low >> 24);
+ low <<= 8;
+ write_out(s, low >> 24);
+ low <<= 8;
+ write_out(s, low >> 24);
+ low <<= 8;
+}
+
+static void decode_bytes(state * s, u8 * c, s32 size) {
+ u32 high = 0xFFFFFFFF, low = 0, c1 = 0, c2 = 0, run = 0, code = 0;
+ /* Prime code with the first four input bytes; read_in() yields -1 once the input is exhausted. */
+ code = (code << 8) + read_in(s);
+ code = (code << 8) + read_in(s);
+ code = (code << 8) + read_in(s);
+ code = (code << 8) + read_in(s);
+
+ for (s32 i = 0; i < size; i++) {
+ if (c1 == c2)
+ ++run;
+ else
+ run = 0;
+ /* Same context selection as encode_bytes: f is set once c1 == c2 has held for more than two bytes in a row. */
+ const int f = run > 2;
+
+ int ctx = 1;
+
+ while (ctx < 256) {
+ const int p0 = s->C0[ctx];
+ const int p1 = s->C1[c1][ctx];
+ const int p2 = s->C1[c2][ctx];
+ const int p = ((p0 + p1) * 7 + p2 + p2) >> 4;
+
+ const int j = p >> 12;
+ const int x1 = s->C2[2 * ctx + f][j];
+ const int x2 = s->C2[2 * ctx + f][j + 1];
+ const int ssep = x1 + (((x2 - x1) * (p & 4095)) >> 12);
+ /* mid splits the interval at the predicted share; code at or below mid decodes as a 1 bit. */
+ const u32 mid = low + (((u64)(high - low) * (ssep * 3 + p)) >> 18);
+ const u8 bit = code <= mid;
+ if (bit)
+ high = mid;
+ else
+ low = mid + 1;
+ while ((low ^ high) < (1 << 24)) {
+ low <<= 8;
+ high = (high << 8) + 255;
+ code = (code << 8) + read_in(s);
+ }
+
+ if (bit) {
+ update1(s->C0[ctx], 2);
+ update1(s->C1[c1][ctx], 4);
+ update1(s->C2[2 * ctx + f][j], 6);
+ update1(s->C2[2 * ctx + f][j + 1], 6);
+ ctx += ctx + 1;
+ } else {
+ update0(s->C0[ctx], 2);
+ update0(s->C1[c1][ctx], 4);
+ update0(s->C2[2 * ctx + f][j], 6);
+ update0(s->C2[2 * ctx + f][j + 1], 6);
+ ctx += ctx;
+ }
+ }
+ /* The low 8 bits of ctx are the decoded byte; it becomes c1 and the old c1 moves to c2. */
+ c2 = c1;
+ c[i] = c1 = ctx & 255;
+ }
+}
+
+/* Public API. */
+
struct bz3_state {
u8 * swap_buffer;
s32 block_size;
@@ -144,7 +557,7 @@ PUBLIC_API s32 bz3_encode_block(struct bz3_state * state, u8 * buffer, s32 data_
model |= 4;
}
- lzp_size = lzp_compress(b1, b2, data_size, LZP_DICTIONARY, LZP_MIN_MATCH, state->lzp_lut);
+ lzp_size = lzp_compress(b1, b2, data_size, state->lzp_lut);
if (lzp_size > 0 && lzp_size < data_size + 64) {
swap(b1, b2);
data_size = lzp_size;
@@ -265,7 +678,7 @@ PUBLIC_API s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 data_
// Undo LZP
if (model & 2) {
- size_src = lzp_decompress(b1, b2, lzp_size, LZP_DICTIONARY, LZP_MIN_MATCH, state->lzp_lut);
+ size_src = lzp_decompress(b1, b2, lzp_size, state->lzp_lut);
swap(b1, b2);
}
diff --git a/src/libsais.c b/src/libsais.c
deleted file mode 100644
index 5587f28..0000000
--- a/src/libsais.c
+++ /dev/null
@@ -1,5480 +0,0 @@
-/*--
-
-This file is a part of libsais, a library for linear time suffix array,
-longest common prefix array and burrows wheeler transform construction.
-
- Copyright (c) 2021-2022 Ilya Grebnov <ilya.grebnov@gmail.com>
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-Please see the file LICENSE for full copyright information.
-
---*/
-
-#include "libsais.h"
-
-#include <limits.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-
-#define UNUSED(_x) (void)(_x)
-
-typedef s32 sa_sint_t;
-typedef u32 sa_uint_t;
-typedef ptrdiff_t fast_sint_t;
-typedef size_t fast_uint_t;
-
-#define SAINT_BIT (32)
-#define SAINT_MAX INT32_MAX
-#define SAINT_MIN INT32_MIN
-
-#define ALPHABET_SIZE (1 << CHAR_BIT)
-#define UNBWT_FASTBITS (17)
-
-#define SUFFIX_GROUP_BIT (SAINT_BIT - 1)
-#define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1))
-
-#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s))
-#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s))
-
-#define LIBSAIS_PER_THREAD_CACHE_SIZE (24576)
-
-typedef struct LIBSAIS_THREAD_CACHE {
- sa_sint_t symbol;
- sa_sint_t index;
-} LIBSAIS_THREAD_CACHE;
-
-typedef union LIBSAIS_THREAD_STATE {
- struct {
- fast_sint_t position;
- fast_sint_t count;
-
- fast_sint_t m;
- fast_sint_t last_lms_suffix;
-
- sa_sint_t * buckets;
- LIBSAIS_THREAD_CACHE * cache;
- } state;
-
- u8 padding[64];
-} LIBSAIS_THREAD_STATE;
-
-typedef struct LIBSAIS_CONTEXT {
- sa_sint_t * buckets;
- LIBSAIS_THREAD_STATE * thread_state;
- fast_sint_t threads;
-} LIBSAIS_CONTEXT;
-
-typedef struct LIBSAIS_UNBWT_CONTEXT {
- sa_uint_t * bucket2;
- u16 * fastbits;
- sa_uint_t * buckets;
- fast_sint_t threads;
-} LIBSAIS_UNBWT_CONTEXT;
-
-#if defined(__has_builtin)
- #if __has_builtin(__builtin_prefetch)
- #define HAS_BUILTIN_PREFECTCH
- #endif
-#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
- #define HAS_BUILTIN_PREFECTCH
-#endif
-
-#if defined(__has_builtin)
- #if __has_builtin(__builtin_bswap16)
- #define HAS_BUILTIN_BSWAP16
- #endif
-#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
- #define HAS_BUILTIN_BSWAP16
-#endif
-
-#if defined(HAS_BUILTIN_PREFECTCH)
- #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
- #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
-#elif defined(_M_IX86) || defined(_M_AMD64)
- #include <intrin.h>
- #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
- #define libsais_prefetchw(address) _m_prefetchw((const void *)(address))
-#elif defined(_M_ARM)
- #include <intrin.h>
- #define libsais_prefetch(address) __prefetch((const void *)(address))
- #define libsais_prefetchw(address) __prefetchw((const void *)(address))
-#elif defined(_M_ARM64)
- #include <intrin.h>
- #define libsais_prefetch(address) __prefetch2((const void *)(address), 1)
- #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17)
-#else
- #error Your compiler, configuration or platform is not supported.
-#endif
-
-#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
- #if defined(_LITTLE_ENDIAN) || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) || \
- (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) || \
- (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
- (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
- #define __LITTLE_ENDIAN__
- #elif defined(_BIG_ENDIAN) || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) || \
- (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) || \
- (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
- (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
- #define __BIG_ENDIAN__
- #elif defined(_WIN32)
- #define __LITTLE_ENDIAN__
- #endif
-#endif
-
-#if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
- #if defined(HAS_BUILTIN_BSWAP16)
- #define libsais_bswap16(x) (__builtin_bswap16(x))
- #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
- #define libsais_bswap16(x) (_byteswap_ushort(x))
- #else
- #define libsais_bswap16(x) ((u16)(x >> 8) | (u16)(x << 8))
- #endif
-#elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__)
- #define libsais_bswap16(x) (x)
-#else
- #error Your compiler, configuration or platform is not supported.
-#endif
-
-static void * libsais_align_up(const void * address, size_t alignment) {
- return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
-}
-
-static void * libsais_alloc_aligned(size_t size, size_t alignment) {
- void * address = malloc(size + sizeof(short) + alignment - 1);
- if (address != NULL) {
- void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
- ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
-
- return aligned_address;
- }
-
- return NULL;
-}
-
-static void libsais_free_aligned(void * aligned_address) {
- if (aligned_address != NULL) {
- free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
- }
-}
-
-static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) {
- LIBSAIS_THREAD_STATE * RESTRICT thread_state =
- (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
- sa_sint_t * RESTRICT thread_buckets =
- (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
- LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
- (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
-
- if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) {
- fast_sint_t t;
- for (t = 0; t < threads; ++t) {
- thread_state[t].state.buckets = thread_buckets;
- thread_buckets += 4 * ALPHABET_SIZE;
- thread_state[t].state.cache = thread_cache;
- thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
- }
-
- return thread_state;
- }
-
- libsais_free_aligned(thread_cache);
- libsais_free_aligned(thread_buckets);
- libsais_free_aligned(thread_state);
- return NULL;
-}
-
-static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) {
- if (thread_state != NULL) {
- libsais_free_aligned(thread_state[0].state.cache);
- libsais_free_aligned(thread_state[0].state.buckets);
- libsais_free_aligned(thread_state);
- }
-}
-
-static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) {
- LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
- sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
- LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
-
- if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) {
- ctx->buckets = buckets;
- ctx->threads = threads;
- ctx->thread_state = thread_state;
-
- return ctx;
- }
-
- libsais_free_thread_state(thread_state);
- libsais_free_aligned(buckets);
- libsais_free_aligned(ctx);
- return NULL;
-}
-
-static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
- if (ctx != NULL) {
- libsais_free_thread_state(ctx->thread_state);
- libsais_free_aligned(ctx->buckets);
- libsais_free_aligned(ctx);
- }
-}
-static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- if (omp_block_size > 0) {
- const fast_sint_t prefetch_distance = 128;
-
- fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1;
-
- while (j < n && (c1 = T[j]) == c0) {
- ++j;
- }
-
- fast_uint_t s = c0 >= c1;
-
- for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) {
- libsais_prefetch(&T[i - prefetch_distance]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 0);
- m -= ((s & 3) == 1);
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 1);
- m -= ((s & 3) == 1);
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 2);
- m -= ((s & 3) == 1);
- }
-
- for (j -= 3; i >= j; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- }
-
- SA[m] = (sa_sint_t)(i + 1);
- }
-}
-
-static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size);
- }
- }
-}
-
-static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t i = n - 2;
- sa_sint_t m = n - 1;
- fast_uint_t s = 1;
- fast_sint_t c0 = T[n - 1];
- fast_sint_t c1 = 0;
-
- for (; i >= 3; i -= 4) {
- libsais_prefetch(&T[i - prefetch_distance]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = i + 1;
- m -= ((s & 3) == 1);
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i - 0;
- m -= ((s & 3) == 1);
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = i - 1;
- m -= ((s & 3) == 1);
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i - 2;
- m -= ((s & 3) == 1);
- }
-
- for (; i >= 0; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i + 1;
- m -= ((s & 3) == 1);
- }
-
- return n - 1 - m;
-}
-
-static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t i = n - 2;
- sa_sint_t m = n - 1;
- fast_uint_t s = 1;
- fast_sint_t c0 = T[n - 1];
- fast_sint_t c1 = 0;
-
- for (; i >= 3; i -= 4) {
- libsais_prefetch(&T[i - prefetch_distance]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = i + 1;
- m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i - 0;
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = i - 1;
- m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i - 2;
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- }
-
- for (; i >= 0; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = i + 1;
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- }
-
- return n - 1 - m;
-}
-static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets) {
- const fast_sint_t prefetch_distance = 32;
-
- memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
- sa_sint_t i = n - 2;
- fast_uint_t s = 1;
- fast_sint_t c0 = T[n - 1];
- fast_sint_t c1 = 0;
-
- for (; i >= prefetch_distance + 3; i -= 4) {
- libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- for (; i >= 0; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
-}
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
- fast_sint_t m = omp_block_start + omp_block_size - 1;
-
- if (omp_block_size > 0) {
- const fast_sint_t prefetch_distance = 128;
-
- fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
-
- while (j < n && (c1 = T[j]) == c0) {
- ++j;
- }
-
- fast_uint_t s = c0 >= c1;
-
- for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) {
- libsais_prefetch(&T[i - prefetch_distance]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 0);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 2);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
- }
-
- for (j -= 3; i >= j; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
- }
-
- c1 = (i >= 0) ? T[i] : -1;
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
- }
-
- return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t m = 0;
-
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size);
- }
- }
-
- return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
-
- fast_sint_t m = omp_block_start + omp_block_size - 1;
-
- if (omp_block_size > 0) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
-
- while (j < n && (c1 = T[j]) == c0) {
- ++j;
- }
-
- fast_uint_t s = c0 >= c1;
-
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
- libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 0);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 2);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
- }
-
- for (j -= prefetch_distance + 3; i >= j; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]++;
- }
-
- c1 = (i >= 0) ? T[i] : -1;
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, s & 3)]++;
- }
-
- return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
- fast_sint_t m = omp_block_start + omp_block_size - 1;
-
- if (omp_block_size > 0) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
-
- while (j < n && (c1 = T[j]) == c0) {
- ++j;
- }
-
- fast_uint_t s = c0 >= c1;
-
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 0);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 2);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- for (j -= prefetch_distance + 3; i >= j; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- c1 = (i >= 0) ? T[i] : -1;
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((s & 3) == 1);
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
- }
-
- return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
-}
-
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
- fast_sint_t m = omp_block_start + omp_block_size - 1;
-
- if (omp_block_size > 0) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;
-
- while (j < n && (c1 = T[j]) == c0) {
- ++j;
- }
-
- fast_uint_t s = c0 >= c1;
-
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
- c0 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 0);
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- c1 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 1);
- m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
- c0 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i - 2);
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- c1 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- for (j -= prefetch_distance + 3; i >= j; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((fast_sint_t)(s & 3) == (c1 >= 0));
- c1 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c1, (s & 3) == 1)]++;
- }
-
- c1 = (i >= 0) ? T[i] : -1;
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- SA[m] = (sa_sint_t)(i + 1);
- m -= ((fast_sint_t)(s & 3) == (c0 >= 0));
- c0 &= SAINT_MAX;
- buckets[BUCKETS_INDEX2((fast_uint_t)c0, (s & 3) == 1)]++;
- }
-
- return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
-}
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads) {
- sa_sint_t m = 0;
- {
- (void)(threads);
-
- fast_sint_t omp_num_threads = 1;
-
- if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
- }
- }
-
- return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads) {
- sa_sint_t m = 0;
- {
- (void)(threads);
-
- fast_sint_t omp_num_threads = 1;
-
- if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
- }
- }
-
- return m;
-}
-
-static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t threads) {
- sa_sint_t m = 0;
- {
- (void)(threads);
-
- fast_sint_t omp_num_threads = 1;
-
- if (omp_num_threads == 1) {
- m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
- }
- }
-
- return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t m;
- (void)(thread_state);
-
- { m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads); }
-
- return m;
-}
-
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t m;
- (void)(thread_state);
-
- { m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); }
-
- return m;
-}
-
-static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- (void)(thread_state);
-
- { libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); }
-}
-
-static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets) {
- const fast_sint_t prefetch_distance = 32;
-
- memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));
-
- fast_sint_t i, j;
- for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) {
- libsais_prefetch(&T[i + prefetch_distance]);
-
- buckets[T[i + 0]]++;
- buckets[T[i + 1]]++;
- buckets[T[i + 2]]++;
- buckets[T[i + 3]]++;
- buckets[T[i + 4]]++;
- buckets[T[i + 5]]++;
- buckets[T[i + 6]]++;
- buckets[T[i + 7]]++;
- }
-
- for (j += 7; i < j; i += 1) {
- buckets[T[i]]++;
- }
-}
-
-static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
- sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
-
- if (freq != NULL) {
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += 1) {
- bucket_start[j] = sum;
- sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
- bucket_end[j] = sum;
- }
- } else {
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += 1) {
- bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
- bucket_end[j] = sum;
- }
- }
-}
-
-static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
- sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
-
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += 1) {
- bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
- bucket_end[j] = sum;
- }
-}
-
-static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
- sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
- i += BUCKETS_INDEX2(1, 0), j += 1) {
- bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
- bucket_end[j] = sum;
- }
-}
-
-static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- fast_sint_t i;
- sa_sint_t sum0 = 0;
- for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
- sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
- buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
- }
-}
-
-static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- fast_sint_t i, j;
- for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
- i += BUCKETS_INDEX2(1, 0), j += 1) {
- buckets[j] = buckets[i];
- }
-
- buckets[k] = 0;
- memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
-}
-
-static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- fast_sint_t i;
- sa_sint_t sum = 0;
- for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
- sa_sint_t tmp = buckets[i];
- buckets[i] = sum;
- sum += tmp;
- }
-}
-
-static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- fast_sint_t i;
- sa_sint_t sum = 0;
- for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
- sum += buckets[i];
- buckets[i] = sum;
- }
-}
-
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const u8 * RESTRICT T,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix) {
- {
- fast_uint_t s = 0;
- fast_sint_t c0 = T[first_lms_suffix];
- fast_sint_t c1 = 0;
-
- for (; --first_lms_suffix >= 0;) {
- c1 = c0;
- c0 = T[first_lms_suffix];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
- }
-
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
- }
-
- {
- sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
-
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
- temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
- temp_bucket[j] = sum;
- }
-
- return sum;
- }
-}
-
-static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix) {
- buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
- buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
-
- fast_sint_t i;
- sa_sint_t sum0 = 0, sum1 = 0;
- for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
- sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
- sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
-
- buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
- buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
- }
-}
-
-static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T,
- sa_sint_t k,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix) {
- {
- fast_uint_t s = 0;
- fast_sint_t c0 = T[first_lms_suffix];
- fast_sint_t c1 = 0;
-
- for (; --first_lms_suffix >= 0;) {
- c1 = c0;
- c0 = T[first_lms_suffix];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- buckets[BUCKETS_INDEX4((fast_uint_t)c1, s & 3)]--;
- }
-
- buckets[BUCKETS_INDEX4((fast_uint_t)c0, (s << 1) & 3)]--;
- }
-
- {
- sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
-
- fast_sint_t i, j;
- sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += 1) {
- sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
- temp_bucket[j] = sum;
- }
-
- return sum;
- }
-}
-
-static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix) {
- sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
- sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
- buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
- buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;
-
- fast_sint_t i, j;
- sa_sint_t sum0 = 0, sum1 = 0;
- for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
- i += BUCKETS_INDEX2(1, 0), j += 1) {
- bucket_start[j] = sum1;
-
- sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
- sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
- buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
-
- bucket_end[j] = sum1;
- }
-}
-
-static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
-
- sa_sint_t p0 = SA[i - 0];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
- sa_sint_t p1 = SA[i - 1];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
- sa_sint_t p2 = SA[i - 2];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
- sa_sint_t p3 = SA[i - 3];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
- }
-
- for (j -= prefetch_distance + 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
- }
-}
-
-static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_num_threads = 1;
-
- if (omp_num_threads == 1) {
- libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1,
- (fast_sint_t)m - 1);
- }
- }
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
-
- libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
- libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
- libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
- libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);
-
- sa_sint_t p0 = SA[i - 0];
- SA[--induction_bucket[T[p0]]] = p0;
- sa_sint_t p1 = SA[i - 1];
- SA[--induction_bucket[T[p1]]] = p1;
- sa_sint_t p2 = SA[i - 2];
- SA[--induction_bucket[T[p2]]] = p2;
- sa_sint_t p3 = SA[i - 3];
- SA[--induction_bucket[T[p3]]] = p3;
- }
-
- for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[--induction_bucket[T[p]]] = p;
- }
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) {
- libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
-
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
-
- sa_sint_t p0 = SA[i - 0];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
- sa_sint_t p1 = SA[i - 1];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
- sa_sint_t p2 = SA[i - 2];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
- sa_sint_t p3 = SA[i - 3];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
- }
-
- for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
- }
-}
-static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || m < 65536) {
- libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
- (fast_sint_t)m - 1);
- }
- (void)(thread_state);
-}
-
-static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || m < 65536) {
- libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
- (fast_sint_t)m - 1);
- }
- (void)(thread_state);
-}
-
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t i = n - 2;
- sa_sint_t m = 0;
- fast_uint_t s = 1;
- fast_sint_t c0 = T[n - 1];
- fast_sint_t c1 = 0;
- fast_sint_t c2 = 0;
-
- for (; i >= prefetch_distance + 3; i -= 4) {
- libsais_prefetch(&T[i - 2 * prefetch_distance]);
-
- libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
- libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
- libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
- libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);
-
- c1 = T[i - 0];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- if ((s & 3) == 1) {
- SA[--buckets[c2 = c0]] = i + 1;
- m++;
- }
-
- c0 = T[i - 1];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- if ((s & 3) == 1) {
- SA[--buckets[c2 = c1]] = i - 0;
- m++;
- }
-
- c1 = T[i - 2];
- s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
- if ((s & 3) == 1) {
- SA[--buckets[c2 = c0]] = i - 1;
- m++;
- }
-
- c0 = T[i - 3];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- if ((s & 3) == 1) {
- SA[--buckets[c2 = c1]] = i - 2;
- m++;
- }
- }
-
- for (; i >= 0; i -= 1) {
- c1 = c0;
- c0 = T[i];
- s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1)));
- if ((s & 3) == 1) {
- SA[--buckets[c2 = c1]] = i + 1;
- m++;
- }
- }
-
- if (m > 1) {
- SA[buckets[c2]] = 0;
- }
-
- return m;
-}
-
-static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
-
- libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
- libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
- libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
- libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);
-
- SA[induction_bucket[i + 0]] |= SAINT_MIN;
- SA[induction_bucket[i + 1]] |= SAINT_MIN;
- SA[induction_bucket[i + 2]] |= SAINT_MIN;
- SA[induction_bucket[i + 3]] |= SAINT_MIN;
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- SA[induction_bucket[i]] |= SAINT_MIN;
- }
-}
-
-static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
-
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
-
- SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
- SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
- SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
- SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
- }
-}
-
-static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_block_start = 0;
- fast_sint_t omp_block_size = (fast_sint_t)k - 1;
- libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_block_start = 0;
- fast_sint_t omp_block_size = (fast_sint_t)k - 1;
- libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count) {
- sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
-
- buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
-
- fast_sint_t i, j;
- sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
- temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
-
- sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
- sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
-
- buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
- buckets[j + BUCKETS_INDEX2(0, 1)] = sum1;
- }
-}
-
-static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k,
- sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count) {
- sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
-
- fast_sint_t i, j;
- sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
- for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
- i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0);
- i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
- sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
- sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
- sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
- sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
-
- buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
- buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
- buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
- buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
-
- sum0 += SS + SL;
- sum1 += LS;
- sum2 += LS + LL;
-
- temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
- temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
- }
-
- for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
- sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
- sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
- sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
- sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];
-
- buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
- buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
- buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
- buckets[i + BUCKETS_INDEX4(0, 3)] = 0;
-
- sum0 += SS + SL;
- sum1 += LS;
- sum2 += LS + LL;
-
- temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
- temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
- }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
- libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);
-
- sa_sint_t p0 = SA[i + 0];
- d += (p0 < 0);
- p0 &= SAINT_MAX;
- sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
- SA[induction_bucket[v0]++] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
- distinct_names[v0] = d;
-
- sa_sint_t p1 = SA[i + 1];
- d += (p1 < 0);
- p1 &= SAINT_MAX;
- sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
- SA[induction_bucket[v1]++] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
- distinct_names[v1] = d;
- }
-
- for (j += prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- d += (p < 0);
- p &= SAINT_MAX;
- sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
- SA[induction_bucket[v]++] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
- distinct_names[v] = d;
- }
-
- return d;
-}
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t left_suffixes_count, sa_sint_t d,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
- SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
- distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
-
- if (threads == 1 || left_suffixes_count < 65536) {
- d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
- }
- (void)(thread_state);
- return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetch(&SA[i + 3 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
- libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);
-
- sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX;
- sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
- libsais_prefetchw(&buckets[v0]);
- sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX;
- sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
- libsais_prefetchw(&buckets[v1]);
-
- sa_sint_t p2 = SA[i + 0];
- d += (p2 < 0);
- p2 &= SAINT_MAX;
- sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
- SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
- buckets[2 + v2] = d;
-
- sa_sint_t p3 = SA[i + 1];
- d += (p3 < 0);
- p3 &= SAINT_MAX;
- sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
- SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
- buckets[2 + v3] = d;
- }
-
- for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- d += (p < 0);
- p &= SAINT_MAX;
- sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
- SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
- buckets[2 + v] = d;
- }
-
- return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
- sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
- const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
- const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
- if (s2 > 0) {
- const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
- libsais_prefetchw(&induction_bucket[Ts2]);
- libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
- }
- sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
- if (s3 > 0) {
- const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
- libsais_prefetchw(&induction_bucket[Ts3]);
- libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
- }
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- SA[i + 0] = 0;
- d += (p0 >> (SUFFIX_GROUP_BIT - 1));
- p0 &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
- SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v0] = d;
- }
-
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- SA[i + 1] = 0;
- d += (p1 >> (SUFFIX_GROUP_BIT - 1));
- p1 &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
- SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v1] = d;
- }
- }
-
- for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- SA[i] = 0;
- d += (p >> (SUFFIX_GROUP_BIT - 1));
- p &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
- SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v] = d;
- }
- }
-
- return d;
-}
-
-static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
- const sa_sint_t * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
- const sa_sint_t * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
- if (s2 > 0) {
- libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
- libsais_prefetch(&T[s2] - 2);
- }
- sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
- if (s3 > 0) {
- libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
- libsais_prefetch(&T[s3] - 2);
- }
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- SA[i + 0] = 0;
- SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- SA[i + 1] = 0;
- SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- SA[i] = 0;
- SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
- }
- }
-}
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
- buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
-
- if (threads == 1 || left_suffixes_count < 65536) {
- d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
- }
- (void)(thread_state);
- return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t d, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
- sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
- SA[induction_bucket[T[n - 1]]++] =
- (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
- distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
-
- if (threads == 1 || n < 65536) {
- d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
- }
- (void)(thread_state);
- return d;
-}
-
-static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
-
- if (threads == 1 || n < 65536) {
- libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
- const sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
- const fast_sint_t prefetch_distance = 32;
-
- const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
-
- fast_sint_t c;
- (void)(threads);
- (void)(n);
-
- for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) {
- fast_sint_t i, j;
- sa_sint_t s = SAINT_MIN;
- for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j;
- i -= 4) {
- libsais_prefetchw(&SA[i - prefetch_distance]);
-
- sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
- s = s ^ q0;
- SA[i - 0] = p0 ^ q0;
- sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
- s = s ^ q1;
- SA[i - 1] = p1 ^ q1;
- sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
- s = s ^ q2;
- SA[i - 2] = p2 ^ q2;
- sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
- s = s ^ q3;
- SA[i - 3] = p3 ^ q3;
- }
-
- for (j -= 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
- s = s ^ q;
- SA[i] = p ^ q;
- }
- }
-}
-
-static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
- const sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
- const fast_sint_t prefetch_distance = 32;
-
- const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
-
- fast_sint_t c;
- (void)(threads);
-
- for (c = (fast_sint_t)k - 1; c >= 1; c -= 1) {
- fast_sint_t i, j;
- sa_sint_t s = SAINT_MIN;
- for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1,
- j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3;
- i >= j; i -= 4) {
- libsais_prefetchw(&SA[i - prefetch_distance]);
-
- sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s;
- s = s ^ q0;
- SA[i - 0] = p0 ^ q0;
- sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s;
- s = s ^ q1;
- SA[i - 1] = p1 ^ q1;
- sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s;
- s = s ^ q2;
- SA[i - 2] = p2 ^ q2;
- sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s;
- s = s ^ q3;
- SA[i - 3] = p3 ^ q3;
- }
-
- for (j -= 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s;
- s = s ^ q;
- SA[i] = p ^ q;
- }
- }
-}
-
-static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i;
- sa_sint_t s = SUFFIX_GROUP_MARKER;
- for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
- libsais_prefetchw(&SA[i - prefetch_distance]);
-
- sa_sint_t p0 = SA[i - 0],
- q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
- s = s ^ q0;
- SA[i - 0] = p0 ^ q0;
- sa_sint_t p1 = SA[i - 1],
- q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
- s = s ^ q1;
- SA[i - 1] = p1 ^ q1;
- sa_sint_t p2 = SA[i - 2],
- q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
- s = s ^ q2;
- SA[i - 2] = p2 ^ q2;
- sa_sint_t p3 = SA[i - 3],
- q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
- s = s ^ q3;
- SA[i - 3] = p3 ^ q3;
- }
-
- for (; i >= 0; i -= 1) {
- sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
- s = s ^ q;
- SA[i] = p ^ q;
- }
-}
-
-static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
- sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
-
- fast_sint_t i;
- for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) {
- buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
- buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
- }
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
- libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);
-
- sa_sint_t p0 = SA[i - 0];
- d += (p0 < 0);
- p0 &= SAINT_MAX;
- sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
- SA[--induction_bucket[v0]] = (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
- distinct_names[v0] = d;
-
- sa_sint_t p1 = SA[i - 1];
- d += (p1 < 0);
- p1 &= SAINT_MAX;
- sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
- SA[--induction_bucket[v1]] = (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
- distinct_names[v1] = d;
- }
-
- for (j -= prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- d += (p < 0);
- p &= SAINT_MAX;
- sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
- SA[--induction_bucket[v]] = (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
- distinct_names[v] = d;
- }
-
- return d;
-}
-static void libsais_partial_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
- sa_sint_t d, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
- fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
-
- if (threads == 1 || (scan_end - scan_start) < 65536) {
- libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
- }
- (void)(thread_state);
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetch(&SA[i - 3 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
- libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);
-
- sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX;
- sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0);
- libsais_prefetchw(&buckets[v0]);
- sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX;
- sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0);
- libsais_prefetchw(&buckets[v1]);
-
- sa_sint_t p2 = SA[i - 0];
- d += (p2 < 0);
- p2 &= SAINT_MAX;
- sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
- SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
- buckets[2 + v2] = d;
-
- sa_sint_t p3 = SA[i - 1];
- d += (p3 < 0);
- p3 &= SAINT_MAX;
- sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
- SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
- buckets[2 + v3] = d;
- }
-
- for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- d += (p < 0);
- p &= SAINT_MAX;
- sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
- SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
- buckets[2 + v] = d;
- }
-
- return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
- sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
- const sa_sint_t * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
- const sa_sint_t * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
- if (s2 > 0) {
- const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1];
- libsais_prefetchw(&induction_bucket[Ts2]);
- libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]);
- }
- sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
- if (s3 > 0) {
- const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1];
- libsais_prefetchw(&induction_bucket[Ts3]);
- libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]);
- }
-
- sa_sint_t p0 = SA[i - 0];
- if (p0 > 0) {
- SA[i - 0] = 0;
- d += (p0 >> (SUFFIX_GROUP_BIT - 1));
- p0 &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
- SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v0] = d;
- }
-
- sa_sint_t p1 = SA[i - 1];
- if (p1 > 0) {
- SA[i - 1] = 0;
- d += (p1 >> (SUFFIX_GROUP_BIT - 1));
- p1 &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
- SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v1] = d;
- }
- }
-
- for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- if (p > 0) {
- SA[i] = 0;
- d += (p >> (SUFFIX_GROUP_BIT - 1));
- p &= ~SUFFIX_GROUP_MARKER;
- sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
- SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
- distinct_names[v] = d;
- }
- }
-
- return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
- const sa_sint_t * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
- const sa_sint_t * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
- if (s2 > 0) {
- libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
- libsais_prefetch(&T[s2] - 2);
- }
- sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
- if (s3 > 0) {
- libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
- libsais_prefetch(&T[s3] - 2);
- }
-
- sa_sint_t p0 = SA[i - 0];
- if (p0 > 0) {
- SA[i - 0] = 0;
- SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i - 1];
- if (p1 > 0) {
- SA[i - 1] = 0;
- SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- if (p > 0) {
- SA[i] = 0;
- SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
- }
- }
-}
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
- fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
-
- if (threads == 1 || (scan_end - scan_start) < 65536) {
- d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
- }
- (void)(thread_state);
- return d;
-}
-
-static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t d, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || n < 65536) {
- d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
- }
- (void)(thread_state);
- return d;
-}
-
-static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || n < 65536) {
- libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
- }
- (void)(thread_state);
-}
-
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j, l;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
- libsais_prefetch(&SA[i + prefetch_distance]);
-
- sa_sint_t s0 = SA[i + 0];
- SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
- l += (s0 < 0);
- sa_sint_t s1 = SA[i + 1];
- SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
- l += (s1 < 0);
- sa_sint_t s2 = SA[i + 2];
- SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
- l += (s2 < 0);
- sa_sint_t s3 = SA[i + 3];
- SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
- l += (s3 < 0);
- }
-
- for (j += 3; i < j; i += 1) {
- sa_sint_t s = SA[i];
- SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER);
- l += (s < 0);
- }
-
- return l;
-}
-
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j, l;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) {
- libsais_prefetch(&SA[i + prefetch_distance]);
-
- sa_sint_t s0 = SA[i + 0];
- SA[l] = s0 & SAINT_MAX;
- l += (s0 < 0);
- sa_sint_t s1 = SA[i + 1];
- SA[l] = s1 & SAINT_MAX;
- l += (s1 < 0);
- sa_sint_t s2 = SA[i + 2];
- SA[l] = s2 & SAINT_MAX;
- l += (s2 < 0);
- sa_sint_t s3 = SA[i + 3];
- SA[l] = s3 & SAINT_MAX;
- l += (s3 < 0);
- }
-
- for (j += 3; i < j; i += 1) {
- sa_sint_t s = SA[i];
- SA[l] = s & SAINT_MAX;
- l += (s < 0);
- }
-
- return l;
-}
-
-static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
- }
- }
-}
-
-static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
- }
- }
-}
-
-static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
-
- sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, buckets, left_suffixes_count, 0, threads,
- thread_state);
- libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
- libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d,
- threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0,
- threads, thread_state);
- libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
- libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
- libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d,
- threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
-
- sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
- libsais_partial_sorting_shift_markers_32s_4k(SA, n);
- libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
- libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
- libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
- libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
-}
-
-static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_start_32s_1k(k, buckets);
- libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
-
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_end_32s_1k(k, buckets);
- libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
-
- libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
-}
-
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAm = &SA[m];
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
- sa_sint_t p0 = SA[i + 0];
- SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
- name += p0 < 0;
- sa_sint_t p1 = SA[i + 1];
- SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN;
- name += p1 < 0;
- sa_sint_t p2 = SA[i + 2];
- SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN;
- name += p2 < 0;
- sa_sint_t p3 = SA[i + 3];
- SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN;
- name += p3 < 0;
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN;
- name += p < 0;
- }
-
- return name;
-}
-
-static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- l -= 1;
-
- fast_sint_t i, j;
- for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
- i -= 4) {
- libsais_prefetch(&SA[i - prefetch_distance]);
-
- sa_sint_t s0 = SA[i - 0];
- SA[l] = s0 & SAINT_MAX;
- l -= s0 < 0;
- sa_sint_t s1 = SA[i - 1];
- SA[l] = s1 & SAINT_MAX;
- l -= s1 < 0;
- sa_sint_t s2 = SA[i - 2];
- SA[l] = s2 & SAINT_MAX;
- l -= s2 < 0;
- sa_sint_t s3 = SA[i - 3];
- SA[l] = s3 & SAINT_MAX;
- l -= s3 < 0;
- }
-
- for (j -= 3; i >= j; i -= 1) {
- sa_sint_t s = SA[i];
- SA[l] = s & SAINT_MAX;
- l -= s < 0;
- }
-
- l += 1;
-
- return l;
-}
-
-static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t name = 0;
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
- if (omp_num_threads == 1) {
- name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
- }
- }
-
- return name;
-}
-
-static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
- }
- }
-}
-
-static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t fs, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
-
- sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
- if (name < m) {
- libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
- } else {
- fast_sint_t i;
- for (i = 0; i < m; i += 1) {
- SA[i] &= SAINT_MAX;
- }
- }
-
- return name;
-}
-
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAm = &SA[m];
-
- fast_sint_t i, j;
- sa_sint_t p0, p1, p2, p3 = 0;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
- libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
-
- p0 = SA[i + 0];
- SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
- name += p0 < 0;
- p1 = SA[i + 1];
- SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN);
- name += p1 < 0;
- p2 = SA[i + 2];
- SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN);
- name += p2 < 0;
- p3 = SA[i + 3];
- SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
- name += p3 < 0;
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- p2 = p3;
- p3 = SA[i];
- SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN);
- name += p3 < 0;
- }
-
- return name;
-}
-
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- sa_sint_t p0, p1, p2, p3 = 0;
- for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j;
- i += 4) {
- libsais_prefetchw(&SA[i + prefetch_distance]);
-
- p0 = SA[i + 0];
- SA[i + 0] = p0 & (p3 | SAINT_MAX);
- p0 = (p0 == 0) ? p3 : p0;
- p1 = SA[i + 1];
- SA[i + 1] = p1 & (p0 | SAINT_MAX);
- p1 = (p1 == 0) ? p0 : p1;
- p2 = SA[i + 2];
- SA[i + 2] = p2 & (p1 | SAINT_MAX);
- p2 = (p2 == 0) ? p1 : p2;
- p3 = SA[i + 3];
- SA[i + 3] = p3 & (p2 | SAINT_MAX);
- p3 = (p3 == 0) ? p2 : p3;
- }
-
- for (j += 3; i < j; i += 1) {
- p2 = p3;
- p3 = SA[i];
- SA[i] = p3 & (p2 | SAINT_MAX);
- p3 = (p3 == 0) ? p2 : p3;
- }
-}
-
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAm = &SA[m];
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
- libsais_prefetchw(&SAm[i + prefetch_distance]);
-
- SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
- SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
- SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
- SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
- }
-
- for (j += 3; i < j; i += 1) {
- SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
- }
-}
-
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t name = 0;
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
- if (omp_num_threads == 1) {
- name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
- }
- }
-
- return name - 1;
-}
-
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_block_start = 0;
- fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
-
- libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_block_start = 0;
- fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
-
- libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
- }
-}
-
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
-
- sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
- if (name < m) {
- libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
- }
-
- return name;
-}
-
-static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t threads) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAm = &SA[m];
-
- {
- libsais_gather_lms_suffixes_32s(T, SA, n);
-
- memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
-
- fast_sint_t i, j;
- for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
-
- SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
- }
-
- SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
- }
-
- { libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); }
-
- sa_sint_t name = 1;
-
- {
- fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1];
- sa_sint_t pdiff = SAINT_MIN;
- for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetch(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
- libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
- libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
-
- fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
- sa_sint_t qdiff = SAINT_MIN;
- if (plen == qlen) {
- fast_sint_t l = 0;
- do {
- if (T[p + l] != T[q + l]) {
- break;
- }
- } while (++l < qlen);
- qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
- }
- SAm[p >> 1] = name | (pdiff & qdiff);
- name += (qdiff < 0);
-
- p = SA[i + 1];
- plen = SAm[p >> 1];
- pdiff = SAINT_MIN;
- if (qlen == plen) {
- fast_sint_t l = 0;
- do {
- if (T[q + l] != T[p + l]) {
- break;
- }
- } while (++l < plen);
- pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
- }
- SAm[q >> 1] = name | (qdiff & pdiff);
- name += (pdiff < 0);
- }
-
- for (j += prefetch_distance + 1; i < j; i += 1) {
- fast_sint_t q = SA[i], qlen = SAm[q >> 1];
- sa_sint_t qdiff = SAINT_MIN;
- if (plen == qlen) {
- fast_sint_t l = 0;
- do {
- if (T[p + l] != T[q + l]) {
- break;
- }
- } while (++l < plen);
- qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
- }
- SAm[p >> 1] = name | (pdiff & qdiff);
- name += (qdiff < 0);
-
- p = q;
- plen = qlen;
- pdiff = qdiff;
- }
-
- SAm[p >> 1] = name | pdiff;
- name++;
- }
-
- if (name <= m) {
- libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
- }
-
- return name - 1;
-}
-
-static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- const sa_sint_t * RESTRICT SAnm = &SA[n - m];
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
- libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
- libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]);
- libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]);
- libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]);
-
- SA[i + 0] = SAnm[SA[i + 0]];
- SA[i + 1] = SAnm[SA[i + 1]];
- SA[i + 2] = SAnm[SA[i + 2]];
- SA[i + 3] = SAnm[SA[i + 3]];
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- SA[i] = SAnm[SA[i]];
- }
-}
-
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_block_start = 0;
- fast_sint_t omp_block_size = m;
- libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
-
- fast_sint_t c, j = n;
- for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
- fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
- if (l > 0) {
- fast_sint_t i = bucket_end[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
- fast_sint_t c, j = n;
- for (c = (fast_sint_t)k - 2; c >= 0; --c) {
- fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
- if (l > 0) {
- fast_sint_t i = bucket_end[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- fast_sint_t j = n;
-
- if (k > 1) {
- fast_sint_t c;
- for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
- fast_sint_t l =
- (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
- if (l > 0) {
- fast_sint_t i = buckets[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t c = k - 1;
- fast_sint_t i, l = buckets[c];
- for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) {
- libsais_prefetch(&SA[i - 2 * prefetch_distance]);
-
- libsais_prefetch(&T[SA[i - prefetch_distance - 0]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 1]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 2]]);
- libsais_prefetch(&T[SA[i - prefetch_distance - 3]]);
-
- sa_sint_t p0 = SA[i - 0];
- if (T[p0] != c) {
- c = T[p0];
- memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
- l = buckets[c];
- }
- SA[--l] = p0;
- sa_sint_t p1 = SA[i - 1];
- if (T[p1] != c) {
- c = T[p1];
- memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
- l = buckets[c];
- }
- SA[--l] = p1;
- sa_sint_t p2 = SA[i - 2];
- if (T[p2] != c) {
- c = T[p2];
- memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
- l = buckets[c];
- }
- SA[--l] = p2;
- sa_sint_t p3 = SA[i - 3];
- if (T[p3] != c) {
- c = T[p3];
- memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
- l = buckets[c];
- }
- SA[--l] = p3;
- }
-
- for (; i >= 0; i -= 1) {
- sa_sint_t p = SA[i];
- if (T[p] != c) {
- c = T[p];
- memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
- l = buckets[c];
- }
- SA[--l] = p;
- }
-
- memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
-
- fast_sint_t c, j = n;
- for (c = (fast_sint_t)k - 2; c >= 0; --c) {
- fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
- if (l > 0) {
- fast_sint_t i = bucket_end[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
-
- fast_sint_t c, j = n;
- for (c = (fast_sint_t)k - 2; c >= 0; --c) {
- fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
- if (l > 0) {
- fast_sint_t i = bucket_end[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
- fast_sint_t j = n;
-
- if (k > 1) {
- fast_sint_t c;
- for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
- fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
- if (l > 0) {
- fast_sint_t i = buckets[c];
- if (j - i > 0) {
- memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
- }
-
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
- }
- }
- }
-
- memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
-}
-
-static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + prefetch_distance + 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + prefetch_distance + 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- SA[i + 0] = T[p0] | SAINT_MIN;
- SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- SA[i + 1] = T[p1] | SAINT_MIN;
- SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j += prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- SA[i] = T[p] | SAINT_MIN;
- SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
- }
- }
-}
-
-static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + prefetch_distance + 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + prefetch_distance + 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- SA[i + 0] = T[p0] | SAINT_MIN;
- SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
- if ((p0 & rm) == 0) {
- I[p0 / (rm + 1)] = induction_bucket[T[p0]];
- }
- }
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- SA[i + 1] = T[p1] | SAINT_MIN;
- SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
- if ((p1 & rm) == 0) {
- I[p1 / (rm + 1)] = induction_bucket[T[p1]];
- }
- }
- }
-
- for (j += prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- SA[i] = T[p] | SAINT_MIN;
- SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
- if ((p & rm) == 0) {
- I[p / (rm + 1)] = induction_bucket[T[p]];
- }
- }
- }
-}
-
-static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + prefetch_distance + 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + prefetch_distance + 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 ^ SAINT_MIN;
- if (p0 > 0) {
- p0--;
- SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 ^ SAINT_MIN;
- if (p1 > 0) {
- p1--;
- SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j += prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p ^ SAINT_MIN;
- if (p > 0) {
- p--;
- SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
- }
- }
-}
-
-static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) {
- libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0];
- const sa_sint_t * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1];
- const sa_sint_t * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0];
- if (s2 > 0) {
- libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
- libsais_prefetch(&T[s2] - 2);
- }
- sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1];
- if (s3 > 0) {
- libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
- libsais_prefetch(&T[s3] - 2);
- }
-
- sa_sint_t p0 = SA[i + 0];
- SA[i + 0] = p0 ^ SAINT_MIN;
- if (p0 > 0) {
- p0--;
- SA[induction_bucket[T[p0]]++] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i + 1];
- SA[i + 1] = p1 ^ SAINT_MIN;
- if (p1 > 0) {
- p1--;
- SA[induction_bucket[T[p1]]++] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j += 2 * prefetch_distance + 1; i < j; i += 1) {
- sa_sint_t p = SA[i];
- SA[i] = p ^ SAINT_MIN;
- if (p > 0) {
- p--;
- SA[induction_bucket[T[p]]++] = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
- }
- }
-}
-static void libsais_final_bwt_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
- ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
-
- if (threads == 1 || n < 65536) {
- libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- fast_sint_t n, sa_sint_t rm, sa_sint_t * RESTRICT I,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
- ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
-
- if ((((sa_sint_t)n - 1) & rm) == 0) {
- I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
- }
-
- if (threads == 1 || n < 65536) {
- libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_final_sorting_scan_left_to_right_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- fast_sint_t n, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
- ((sa_sint_t)n - 1) | ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
-
- if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
-
- if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- sa_sint_t index = -1;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - prefetch_distance - 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - prefetch_distance - 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i - 0];
- index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
- SA[i - 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
- SA[i - 0] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
- }
-
- sa_sint_t p1 = SA[i - 1];
- index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
- SA[i - 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
- SA[i - 1] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
- }
- }
-
- for (j -= prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- index = (p == 0) ? (sa_sint_t)i : index;
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- u8 c0 = T[p - (p > 0)], c1 = T[p];
- SA[i] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
- }
- }
-
- return index;
-}
-
-static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - prefetch_distance - 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - prefetch_distance - 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i - 0];
- SA[i - 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- u8 c0 = T[p0 - (p0 > 0)], c1 = T[p0];
- SA[i - 0] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t;
- if ((p0 & rm) == 0) {
- I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1;
- }
- }
-
- sa_sint_t p1 = SA[i - 1];
- SA[i - 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- u8 c0 = T[p1 - (p1 > 0)], c1 = T[p1];
- SA[i - 1] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t;
- if ((p1 & rm) == 0) {
- I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1;
- }
- }
- }
-
- for (j -= prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- u8 c0 = T[p - (p > 0)], c1 = T[p];
- SA[i] = c1;
- sa_sint_t t = c0 | SAINT_MIN;
- SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t;
- if ((p & rm) == 0) {
- I[p / (rm + 1)] = induction_bucket[T[p]] + 1;
- }
- }
- }
-}
-
-static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - prefetch_distance - 0];
- const u8 * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- Ts0--;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - prefetch_distance - 1];
- const u8 * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- Ts1--;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
-
- sa_sint_t p0 = SA[i - 0];
- SA[i - 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i - 1];
- SA[i - 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j -= prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
- }
- }
-}
-
-static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) {
- libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
-
- sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0];
- const sa_sint_t * Ts0 = &T[s0] - 1;
- libsais_prefetch(s0 > 0 ? Ts0 : NULL);
- sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1];
- const sa_sint_t * Ts1 = &T[s1] - 1;
- libsais_prefetch(s1 > 0 ? Ts1 : NULL);
- sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0];
- if (s2 > 0) {
- libsais_prefetchw(&induction_bucket[T[s2 - 1]]);
- libsais_prefetch(&T[s2] - 2);
- }
- sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1];
- if (s3 > 0) {
- libsais_prefetchw(&induction_bucket[T[s3 - 1]]);
- libsais_prefetch(&T[s3] - 2);
- }
-
- sa_sint_t p0 = SA[i - 0];
- SA[i - 0] = p0 & SAINT_MAX;
- if (p0 > 0) {
- p0--;
- SA[--induction_bucket[T[p0]]] = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
- }
- sa_sint_t p1 = SA[i - 1];
- SA[i - 1] = p1 & SAINT_MAX;
- if (p1 > 0) {
- p1--;
- SA[--induction_bucket[T[p1]]] = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
- }
- }
-
- for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SA[i] = p & SAINT_MAX;
- if (p > 0) {
- p--;
- SA[--induction_bucket[T[p]]] = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
- }
- }
-}
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t index = -1;
-
- if (threads == 1 || n < 65536) {
- index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
- return index;
-}
-
-static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t rm, sa_sint_t * RESTRICT I,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || n < 65536) {
- libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_final_sorting_scan_right_to_left_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
- }
- (void)(thread_state);
-}
-
-static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end,
- sa_sint_t threads) {
- fast_sint_t c;
- (void)(threads);
- (void)(n);
-
- for (c = 0; c < k; ++c) {
- if (bucket_end[c] > bucket_start[c]) {
- memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
- }
- }
-}
-
-static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (!bwt) {
- libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
- if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
- }
- libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
- return 0;
- } else if (I != NULL) {
- libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads,
- thread_state);
- if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
- }
- libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads,
- thread_state);
- return 0;
- } else {
- libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
- if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
- }
- return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE], threads,
- thread_state);
- }
-}
-
-static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads, thread_state);
-}
-
-static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_start_32s_1k(k, buckets);
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
-
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_end_32s_1k(k, buckets);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
-}
-
-static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t m, sa_sint_t f,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAm = &SA[m];
-
- sa_sint_t i, j;
- for (i = (sa_sint_t)omp_block_start,
- j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3;
- i < j; i += 4) {
- libsais_prefetch(&SA[i + 3 * prefetch_distance]);
-
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
- libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
-
- sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0];
- const sa_sint_t * Tq0 = &T[q0];
- libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL);
- sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1];
- const sa_sint_t * Tq1 = &T[q1];
- libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL);
- sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2];
- const sa_sint_t * Tq2 = &T[q2];
- libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL);
- sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3];
- const sa_sint_t * Tq3 = &T[q3];
- libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : NULL);
-
- sa_uint_t p0 = (sa_uint_t)SA[i + 0];
- sa_sint_t s0 = SAm[p0 >> 1];
- if (s0 < 0) {
- T[p0] |= SAINT_MIN;
- f++;
- s0 = i + 0 + SAINT_MIN + f;
- }
- SAm[p0 >> 1] = s0 - f;
- sa_uint_t p1 = (sa_uint_t)SA[i + 1];
- sa_sint_t s1 = SAm[p1 >> 1];
- if (s1 < 0) {
- T[p1] |= SAINT_MIN;
- f++;
- s1 = i + 1 + SAINT_MIN + f;
- }
- SAm[p1 >> 1] = s1 - f;
- sa_uint_t p2 = (sa_uint_t)SA[i + 2];
- sa_sint_t s2 = SAm[p2 >> 1];
- if (s2 < 0) {
- T[p2] |= SAINT_MIN;
- f++;
- s2 = i + 2 + SAINT_MIN + f;
- }
- SAm[p2 >> 1] = s2 - f;
- sa_uint_t p3 = (sa_uint_t)SA[i + 3];
- sa_sint_t s3 = SAm[p3 >> 1];
- if (s3 < 0) {
- T[p3] |= SAINT_MIN;
- f++;
- s3 = i + 3 + SAINT_MIN + f;
- }
- SAm[p3 >> 1] = s3 - f;
- }
-
- for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) {
- sa_uint_t p = (sa_uint_t)SA[i];
- sa_sint_t s = SAm[p >> 1];
- if (s < 0) {
- T[p] |= SAINT_MIN;
- f++;
- s = i + SAINT_MIN + f;
- }
- SAm[p >> 1] = s - f;
- }
-
- return f;
-}
-
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
- fast_sint_t * pl, fast_sint_t * pr,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- sa_sint_t * RESTRICT SAl = &SA[0];
- sa_sint_t * RESTRICT SAr = &SA[0];
-
- fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
- for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j;
- i -= 4) {
- libsais_prefetch(&SA[i - prefetch_distance]);
-
- sa_sint_t p0 = SA[i - 0];
- SAl[l] = p0 & SAINT_MAX;
- l -= p0 < 0;
- SAr[r] = p0 - 1;
- r -= p0 > 0;
- sa_sint_t p1 = SA[i - 1];
- SAl[l] = p1 & SAINT_MAX;
- l -= p1 < 0;
- SAr[r] = p1 - 1;
- r -= p1 > 0;
- sa_sint_t p2 = SA[i - 2];
- SAl[l] = p2 & SAINT_MAX;
- l -= p2 < 0;
- SAr[r] = p2 - 1;
- r -= p2 > 0;
- sa_sint_t p3 = SA[i - 3];
- SAl[l] = p3 & SAINT_MAX;
- l -= p3 < 0;
- SAr[r] = p3 - 1;
- r -= p3 > 0;
- }
-
- for (j -= 3; i >= j; i -= 1) {
- sa_sint_t p = SA[i];
- SAl[l] = p & SAINT_MAX;
- l -= p < 0;
- SAr[r] = p - 1;
- r -= p > 0;
- }
-
- *pl = l + 1;
- *pr = r + 1;
-}
-static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t f = 0;
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
- if (omp_num_threads == 1) {
- f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
- }
- }
-
- return f;
-}
-
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
-
- if (omp_num_threads == 1) {
- fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
- libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
- }
- }
-
- memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f],
- (size_t)f * sizeof(sa_sint_t));
-}
-
-static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
- libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
-
- return f;
-}
-
-static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
-
- sa_sint_t i, j;
- fast_sint_t tmp = *SAnm++;
- for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j;
- i += 4) {
- libsais_prefetch(&T[i + prefetch_distance]);
-
- sa_sint_t c0 = T[i + 0];
- if (c0 < 0) {
- T[i + 0] = c0 & SAINT_MAX;
- SA[tmp] = i + 0;
- i++;
- tmp = *SAnm++;
- }
- sa_sint_t c1 = T[i + 1];
- if (c1 < 0) {
- T[i + 1] = c1 & SAINT_MAX;
- SA[tmp] = i + 1;
- i++;
- tmp = *SAnm++;
- }
- sa_sint_t c2 = T[i + 2];
- if (c2 < 0) {
- T[i + 2] = c2 & SAINT_MAX;
- SA[tmp] = i + 2;
- i++;
- tmp = *SAnm++;
- }
- sa_sint_t c3 = T[i + 3];
- if (c3 < 0) {
- T[i + 3] = c3 & SAINT_MAX;
- SA[tmp] = i + 3;
- i++;
- tmp = *SAnm++;
- }
- }
-
- for (j += 6; i < j; i += 1) {
- sa_sint_t c = T[i];
- if (c < 0) {
- T[i] = c & SAINT_MAX;
- SA[tmp] = i;
- i++;
- tmp = *SAnm++;
- }
- }
-}
-
-static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
-
- fast_sint_t i, j;
- sa_sint_t tmp = *SAnm++;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
- libsais_prefetch(&SA[i + prefetch_distance]);
-
- if (SA[i + 0] == 0) {
- SA[i + 0] = tmp;
- tmp = *SAnm++;
- }
- if (SA[i + 1] == 0) {
- SA[i + 1] = tmp;
- tmp = *SAnm++;
- }
- if (SA[i + 2] == 0) {
- SA[i + 2] = tmp;
- tmp = *SAnm++;
- }
- if (SA[i + 3] == 0) {
- SA[i + 3] = tmp;
- tmp = *SAnm++;
- }
- }
-
- for (j += 3; i < j; i += 1) {
- if (SA[i] == 0) {
- SA[i] = tmp;
- tmp = *SAnm++;
- }
- }
-}
-
-static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
- }
- }
-}
-
-static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- {
- (void)(threads);
- (void)(thread_state);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
-
- if (omp_num_threads == 1) {
- libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
- }
- }
-}
-
-static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t f, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
- libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
-}
-
-static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs,
- sa_sint_t f, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (f > 0) {
- memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
-
- libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
- libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
-
- memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
- memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
-
- libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
- } else {
- libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
- libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
- }
-}
-
-static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- if (f > 0) {
- memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
-
- libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
- libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
-
- memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
- memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
-
- libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
- } else {
- libsais_gather_lms_suffixes_32s(T, SA, n);
- libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
- }
-}
-
-static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
- sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
-
- if (k > 0 && fs / k >= 6) {
- sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
- sa_sint_t * RESTRICT buckets =
- (fs - alignment) / k >= 6
- ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
- : &SA[n + fs - 6 * k];
-
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
- if (m > 1) {
- memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
-
- sa_sint_t first_lms_suffix = SA[n - m];
- sa_sint_t left_suffixes_count =
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);
-
- libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads, thread_state);
- libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
-
- if (threads > 1 && n >= 65536) {
- memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
- }
-
- libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
- libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
- threads, thread_state);
-
- sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
- if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
- thread_state) != 0) {
- return -2;
- }
-
- libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
- thread_state);
- } else {
- libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
- }
-
- libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
- libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
- } else {
- SA[0] = SA[n - 1];
-
- libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
- libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
- }
-
- return 0;
- } else if (k > 0 && fs / k >= 4) {
- sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
- sa_sint_t * RESTRICT buckets =
- (fs - alignment) / k >= 4
- ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
- : &SA[n + fs - 4 * k];
-
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
- if (m > 1) {
- libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);
-
- libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
- libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
-
- libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
- libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
-
- sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
- if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
- thread_state) != 0) {
- return -2;
- }
-
- libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
- thread_state);
- } else {
- libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
- }
- } else {
- SA[0] = SA[n - 1];
- }
-
- libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
- libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
-
- return 0;
- } else if (k > 0 && fs / k >= 2) {
- sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
- sa_sint_t * RESTRICT buckets =
- (fs - alignment) / k >= 2
- ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(sa_sint_t))
- : &SA[n + fs - 2 * k];
-
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
- if (m > 1) {
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
-
- libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
- libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
-
- libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
- libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
-
- sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
- if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
- thread_state) != 0) {
- return -2;
- }
-
- libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, threads,
- thread_state);
- } else {
- libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
- }
- } else {
- SA[0] = SA[n - 1];
- }
-
- libsais_initialize_buckets_end_32s_2k(k, buckets);
- libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
-
- libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
- libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
-
- return 0;
- } else {
- sa_sint_t * buffer =
- fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;
-
- sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
- sa_sint_t * RESTRICT buckets =
- fs - alignment >= k
- ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t))
- : fs >= k ? &SA[n + fs - k]
- : buffer;
-
- if (buckets == NULL) {
- return -2;
- }
-
- memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
-
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_end_32s_1k(k, buckets);
-
- sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
- if (m > 1) {
- libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
-
- sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
- if (names < m) {
- if (buffer != NULL) {
- libsais_free_aligned(buffer);
- buckets = NULL;
- }
-
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
-
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads,
- thread_state) != 0) {
- return -2;
- }
-
- libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);
-
- if (buckets == NULL) {
- buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
- }
- if (buckets == NULL) {
- return -2;
- }
- }
-
- libsais_count_suffixes_32s(T, n, k, buckets);
- libsais_initialize_buckets_end_32s_1k(k, buckets);
- libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
- }
-
- libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
- libsais_free_aligned(buffer);
-
- return 0;
- }
-}
-
-static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
- sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
-
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
-
- libsais_initialize_buckets_start_and_end_8u(buckets, freq);
-
- if (m > 0) {
- sa_sint_t first_lms_suffix = SA[n - m];
- sa_sint_t left_suffixes_count =
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
-
- if (threads > 1 && n >= 65536) {
- memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
- }
- libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
- if (threads > 1 && n >= 65536) {
- memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
- }
-
- libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count);
- libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, threads,
- thread_state);
-
- sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
- if (names < m) {
- if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) {
- return -2;
- }
-
- libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state);
- libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
- }
-
- libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets);
- } else {
- memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
- }
-
- return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
-}
-
-static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
- sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
- LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
- sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
-
- sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
- ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
- : -2;
-
- libsais_free_aligned(buckets);
- libsais_free_thread_state(thread_state);
-
- return index;
-}
-
-static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
- LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
-
- sa_sint_t index =
- thread_state != NULL || threads == 1 ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state) : -2;
-
- libsais_free_thread_state(thread_state);
-
- return index;
-}
-
-static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt,
- sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) {
- return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
- ? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq, (sa_sint_t)ctx->threads,
- ctx->thread_state)
- : -2;
-}
-
-static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) {
- libsais_prefetch(&A[i + prefetch_distance]);
-
- U[i + 0] = (u8)A[i + 0];
- U[i + 1] = (u8)A[i + 1];
- U[i + 2] = (u8)A[i + 2];
- U[i + 3] = (u8)A[i + 3];
- U[i + 4] = (u8)A[i + 4];
- U[i + 5] = (u8)A[i + 5];
- U[i + 6] = (u8)A[i + 6];
- U[i + 7] = (u8)A[i + 7];
- }
-
- for (j += 7; i < j; i += 1) {
- U[i] = (u8)A[i];
- }
-}
-void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); }
-
-void libsais_free_ctx(void * ctx) { libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); }
-
-s32 libsais(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
- if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
- return -1;
- } else if (n < 2) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- SA[0] = 0;
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- return 0;
- }
-
- return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1);
-}
-
-s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs) {
- if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
- return -1;
- } else if (n < 2) {
- if (n == 1) {
- SA[0] = 0;
- }
- return 0;
- }
-
- return libsais_main_int(T, SA, n, k, fs, 1);
-}
-
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
- if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
- return -1;
- } else if (n < 2) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- SA[0] = 0;
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- return 0;
- }
-
- return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
-}
-
-s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
- return -1;
- } else if (n <= 1) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- U[0] = T[0];
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- return n;
- }
-
- sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1);
- if (index >= 0) {
- index++;
-
- U[0] = T[n - 1];
- libsais_bwt_copy_8u(U + 1, A, index - 1);
- libsais_bwt_copy_8u(U + index, A + index, n - index);
- }
-
- return index;
-}
-
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) ||
- (I == NULL)) {
- return -1;
- } else if (n <= 1) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- U[0] = T[0];
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- I[0] = n;
- return 0;
- }
-
- if (libsais_main(T, A, n, 1, r, I, fs, freq, 1) != 0) {
- return -2;
- }
-
- U[0] = T[n - 1];
- libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
- libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
-
- return 0;
-}
-
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
- if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
- return -1;
- } else if (n <= 1) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- U[0] = T[0];
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- return n;
- }
-
- sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
- if (index >= 0) {
- index++;
-
- U[0] = T[n - 1];
-
- libsais_bwt_copy_8u(U + 1, A, index - 1);
- libsais_bwt_copy_8u(U + index, A + index, n - index);
- }
-
- return index;
-}
-
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
- if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
- ((r & (r - 1)) != 0) || (I == NULL)) {
- return -1;
- } else if (n <= 1) {
- if (freq != NULL) {
- memset(freq, 0, ALPHABET_SIZE * sizeof(s32));
- }
- if (n == 1) {
- U[0] = T[0];
- if (freq != NULL) {
- freq[T[0]]++;
- }
- }
- I[0] = n;
- return 0;
- }
-
- if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
- return -2;
- }
-
- U[0] = T[n - 1];
- libsais_bwt_copy_8u(U + 1, A, I[0] - 1);
- libsais_bwt_copy_8u(U + I[0], A + I[0], n - I[0]);
- return 0;
-}
-static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) {
- LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
- (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
- sa_uint_t * RESTRICT bucket2 =
- (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
- u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
- sa_uint_t * RESTRICT buckets =
- threads > 1 ? (sa_uint_t *)libsais_alloc_aligned(
- (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096)
- : NULL;
-
- if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) {
- ctx->bucket2 = bucket2;
- ctx->fastbits = fastbits;
- ctx->buckets = buckets;
- ctx->threads = threads;
-
- return ctx;
- }
-
- libsais_free_aligned(buckets);
- libsais_free_aligned(fastbits);
- libsais_free_aligned(bucket2);
- libsais_free_aligned(ctx);
-
- return NULL;
-}
-
-static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
- if (ctx != NULL) {
- libsais_free_aligned(ctx->buckets);
- libsais_free_aligned(ctx->fastbits);
- libsais_free_aligned(ctx->bucket2);
- libsais_free_aligned(ctx);
- }
-}
-
-static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) {
- const fast_sint_t prefetch_distance = 256;
-
- const u8 * RESTRICT T_p = T;
-
- if (n >= 1024) {
- sa_uint_t copy[4 * (ALPHABET_SIZE + 16)];
-
- memset(copy, 0, 4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t));
-
- sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16);
- sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16);
- sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16);
- sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16);
-
- for (; T_p < (u8 *)((ptrdiff_t)(T + 63) & (-64)); T_p += 1) {
- copy0[T_p[0]]++;
- }
-
- fast_uint_t x = ((const u32 *)(const void *)T_p)[0], y = ((const u32 *)(const void *)T_p)[1];
-
- for (; T_p < (u8 *)((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) {
- libsais_prefetch(&T_p[prefetch_distance]);
-
- fast_uint_t z = ((const u32 *)(const void *)T_p)[2], w = ((const u32 *)(const void *)T_p)[3];
- copy0[(u8)x]++;
- x >>= 8;
- copy1[(u8)x]++;
- x >>= 8;
- copy2[(u8)x]++;
- x >>= 8;
- copy3[x]++;
- copy0[(u8)y]++;
- y >>= 8;
- copy1[(u8)y]++;
- y >>= 8;
- copy2[(u8)y]++;
- y >>= 8;
- copy3[y]++;
-
- x = ((const u32 *)(const void *)T_p)[4];
- y = ((const u32 *)(const void *)T_p)[5];
- copy0[(u8)z]++;
- z >>= 8;
- copy1[(u8)z]++;
- z >>= 8;
- copy2[(u8)z]++;
- z >>= 8;
- copy3[z]++;
- copy0[(u8)w]++;
- w >>= 8;
- copy1[(u8)w]++;
- w >>= 8;
- copy2[(u8)w]++;
- w >>= 8;
- copy3[w]++;
-
- z = ((const u32 *)(const void *)T_p)[6];
- w = ((const u32 *)(const void *)T_p)[7];
- copy0[(u8)x]++;
- x >>= 8;
- copy1[(u8)x]++;
- x >>= 8;
- copy2[(u8)x]++;
- x >>= 8;
- copy3[x]++;
- copy0[(u8)y]++;
- y >>= 8;
- copy1[(u8)y]++;
- y >>= 8;
- copy2[(u8)y]++;
- y >>= 8;
- copy3[y]++;
-
- x = ((const u32 *)(const void *)T_p)[8];
- y = ((const u32 *)(const void *)T_p)[9];
- copy0[(u8)z]++;
- z >>= 8;
- copy1[(u8)z]++;
- z >>= 8;
- copy2[(u8)z]++;
- z >>= 8;
- copy3[z]++;
- copy0[(u8)w]++;
- w >>= 8;
- copy1[(u8)w]++;
- w >>= 8;
- copy2[(u8)w]++;
- w >>= 8;
- copy3[w]++;
-
- z = ((const u32 *)(const void *)T_p)[10];
- w = ((const u32 *)(const void *)T_p)[11];
- copy0[(u8)x]++;
- x >>= 8;
- copy1[(u8)x]++;
- x >>= 8;
- copy2[(u8)x]++;
- x >>= 8;
- copy3[x]++;
- copy0[(u8)y]++;
- y >>= 8;
- copy1[(u8)y]++;
- y >>= 8;
- copy2[(u8)y]++;
- y >>= 8;
- copy3[y]++;
-
- x = ((const u32 *)(const void *)T_p)[12];
- y = ((const u32 *)(const void *)T_p)[13];
- copy0[(u8)z]++;
- z >>= 8;
- copy1[(u8)z]++;
- z >>= 8;
- copy2[(u8)z]++;
- z >>= 8;
- copy3[z]++;
- copy0[(u8)w]++;
- w >>= 8;
- copy1[(u8)w]++;
- w >>= 8;
- copy2[(u8)w]++;
- w >>= 8;
- copy3[w]++;
-
- z = ((const u32 *)(const void *)T_p)[14];
- w = ((const u32 *)(const void *)T_p)[15];
- copy0[(u8)x]++;
- x >>= 8;
- copy1[(u8)x]++;
- x >>= 8;
- copy2[(u8)x]++;
- x >>= 8;
- copy3[x]++;
- copy0[(u8)y]++;
- y >>= 8;
- copy1[(u8)y]++;
- y >>= 8;
- copy2[(u8)y]++;
- y >>= 8;
- copy3[y]++;
-
- x = ((const u32 *)(const void *)T_p)[16];
- y = ((const u32 *)(const void *)T_p)[17];
- copy0[(u8)z]++;
- z >>= 8;
- copy1[(u8)z]++;
- z >>= 8;
- copy2[(u8)z]++;
- z >>= 8;
- copy3[z]++;
- copy0[(u8)w]++;
- w >>= 8;
- copy1[(u8)w]++;
- w >>= 8;
- copy2[(u8)w]++;
- w >>= 8;
- copy3[w]++;
- }
-
- copy0[(u8)x]++;
- x >>= 8;
- copy1[(u8)x]++;
- x >>= 8;
- copy2[(u8)x]++;
- x >>= 8;
- copy3[x]++;
- copy0[(u8)y]++;
- y >>= 8;
- copy1[(u8)y]++;
- y >>= 8;
- copy2[(u8)y]++;
- y >>= 8;
- copy3[y]++;
-
- T_p += 8;
-
- fast_uint_t i;
- for (i = 0; i < ALPHABET_SIZE; i++) {
- count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i];
- }
- }
-
- for (; T_p < T + n; T_p += 1) {
- count[T_p[0]]++;
- }
-}
-
-static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
- fast_uint_t x, y, c, d;
- for (x = 0; x != ALPHABET_SIZE; x += 16) {
- for (c = x; c != x + 16; ++c) {
- for (d = c + 1; d != x + 16; ++d) {
- sa_uint_t tmp = bucket2[(d << 8) + c];
- bucket2[(d << 8) + c] = bucket2[(c << 8) + d];
- bucket2[(c << 8) + d] = tmp;
- }
- }
-
- for (y = x + 16; y != ALPHABET_SIZE; y += 16) {
- for (c = x; c != x + 16; ++c) {
- sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c];
- sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y];
-
- sa_uint_t tmp00 = bucket2_yc[0 * 256];
- bucket2_yc[0 * 256] = bucket2_cy[0];
- bucket2_cy[0] = tmp00;
- sa_uint_t tmp01 = bucket2_yc[1 * 256];
- bucket2_yc[1 * 256] = bucket2_cy[1];
- bucket2_cy[1] = tmp01;
- sa_uint_t tmp02 = bucket2_yc[2 * 256];
- bucket2_yc[2 * 256] = bucket2_cy[2];
- bucket2_cy[2] = tmp02;
- sa_uint_t tmp03 = bucket2_yc[3 * 256];
- bucket2_yc[3 * 256] = bucket2_cy[3];
- bucket2_cy[3] = tmp03;
- sa_uint_t tmp04 = bucket2_yc[4 * 256];
- bucket2_yc[4 * 256] = bucket2_cy[4];
- bucket2_cy[4] = tmp04;
- sa_uint_t tmp05 = bucket2_yc[5 * 256];
- bucket2_yc[5 * 256] = bucket2_cy[5];
- bucket2_cy[5] = tmp05;
- sa_uint_t tmp06 = bucket2_yc[6 * 256];
- bucket2_yc[6 * 256] = bucket2_cy[6];
- bucket2_cy[6] = tmp06;
- sa_uint_t tmp07 = bucket2_yc[7 * 256];
- bucket2_yc[7 * 256] = bucket2_cy[7];
- bucket2_cy[7] = tmp07;
- sa_uint_t tmp08 = bucket2_yc[8 * 256];
- bucket2_yc[8 * 256] = bucket2_cy[8];
- bucket2_cy[8] = tmp08;
- sa_uint_t tmp09 = bucket2_yc[9 * 256];
- bucket2_yc[9 * 256] = bucket2_cy[9];
- bucket2_cy[9] = tmp09;
- sa_uint_t tmp10 = bucket2_yc[10 * 256];
- bucket2_yc[10 * 256] = bucket2_cy[10];
- bucket2_cy[10] = tmp10;
- sa_uint_t tmp11 = bucket2_yc[11 * 256];
- bucket2_yc[11 * 256] = bucket2_cy[11];
- bucket2_cy[11] = tmp11;
- sa_uint_t tmp12 = bucket2_yc[12 * 256];
- bucket2_yc[12 * 256] = bucket2_cy[12];
- bucket2_cy[12] = tmp12;
- sa_uint_t tmp13 = bucket2_yc[13 * 256];
- bucket2_yc[13 * 256] = bucket2_cy[13];
- bucket2_cy[13] = tmp13;
- sa_uint_t tmp14 = bucket2_yc[14 * 256];
- bucket2_yc[14 * 256] = bucket2_cy[14];
- bucket2_cy[14] = tmp14;
- sa_uint_t tmp15 = bucket2_yc[15 * 256];
- bucket2_yc[15 * 256] = bucket2_cy[15];
- bucket2_cy[15] = tmp15;
- }
- }
- }
-}
-
-static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT bucket1,
- sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
- fast_uint_t sum, c;
- for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
- fast_uint_t prev = sum;
- sum += bucket1[c];
- bucket1[c] = (sa_uint_t)prev;
- if (prev != sum) {
- sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8];
-
- {
- fast_uint_t hi = index;
- if (sum < hi) {
- hi = sum;
- }
- libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
- }
-
- {
- fast_uint_t lo = index + 1;
- if (prev > lo) {
- lo = prev;
- }
- libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
- }
- }
- }
-
- libsais_unbwt_transpose_bucket2(bucket2);
-}
-
-static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, fast_uint_t lastc,
- fast_uint_t shift) {
- fast_uint_t v, w, sum, c, d;
- for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
- if (c == lastc) {
- sum += 1;
- }
-
- for (d = 0; d < ALPHABET_SIZE; ++d, ++w) {
- fast_uint_t prev = sum;
- sum += bucket2[w];
- bucket2[w] = (sa_uint_t)prev;
- if (prev != sum) {
- for (; v <= ((sum - 1) >> shift); ++v) {
- fastbits[v] = (u16)w;
- }
- }
- }
- }
-}
-
-static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1,
- sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start,
- fast_sint_t omp_block_end) {
- {
- fast_sint_t i = omp_block_start, j = (fast_sint_t)index;
- if (omp_block_end < j) {
- j = omp_block_end;
- }
- for (; i < j; ++i) {
- fast_uint_t c = T[i];
- fast_uint_t p = bucket1[c]++;
- fast_sint_t t = (fast_sint_t)(index - p);
-
- if (t != 0) {
- fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
- P[bucket2[w]++] = (sa_uint_t)i;
- }
- }
- }
-
- {
- fast_sint_t i = (fast_sint_t)index, j = omp_block_end;
- if (omp_block_start > i) {
- i = omp_block_start;
- }
- for (i += 1; i <= j; ++i) {
- fast_uint_t c = T[i - 1];
- fast_uint_t p = bucket1[c]++;
- fast_sint_t t = (fast_sint_t)(index - p);
-
- if (t != 0) {
- fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
- P[bucket2[w]++] = (sa_uint_t)i;
- }
- }
- }
-}
-
-static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
- const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits) {
- sa_uint_t bucket1[ALPHABET_SIZE];
-
- fast_uint_t index = I[0];
- fast_uint_t lastc = T[0];
- fast_uint_t shift = 0;
- while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
- shift++;
- }
-
- if (freq != NULL) {
- memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
- } else {
- memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
- libsais_unbwt_compute_histogram(T, n, bucket1);
- }
-
- memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
- libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, index);
-
- libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
- libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
-}
-static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
-
- fast_uint_t i, p0 = *i0;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- }
-
- *i0 = p0;
-}
-
-static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- }
-
- *i0 = p0;
- *i1 = p1;
-}
-
-static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
-}
-
-static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
- u16 * RESTRICT U3 = (u16 *)(void *)(((u8 *)U2) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- u16 c3 = fastbits[p3 >> shift];
- if (bucket2[c3] <= p3) {
- do {
- c3++;
- } while (bucket2[c3] <= p3);
- }
- p3 = P[p3];
- U3[i] = libsais_bswap16(c3);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
- *i3 = p3;
-}
-
-static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
- u16 * RESTRICT U3 = (u16 *)(void *)(((u8 *)U2) + r);
- u16 * RESTRICT U4 = (u16 *)(void *)(((u8 *)U3) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- u16 c3 = fastbits[p3 >> shift];
- if (bucket2[c3] <= p3) {
- do {
- c3++;
- } while (bucket2[c3] <= p3);
- }
- p3 = P[p3];
- U3[i] = libsais_bswap16(c3);
- u16 c4 = fastbits[p4 >> shift];
- if (bucket2[c4] <= p4) {
- do {
- c4++;
- } while (bucket2[c4] <= p4);
- }
- p4 = P[p4];
- U4[i] = libsais_bswap16(c4);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
- *i3 = p3;
- *i4 = p4;
-}
-
-static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
- u16 * RESTRICT U3 = (u16 *)(void *)(((u8 *)U2) + r);
- u16 * RESTRICT U4 = (u16 *)(void *)(((u8 *)U3) + r);
- u16 * RESTRICT U5 = (u16 *)(void *)(((u8 *)U4) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- u16 c3 = fastbits[p3 >> shift];
- if (bucket2[c3] <= p3) {
- do {
- c3++;
- } while (bucket2[c3] <= p3);
- }
- p3 = P[p3];
- U3[i] = libsais_bswap16(c3);
- u16 c4 = fastbits[p4 >> shift];
- if (bucket2[c4] <= p4) {
- do {
- c4++;
- } while (bucket2[c4] <= p4);
- }
- p4 = P[p4];
- U4[i] = libsais_bswap16(c4);
- u16 c5 = fastbits[p5 >> shift];
- if (bucket2[c5] <= p5) {
- do {
- c5++;
- } while (bucket2[c5] <= p5);
- }
- p5 = P[p5];
- U5[i] = libsais_bswap16(c5);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
- *i3 = p3;
- *i4 = p4;
- *i5 = p5;
-}
-
-static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
- u16 * RESTRICT U3 = (u16 *)(void *)(((u8 *)U2) + r);
- u16 * RESTRICT U4 = (u16 *)(void *)(((u8 *)U3) + r);
- u16 * RESTRICT U5 = (u16 *)(void *)(((u8 *)U4) + r);
- u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- u16 c3 = fastbits[p3 >> shift];
- if (bucket2[c3] <= p3) {
- do {
- c3++;
- } while (bucket2[c3] <= p3);
- }
- p3 = P[p3];
- U3[i] = libsais_bswap16(c3);
- u16 c4 = fastbits[p4 >> shift];
- if (bucket2[c4] <= p4) {
- do {
- c4++;
- } while (bucket2[c4] <= p4);
- }
- p4 = P[p4];
- U4[i] = libsais_bswap16(c4);
- u16 c5 = fastbits[p5 >> shift];
- if (bucket2[c5] <= p5) {
- do {
- c5++;
- } while (bucket2[c5] <= p5);
- }
- p5 = P[p5];
- U5[i] = libsais_bswap16(c5);
- u16 c6 = fastbits[p6 >> shift];
- if (bucket2[c6] <= p6) {
- do {
- c6++;
- } while (bucket2[c6] <= p6);
- }
- p6 = P[p6];
- U6[i] = libsais_bswap16(c6);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
- *i3 = p3;
- *i4 = p4;
- *i5 = p5;
- *i6 = p6;
-}
-
-static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) {
- u16 * RESTRICT U0 = (u16 *)(void *)U;
- u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
- u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
- u16 * RESTRICT U3 = (u16 *)(void *)(((u8 *)U2) + r);
- u16 * RESTRICT U4 = (u16 *)(void *)(((u8 *)U3) + r);
- u16 * RESTRICT U5 = (u16 *)(void *)(((u8 *)U4) + r);
- u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
- u16 * RESTRICT U7 = (u16 *)(void *)(((u8 *)U6) + r);
-
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
-
- for (i = 0; i != k; ++i) {
- u16 c0 = fastbits[p0 >> shift];
- if (bucket2[c0] <= p0) {
- do {
- c0++;
- } while (bucket2[c0] <= p0);
- }
- p0 = P[p0];
- U0[i] = libsais_bswap16(c0);
- u16 c1 = fastbits[p1 >> shift];
- if (bucket2[c1] <= p1) {
- do {
- c1++;
- } while (bucket2[c1] <= p1);
- }
- p1 = P[p1];
- U1[i] = libsais_bswap16(c1);
- u16 c2 = fastbits[p2 >> shift];
- if (bucket2[c2] <= p2) {
- do {
- c2++;
- } while (bucket2[c2] <= p2);
- }
- p2 = P[p2];
- U2[i] = libsais_bswap16(c2);
- u16 c3 = fastbits[p3 >> shift];
- if (bucket2[c3] <= p3) {
- do {
- c3++;
- } while (bucket2[c3] <= p3);
- }
- p3 = P[p3];
- U3[i] = libsais_bswap16(c3);
- u16 c4 = fastbits[p4 >> shift];
- if (bucket2[c4] <= p4) {
- do {
- c4++;
- } while (bucket2[c4] <= p4);
- }
- p4 = P[p4];
- U4[i] = libsais_bswap16(c4);
- u16 c5 = fastbits[p5 >> shift];
- if (bucket2[c5] <= p5) {
- do {
- c5++;
- } while (bucket2[c5] <= p5);
- }
- p5 = P[p5];
- U5[i] = libsais_bswap16(c5);
- u16 c6 = fastbits[p6 >> shift];
- if (bucket2[c6] <= p6) {
- do {
- c6++;
- } while (bucket2[c6] <= p6);
- }
- p6 = P[p6];
- U6[i] = libsais_bswap16(c6);
- u16 c7 = fastbits[p7 >> shift];
- if (bucket2[c7] <= p7) {
- do {
- c7++;
- } while (bucket2[c7] <= p7);
- }
- p7 = P[p7];
- U7[i] = libsais_bswap16(c7);
- }
-
- *i0 = p0;
- *i1 = p1;
- *i2 = p2;
- *i3 = p3;
- *i4 = p4;
- *i5 = p5;
- *i6 = p6;
- *i7 = p7;
-}
-
-static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r,
- const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
- fast_sint_t blocks, fast_uint_t reminder) {
- fast_uint_t shift = 0;
- while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
- shift++;
- }
- fast_uint_t offset = 0;
-
- while (blocks > 8) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
- libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, &i7, (fast_uint_t)r >> 1);
- I += 8;
- blocks -= 8;
- offset += 8 * (fast_uint_t)r;
- }
-
- if (blocks == 1) {
- fast_uint_t i0 = I[0];
- libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
- } else if (blocks == 2) {
- fast_uint_t i0 = I[0], i1 = I[1];
- libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, reminder >> 1);
- libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0,
- ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else if (blocks == 3) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
- libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
- libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else if (blocks == 4) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
- libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3,
- reminder >> 1);
- libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- &i2, ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else if (blocks == 5) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
- libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
- reminder >> 1);
- libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- &i2, &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else if (blocks == 6) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
- libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- reminder >> 1);
- libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else if (blocks == 7) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
- libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, reminder >> 1);
- libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (reminder >> 1));
- } else {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
- libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, &i7, reminder >> 1);
- libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
- &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (reminder >> 1));
- }
-}
-
-static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
- sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, sa_sint_t threads) {
- fast_uint_t lastc = T[0];
- fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
- fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
-
- {
- (void)(threads);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
- fast_sint_t omp_block_stride = blocks / omp_num_threads;
- fast_sint_t omp_block_reminder = blocks % omp_num_threads;
- fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
- fast_sint_t omp_block_start = omp_block_stride * omp_thread_num +
- (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
-
- libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size,
- omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
- }
-
- U[n - 1] = (u8)lastc;
-}
-
-static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
- const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
- sa_sint_t threads) {
- (void)(buckets);
-
- { libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); }
-
- libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads);
- return 0;
-}
-
-static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
- sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) {
- fast_uint_t shift = 0;
- while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
- shift++;
- }
-
- sa_uint_t * RESTRICT bucket2 =
- (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
- u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
- memset(fastbits, 0, ((size_t)1 + (size_t)(n >> shift)) * sizeof(u16));
- sa_uint_t * RESTRICT buckets =
- threads > 1 && n >= 262144
- ? (sa_uint_t *)libsais_alloc_aligned(
- (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096)
- : NULL;
-
- sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
- ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
- : -2;
-
- libsais_free_aligned(buckets);
- libsais_free_aligned(fastbits);
- libsais_free_aligned(bucket2);
-
- return index;
-}
-
-static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U, sa_uint_t * P,
- sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) {
- return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1)
- ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets,
- (sa_sint_t)ctx->threads)
- : -2;
-}
-
-void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); }
-
-void libsais_unbwt_free_ctx(void * ctx) { libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); }
-
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
- return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
-}
-
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
- return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
-}
-
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) ||
- (I == NULL)) {
- return -1;
- } else if (n <= 1) {
- if (I[0] != n) {
- return -1;
- }
- if (n == 1) {
- U[0] = T[0];
- }
- return 0;
- }
-
- fast_sint_t t;
- for (t = 0; t <= (n - 1) / r; ++t) {
- if (I[t] <= 0 || I[t] > n) {
- return -1;
- }
- }
-
- return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
-}
-
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
- const s32 * I) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) ||
- (I == NULL)) {
- return -1;
- } else if (n <= 1) {
- if (I[0] != n) {
- return -1;
- }
- if (n == 1) {
- U[0] = T[0];
- }
- return 0;
- }
-
- fast_sint_t t;
- for (t = 0; t <= (n - 1) / r; ++t) {
- if (I[t] <= 0 || I[t] > n) {
- return -1;
- }
- }
-
- return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r,
- (const sa_uint_t *)I);
-}
-static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
- libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
-
- PLCP[SA[i + 0]] = k;
- k = SA[i + 0];
- PLCP[SA[i + 1]] = k;
- k = SA[i + 1];
-
- libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
- libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);
-
- PLCP[SA[i + 2]] = k;
- k = SA[i + 2];
- PLCP[SA[i + 3]] = k;
- k = SA[i + 3];
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- PLCP[SA[i]] = k;
- k = SA[i];
- }
-}
-
-static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
- sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j, l = 0;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) {
- libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]);
-
- fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
- while (l < m && T[i + l] == T[k + l]) {
- l++;
- }
-
- PLCP[i] = (sa_sint_t)l;
- l -= (l != 0);
- }
-
- for (j += prefetch_distance; i < j; i += 1) {
- fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
- while (l < m && T[i + l] == T[k + l]) {
- l++;
- }
-
- PLCP[i] = (sa_sint_t)l;
- l -= (l != 0);
- }
-}
-
-static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
- }
-}
-
-static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
- const fast_sint_t prefetch_distance = 32;
-
- fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) {
- libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
- libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
-
- LCP[i + 0] = PLCP[SA[i + 0]];
- LCP[i + 1] = PLCP[SA[i + 1]];
-
- libsais_prefetch(&PLCP[SA[i + prefetch_distance + 2]]);
- libsais_prefetch(&PLCP[SA[i + prefetch_distance + 3]]);
-
- LCP[i + 2] = PLCP[SA[i + 2]];
- LCP[i + 3] = PLCP[SA[i + 3]];
- }
-
- for (j += prefetch_distance + 3; i < j; i += 1) {
- LCP[i] = PLCP[SA[i]];
- }
-}
-
-static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) {
- {
- (void)(threads);
-
- fast_sint_t omp_thread_num = 0;
- fast_sint_t omp_num_threads = 1;
-
- fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
-
- libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
- }
-}
-
-s32 libsais_plcp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n) {
- if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) {
- return -1;
- } else if (n <= 1) {
- if (n == 1) {
- PLCP[0] = 0;
- }
- return 0;
- }
-
- libsais_compute_phi_omp(SA, PLCP, n, 1);
- libsais_compute_plcp_omp(T, PLCP, n, 1);
-
- return 0;
-}
-
-s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n) {
- if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) {
- return -1;
- } else if (n <= 1) {
- if (n == 1) {
- LCP[0] = PLCP[SA[0]];
- }
- return 0;
- }
-
- libsais_compute_lcp_omp(PLCP, SA, LCP, n, 1);
-
- return 0;
-}
diff --git a/src/lzp.c b/src/lzp.c
deleted file mode 100644
index ccb0a8b..0000000
--- a/src/lzp.c
+++ /dev/null
@@ -1,142 +0,0 @@
-
-// Lempel Ziv Prediction code.
-// A heavily modified version of libbsc's LZP predictor. This one has single thread performance and provides better
-// compression ratio. It is also mostly UB-free and less brittle during AFL fuzzing.
-
-#include <memory.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-
-#define MATCH 0xf2
-
-static s32 lzp_encode_block(const u8 * RESTRICT in, const u8 * in_end, u8 * RESTRICT out, u8 * out_end,
- s32 * RESTRICT lut, s32 mask, s32 m_len) {
- const u8 *ins = in, *outs = out;
- const u8 * out_eob = out_end - 8;
- const u8 * heur = in;
-
- u32 ctx;
-
- for (s32 i = 0; i < 4; ++i) *out++ = *in++;
-
- ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
-
- while (in < in_end - m_len - 32 && out < out_eob) {
- u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
- s32 val = lut[idx];
- lut[idx] = in - ins;
- if (val > 0) {
- const u8 * RESTRICT ref = ins + val;
- if (memcmp(in + m_len - 4, ref + m_len - 4, sizeof(u32)) == 0 && memcmp(in, ref, sizeof(u32)) == 0) {
- if (heur > in && *(u32 *)heur != *(u32 *)(ref + (heur - in))) goto not_found;
-
- s32 len = 4;
- for (; in + len < in_end - m_len - 32; len += sizeof(u32)) {
- if (*(u32 *)(in + len) != *(u32 *)(ref + len)) break;
- }
-
- if (len < m_len) {
- if (heur < in + len) heur = in + len;
- goto not_found;
- }
-
- len += in[len] == ref[len];
- len += in[len] == ref[len];
- len += in[len] == ref[len];
-
- in += len;
- ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
-
- *out++ = MATCH;
-
- len -= m_len;
- while (len >= 254) {
- len -= 254;
- *out++ = 254;
- if (out >= out_eob) break;
- }
-
- *out++ = len;
- } else {
- not_found:;
- u8 next = *out++ = *in++;
- ctx = ctx << 8 | next;
- if (next == MATCH) *out++ = 255;
- }
- } else {
- ctx = (ctx << 8) | (*out++ = *in++);
- }
- }
-
- ctx = ((u32)in[-1]) | (((u32)in[-2]) << 8) | (((u32)in[-3]) << 16) | (((u32)in[-4]) << 24);
-
- while (in < in_end && out < out_eob) {
- u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
- s32 val = lut[idx];
- lut[idx] = (s32)(in - ins);
-
- u8 next = *out++ = *in++;
- ctx = ctx << 8 | next;
- if (next == MATCH && val > 0) *out++ = 255;
- }
-
- return out >= out_eob ? -1 : (s32)(out - outs);
-}
-
-static s32 lzp_decode_block(const u8 * RESTRICT in, const u8 * in_end, s32 * RESTRICT lut, u8 * RESTRICT out, s32 hash,
- s32 m_len) {
- if (in_end - in < 4) return -1;
-
- memset(lut, 0, sizeof(s32) * (1 << hash));
-
- u32 mask = (s32)(1 << hash) - 1;
- const u8 * outs = out;
-
- for (s32 i = 0; i < 4; ++i) *out++ = *in++;
-
- u32 ctx = ((u32)out[-1]) | (((u32)out[-2]) << 8) | (((u32)out[-3]) << 16) | (((u32)out[-4]) << 24);
-
- while (in < in_end) {
- u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & mask;
- s32 val = lut[idx];
- lut[idx] = (s32)(out - outs);
- if (*in == MATCH && val > 0) {
- in++;
- if (*in != 255) {
- s32 len = m_len;
- while (1) {
- len += *in;
- if (*in++ != 254) break;
- }
-
- const u8 * ref = outs + val;
- u8 * out_end = out + len;
-
- while (out < out_end) *out++ = *ref++;
-
- ctx = ((u32)out[-1]) | (((u32)out[-2]) << 8) | (((u32)out[-3]) << 16) | (((u32)out[-4]) << 24);
- } else {
- in++;
- ctx = (ctx << 8) | (*out++ = MATCH);
- }
- } else {
- ctx = (ctx << 8) | (*out++ = *in++);
- }
- }
-
- return out - outs;
-}
-
-s32 lzp_compress(const u8 * RESTRICT in, u8 * RESTRICT out, s32 n, s32 hash, s32 m_len, s32 * RESTRICT lut) {
- if (n - m_len < 32) return -1;
-
- memset(lut, 0, sizeof(s32) * (1 << hash));
-
- return lzp_encode_block(in, in + n, out, out + n, lut, (s32)(1 << hash) - 1, m_len);
-}
-
-s32 lzp_decompress(const u8 * RESTRICT in, u8 * RESTRICT out, s32 n, s32 hash, s32 m_len, s32 * RESTRICT lut) {
- return lzp_decode_block(in, in + n, lut, out, hash, m_len);
-}
diff --git a/src/rle.c b/src/rle.c
deleted file mode 100644
index 87271fc..0000000
--- a/src/rle.c
+++ /dev/null
@@ -1,84 +0,0 @@
-
-/*
- * BZip3 - A spiritual successor to BZip2.
- * Copyright (C) 2022 Kamila Szewczyk
- *
- * This program is free software: you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by the Free
- * Software Foundation, either version 3 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU Lesser General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "rle.h"
-
-#include "common.h"
-
-s32 mrlec(u8 * in, s32 inlen, u8 * out) {
- u8 *ip = in, *in_end = in + inlen;
- s32 op = 0;
- s32 c, pc = -1;
- s32 t[256] = { 0 };
- s32 run = 0;
- while ((c = (ip < in_end ? *ip++ : -1)) != -1) {
- if (c == pc)
- t[c] += (++run % 255) != 0;
- else
- --t[c], run = 0;
- pc = c;
- }
- for (s32 i = 0; i < 32; ++i) {
- c = 0;
- for (s32 j = 0; j < 8; ++j) c += (t[i * 8 + j] > 0) << j;
- out[op++] = c;
- }
- ip = in;
- c = pc = -1;
- run = 0;
- do {
- c = ip < in_end ? *ip++ : -1;
- if (c == pc)
- ++run;
- else if (run > 0 && t[pc] > 0) {
- out[op++] = pc;
- for (; run > 255; run -= 255) out[op++] = 255;
- out[op++] = run - 1;
- run = 1;
- } else
- for (++run; run > 1; --run) out[op++] = pc;
- pc = c;
- } while (c != -1);
-
- return op;
-}
-
-void mrled(u8 * RESTRICT in, u8 * RESTRICT out, s32 outlen) {
- s32 op = 0, ip = 0;
-
- s32 c, pc = -1;
- s32 t[256] = { 0 };
- s32 run = 0;
-
- for (s32 i = 0; i < 32; ++i) {
- c = in[ip++];
- for (s32 j = 0; j < 8; ++j) t[i * 8 + j] = (c >> j) & 1;
- }
-
- while (op < outlen) {
- c = in[ip++];
- if (t[c]) {
- for (run = 0; (pc = in[ip++]) == 255; run += 255)
- ;
- run += pc + 1;
- for (; run > 0; --run) out[op++] = c;
- } else
- out[op++] = c;
- }
-}
