Add LZP preprocessing for text blocks; reformat sources to a 100-column limit
diff --git a/.clang-format b/.clang-format
index ef36a0a..9b1bf1e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -7,5 +7,6 @@ PointerAlignment: Middle
TabWidth: '4'
UseTab: Never
Cpp11BracedListStyle: false
+ColumnLimit: 100
...
diff --git a/Makefile b/Makefile
index e4e8a8b..397feff 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
.PHONY: all clean format install
OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
- obj/cm.o obj/libbz3.o obj/txt.o
+ obj/cm.o obj/libbz3.o obj/txt.o obj/lzp.o
all: bzip3
diff --git a/include/libsais.h b/include/libsais.h
index 6286a5d..b0cae1e 100644
--- a/include/libsais.h
+++ b/include/libsais.h
@@ -77,8 +77,7 @@ s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs);
* @param freq [0..255] The output symbol frequency table (can be NULL).
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
- s32 * freq);
+s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq);
/**
* Constructs the burrows-wheeler transformed string (BWT) of a given string.
@@ -107,8 +106,7 @@ s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
* @param I [0..(n-1)/r] The output auxiliary indexes.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
- s32 r, s32 * I);
+s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I);
/**
* Constructs the burrows-wheeler transformed string (BWT) of a given string
@@ -123,8 +121,7 @@ s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
* @param freq [0..255] The output symbol frequency table (can be NULL).
* @return The primary index if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- s32 fs, s32 * freq);
+s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq);
/**
* Constructs the burrows-wheeler transformed string (BWT) of a given string
@@ -141,8 +138,8 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
* @param I [0..(n-1)/r] The output auxiliary indexes.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- s32 fs, s32 * freq, s32 r, s32 * I);
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
+ s32 r, s32 * I);
/**
* Creates the libsais reverse BWT context that allows reusing allocated memory
@@ -170,8 +167,7 @@ void libsais_unbwt_free_ctx(void * ctx);
* @param i The primary index.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
- s32 i);
+s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i);
/**
* Constructs the original string from a given burrows-wheeler transformed
@@ -186,8 +182,8 @@ s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
* @param i The primary index.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- const s32 * freq, s32 i);
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+ s32 i);
/**
* Constructs the original string from a given burrows-wheeler transformed
@@ -202,8 +198,7 @@ s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
* @param I [0..(n-1)/r] The input auxiliary indexes.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
- s32 r, const s32 * I);
+s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r, const s32 * I);
/**
* Constructs the original string from a given burrows-wheeler transformed
@@ -219,8 +214,8 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
* @param I [0..(n-1)/r] The input auxiliary indexes.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
- s32 n, const s32 * freq, s32 r, const s32 * I);
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+ s32 r, const s32 * I);
/**
* Constructs the permuted longest common prefix array (PLCP) of a given string
diff --git a/include/lzp.h b/include/lzp.h
new file mode 100644
index 0000000..d07c956
--- /dev/null
+++ b/include/lzp.h
@@ -0,0 +1,11 @@
+/* Declarations for the LZP transform that libbz3.c runs on text blocks ahead of the BWT. */
+#ifndef _LZP_H
+#define _LZP_H
+
+#include "common.h"
+/* libbz3.c passes the input length as n and uses the return value as the output length. */
+s32 lzp_compress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min);
+/* Decoder-side counterpart. NOTE(review): hash/min presumably must equal the encoder's. */
+s32 lzp_decompress(const u8 * input, u8 * output, s32 n, s32 hash, s32 min);
+
+#endif
diff --git a/src/cm.c b/src/cm.c
index a939436..05ecea7 100644
--- a/src/cm.c
+++ b/src/cm.c
@@ -17,7 +17,7 @@ static void encodebit0(state * s, u32 p) {
// Write identical bits.
while ((s->low ^ s->high) < (1 << 24)) {
- write_out(s, s->low >> 24); // Same as s->high >> 24
+ write_out(s, s->low >> 24); // Same as s->high >> 24
s->low <<= 8;
s->high = (s->high << 8) | 0xFF;
}
diff --git a/src/crc32.c b/src/crc32.c
index 2b88e98..8bed740 100644
--- a/src/crc32.c
+++ b/src/crc32.c
@@ -20,58 +20,43 @@
#include "crc32.h"
static const u32 crc32Table[256] = {
- 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL,
- 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL,
- 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L,
- 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
- 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L,
- 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, 0x5D1D08BFL, 0xAF768BBCL,
- 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L,
- 0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
- 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL,
- 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L, 0xC288CAB2L,
- 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L,
- 0xE4292D5AL, 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
- 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, 0x417B1DBCL,
- 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L,
- 0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L,
- 0xD8F2B687L, 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
- 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, 0x96BF4DCCL,
- 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL,
- 0x3AC7F2EBL, 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L,
- 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
- 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL,
- 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L,
- 0xCDBE2C45L, 0x3FD5AF46L, 0x7198540DL, 0x83F3D70EL, 0x90A324FAL,
- 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
- 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL,
- 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL,
- 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L, 0xB7072F64L, 0xA457DC90L,
- 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
- 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L,
- 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, 0x55326B08L, 0xA759E80BL,
- 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL,
- 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
- 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L,
- 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, 0x2892ED69L, 0xDAF96E6AL,
- 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L,
- 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
- 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, 0x38CC2A06L,
- 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L, 0x0D3D3E1AL,
- 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L,
- 0xD0DDD530L, 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
- 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, 0x8ECEE914L,
- 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L,
- 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L,
- 0x07198540L, 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
- 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, 0xE330A81AL,
- 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L,
- 0xC5914FF2L, 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L,
- 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
- 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL,
- 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L,
- 0x988C474DL, 0x6AE7C44EL, 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L,
- 0xAD7D5351L
+ 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L,
+ 0xD4CA64EBL, 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, 0x4D43CFD0L, 0xBF284CD3L,
+ 0xAC78BF27L, 0x5E133C24L, 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, 0xD7C45070L,
+ 0x25AFD373L, 0x36FF2087L, 0xC494A384L, 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+ 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L,
+ 0x33ED7D2AL, 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, 0xAA64D611L, 0x580F5512L,
+ 0x4B5FA6E6L, 0xB93425E5L, 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, 0x30E349B1L,
+ 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+ 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L,
+ 0x6EF07595L, 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, 0x86E18AA3L, 0x748A09A0L,
+ 0x67DAFA54L, 0x95B17957L, 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, 0x0C38D26CL,
+ 0xFE53516FL, 0xED03A29BL, 0x1F682198L, 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+ 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL,
+ 0xC8AC71E8L, 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, 0x61C69362L, 0x93AD1061L,
+ 0x80FDE395L, 0x72966096L, 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, 0xEB1FCBADL,
+ 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+ 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, 0xB602C312L, 0x44694011L, 0x5739B3E5L,
+ 0xA55230E6L, 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, 0x3CDB9BDDL, 0xCEB018DEL,
+ 0xDDE0EB2AL, 0x2F8B6829L, 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, 0x456CAC67L,
+ 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+ 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L,
+ 0x81F80FE3L, 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, 0x1871A4D8L, 0xEA1A27DBL,
+ 0xF94AD42FL, 0x0B21572CL, 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, 0xA24BB5A6L,
+ 0x502036A5L, 0x4370C551L, 0xB11B4652L, 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+ 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, 0xEF087A76L, 0x1D63F975L, 0x0E330A81L,
+ 0xFC588982L, 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, 0x758FE5D6L, 0x87E466D5L,
+ 0x94B49521L, 0x66DF1622L, 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, 0xFF56BD19L,
+ 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+ 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL,
+ 0x5A048DFFL, 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, 0xD3D3E1ABL, 0x21B862A8L,
+ 0x32E8915CL, 0xC083125FL, 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, 0x590AB964L,
+ 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+ 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L,
+ 0x37FACCF1L, 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, 0xAE7367CAL, 0x5C18E4C9L,
+ 0x4F48173DL, 0xBD23943EL, 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, 0x34F4F86AL,
+ 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+ 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
};
u32 crc32sum(u32 crc, u8 * buf, size_t size) {
diff --git a/src/libbz3.c b/src/libbz3.c
index f1aed9a..f1aacb7 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -10,10 +10,14 @@
#include "common.h"
#include "crc32.h"
#include "libsais.h"
+#include "lzp.h"
#include "mtf.h"
-#include "txt.h"
#include "rle.h"
#include "srt.h"
+#include "txt.h"
+
+#define LZP_DICTIONARY 16
+#define LZP_MIN_MATCH 40
struct block_encoder_state {
u8 *buf1, *buf2;
@@ -25,9 +29,7 @@ struct block_encoder_state {
s8 last_error;
};
-s8 get_last_error(struct block_encoder_state * state) {
- return state->last_error;
-}
+s8 get_last_error(struct block_encoder_state * state) { return state->last_error; }
const char * str_last_error(struct block_encoder_state * state) {
switch (state->last_error) {
@@ -60,8 +62,7 @@ s32 commit_read(struct block_encoder_state * state, s32 bytes_read) {
}
struct block_encoder_state * new_block_encoder_state(s32 block_size) {
- struct block_encoder_state * block_encoder_state =
- malloc(sizeof(struct block_encoder_state));
+ struct block_encoder_state * block_encoder_state = malloc(sizeof(struct block_encoder_state));
if (!block_encoder_state) {
return NULL;
@@ -97,18 +98,19 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
int txt = is_text(state->buf1, state->bytes_read);
- if(txt) {
- s32 bwt_index = libsais_bwt(state->buf1, state->buf1, state->sais_array,
- state->bytes_read, 16, NULL);
- if(bwt_index < 0) {
+ if (txt) {
+ s32 lzp_size = lzp_compress(state->buf1, state->buf2, state->bytes_read, LZP_DICTIONARY,
+ LZP_MIN_MATCH);
+ s32 bwt_index =
+ libsais_bwt(state->buf2, state->buf1, state->sais_array, lzp_size, 16, NULL);
+ if (bwt_index < 0) {
state->last_error = BZ3_ERR_BWT;
- return (struct encoding_result) { NULL, -1 };
+ return (struct encoding_result){ NULL, -1 };
}
begin(state->cm_state);
state->cm_state->out_queue = state->buf2 + 24;
state->cm_state->output_ptr = 0;
- for(s32 i = 0; i < state->bytes_read; i++)
- encode_byte(state->cm_state, state->buf1[i]);
+ for (s32 i = 0; i < lzp_size; i++) encode_byte(state->cm_state, state->buf1[i]);
flush(state->cm_state);
s32 new_size = state->cm_state->output_ptr;
@@ -116,15 +118,15 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
((uint32_t *)state->buf2)[1] = htonl(state->bytes_read);
((uint32_t *)state->buf2)[2] = htonl(bwt_index);
((uint32_t *)state->buf2)[3] = 0xFFFFFFFF;
- ((uint32_t *)state->buf2)[4] = 0xFFFFFFFF;
+ ((uint32_t *)state->buf2)[4] = htonl(lzp_size);
((uint32_t *)state->buf2)[5] = htonl(new_size);
-
+
state->last_error = BZ3_OK;
- return (struct encoding_result) { state->buf2, 24 + new_size };
+ return (struct encoding_result){ state->buf2, 24 + new_size };
} else {
s32 new_size = mrlec(state->buf1, state->bytes_read, state->buf2);
- s32 bwt_index = libsais_bwt(state->buf2, state->buf2, state->sais_array,
- new_size, 16, NULL);
+ s32 bwt_index =
+ libsais_bwt(state->buf2, state->buf2, state->sais_array, new_size, 16, NULL);
if (bwt_index < 0) {
state->last_error = BZ3_ERR_BWT;
return (struct encoding_result){ NULL, -1 };
@@ -132,8 +134,7 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
s32 new_size2;
if (new_size > MiB(3)) {
- new_size2 =
- srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
+ new_size2 = srt_encode(state->srt_state, state->buf2, state->buf1, new_size);
} else {
new_size2 = -1;
mtf_encode(state->mtf_state, state->buf2, state->buf1, new_size);
@@ -143,11 +144,9 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
state->cm_state->out_queue = state->buf2 + 24;
state->cm_state->output_ptr = 0;
if (new_size2 != -1)
- for (s32 i = 0; i < new_size2; i++)
- encode_byte(state->cm_state, state->buf1[i]);
+ for (s32 i = 0; i < new_size2; i++) encode_byte(state->cm_state, state->buf1[i]);
else
- for (s32 i = 0; i < new_size; i++)
- encode_byte(state->cm_state, state->buf1[i]);
+ for (s32 i = 0; i < new_size; i++) encode_byte(state->cm_state, state->buf1[i]);
flush(state->cm_state);
s32 new_size3 = state->cm_state->output_ptr;
@@ -158,8 +157,7 @@ struct encoding_result encode_block(struct block_encoder_state * state) {
((uint32_t *)state->buf2)[4] = htonl(new_size2);
((uint32_t *)state->buf2)[5] = htonl(new_size3);
state->last_error = BZ3_OK;
- return (struct encoding_result){ .buffer = state->buf2,
- .size = 24 + new_size3 };
+ return (struct encoding_result){ .buffer = state->buf2, .size = 24 + new_size3 };
}
}
@@ -174,23 +172,21 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
new_size2 = ntohl(((uint32_t *)state->buf1)[4]);
new_size3 = ntohl(((uint32_t *)state->buf1)[5]);
- if(new_size2 != 0xFFFFFFFF || new_size != 0xFFFFFFFF) {
+ if (new_size != 0xFFFFFFFF) {
begin(state->cm_state);
state->cm_state->in_queue = state->buf1 + 24;
state->cm_state->input_ptr = 0;
state->cm_state->input_max = new_size3;
init(state->cm_state);
if (new_size2 != -1) {
- for (s32 i = 0; i < new_size2; i++)
- state->buf2[i] = decode_byte(state->cm_state);
+ for (s32 i = 0; i < new_size2; i++) state->buf2[i] = decode_byte(state->cm_state);
srt_decode(state->srt_state, state->buf2, state->buf1, new_size2);
} else {
- for (s32 i = 0; i < new_size; i++)
- state->buf2[i] = decode_byte(state->cm_state);
+ for (s32 i = 0; i < new_size; i++) state->buf2[i] = decode_byte(state->cm_state);
mtf_decode(state->mtf_state, state->buf2, state->buf1, new_size);
}
- if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size,
- NULL, bwt_index) < 0) {
+ if (libsais_unbwt(state->buf1, state->buf2, state->sais_array, new_size, NULL, bwt_index) <
+ 0) {
state->last_error = BZ3_ERR_BWT;
return (struct encoding_result){ NULL, -1 };
}
@@ -200,28 +196,26 @@ struct encoding_result decode_block(struct block_encoder_state * state) {
return (struct encoding_result){ .buffer = NULL, .size = -1 };
}
state->last_error = BZ3_OK;
- return (struct encoding_result){ .buffer = state->buf1,
- .size = state->bytes_read };
+ return (struct encoding_result){ .buffer = state->buf1, .size = state->bytes_read };
} else {
begin(state->cm_state);
state->cm_state->in_queue = state->buf1 + 24;
state->cm_state->input_ptr = 0;
state->cm_state->input_max = new_size3;
init(state->cm_state);
- for (s32 i = 0; i < state->bytes_read; i++)
- state->buf2[i] = decode_byte(state->cm_state);
- if (libsais_unbwt(state->buf2, state->buf1, state->sais_array, state->bytes_read,
- NULL, bwt_index) < 0) {
+ for (s32 i = 0; i < new_size2; i++) state->buf2[i] = decode_byte(state->cm_state);
+ if (libsais_unbwt(state->buf2, state->buf1, state->sais_array, new_size2, NULL, bwt_index) <
+ 0) {
state->last_error = BZ3_ERR_BWT;
return (struct encoding_result){ NULL, -1 };
}
- if (crc32sum(1, state->buf1, state->bytes_read) != crc32) {
+ lzp_decompress(state->buf1, state->buf2, new_size2, LZP_DICTIONARY, LZP_MIN_MATCH);
+ if (crc32sum(1, state->buf2, state->bytes_read) != crc32) {
state->last_error = BZ3_ERR_CRC;
return (struct encoding_result){ .buffer = NULL, .size = -1 };
}
state->last_error = BZ3_OK;
- return (struct encoding_result){ .buffer = state->buf1,
- .size = state->bytes_read };
+ return (struct encoding_result){ .buffer = state->buf2, .size = state->bytes_read };
}
}
diff --git a/src/libsais.c b/src/libsais.c
index d2f6733..fd5f71a 100644
--- a/src/libsais.c
+++ b/src/libsais.c
@@ -100,8 +100,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
#if __has_builtin(__builtin_prefetch)
#define HAS_BUILTIN_PREFECTCH
#endif
-#elif defined(__GNUC__) && \
- (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
+#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
#define HAS_BUILTIN_PREFECTCH
#endif
@@ -109,20 +108,16 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
#if __has_builtin(__builtin_bswap16)
#define HAS_BUILTIN_BSWAP16
#endif
-#elif defined(__GNUC__) && \
- (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
+#elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5))
#define HAS_BUILTIN_BSWAP16
#endif
#if defined(HAS_BUILTIN_PREFECTCH)
- #define libsais_prefetch(address) \
- __builtin_prefetch((const void *)(address), 0, 0)
- #define libsais_prefetchw(address) \
- __builtin_prefetch((const void *)(address), 1, 0)
+ #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0)
+ #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0)
#elif defined(_M_IX86) || defined(_M_AMD64)
#include <intrin.h>
- #define libsais_prefetch(address) \
- _mm_prefetch((const void *)(address), _MM_HINT_NTA)
+ #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA)
#define libsais_prefetchw(address) _m_prefetchw((const void *)(address))
#elif defined(_M_ARM)
#include <intrin.h>
@@ -137,24 +132,18 @@ typedef struct LIBSAIS_UNBWT_CONTEXT {
#endif
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
- #if defined(_LITTLE_ENDIAN) || \
- (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && \
- BYTE_ORDER == LITTLE_ENDIAN) || \
- (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && \
- _BYTE_ORDER == _LITTLE_ENDIAN) || \
- (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
- __BYTE_ORDER == __LITTLE_ENDIAN) || \
- (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+ #if defined(_LITTLE_ENDIAN) || \
+ (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) || \
+ (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) || \
+ (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \
+ (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define __LITTLE_ENDIAN__
- #elif defined(_BIG_ENDIAN) || \
- (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && \
- BYTE_ORDER == BIG_ENDIAN) || \
- (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && \
- _BYTE_ORDER == _BIG_ENDIAN) || \
- (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
- __BYTE_ORDER == __BIG_ENDIAN) || \
- (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+ #elif defined(_BIG_ENDIAN) || \
+ (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) || \
+ (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) || \
+ (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \
+ (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define __BIG_ENDIAN__
#elif defined(_WIN32)
@@ -184,11 +173,9 @@ static void * libsais_align_up(const void * address, size_t alignment) {
static void * libsais_alloc_aligned(size_t size, size_t alignment) {
void * address = malloc(size + sizeof(short) + alignment - 1);
if (address != NULL) {
- void * aligned_address = libsais_align_up(
- (void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))),
- alignment);
- ((short *)aligned_address)[-1] =
- (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
+ void * aligned_address =
+ libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
+ ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);
return aligned_address;
}
@@ -198,25 +185,19 @@ static void * libsais_alloc_aligned(size_t size, size_t alignment) {
static void libsais_free_aligned(void * aligned_address) {
if (aligned_address != NULL) {
- free((void *)((ptrdiff_t)aligned_address -
- ((short *)aligned_address)[-1]));
+ free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
}
}
static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) {
- LIBSAIS_THREAD_STATE * RESTRICT thread_state =
- (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
- (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned(
+ (size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned(
(size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
- LIBSAIS_THREAD_CACHE * RESTRICT thread_cache =
- (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
- (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE *
- sizeof(LIBSAIS_THREAD_CACHE),
- 4096);
-
- if (thread_state != NULL && thread_buckets != NULL &&
- thread_cache != NULL) {
+ LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned(
+ (size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);
+
+ if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) {
fast_sint_t t;
for (t = 0; t < threads; ++t) {
thread_state[t].state.buckets = thread_buckets;
@@ -245,13 +226,12 @@ static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) {
static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) {
LIBSAIS_CONTEXT * RESTRICT ctx =
(LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64);
- sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
- 8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ sa_sint_t * RESTRICT buckets =
+ (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
LIBSAIS_THREAD_STATE * RESTRICT thread_state =
threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
- if (ctx != NULL && buckets != NULL &&
- (thread_state != NULL || threads == 1)) {
+ if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) {
ctx->buckets = buckets;
ctx->threads = threads;
ctx->thread_state = thread_state;
@@ -275,9 +255,9 @@ static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) {
#if defined(_OPENMP)
-static sa_sint_t libsais_count_negative_marked_suffixes(
- sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
sa_sint_t count = 0;
fast_sint_t i;
@@ -288,9 +268,9 @@ static sa_sint_t libsais_count_negative_marked_suffixes(
return count;
}
-static sa_sint_t libsais_count_zero_marked_suffixes(
- sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
sa_sint_t count = 0;
fast_sint_t i;
@@ -303,14 +283,12 @@ static sa_sint_t libsais_count_zero_marked_suffixes(
static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&cache[i + 2 * prefetch_distance]);
libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
@@ -329,15 +307,15 @@ static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA,
}
}
-static void libsais_compact_and_place_cached_suffixes(
- sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, l;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
- l = omp_block_start;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+ i += 4) {
libsais_prefetchw(&cache[i + prefetch_distance]);
cache[l] = cache[i + 0];
@@ -355,12 +333,10 @@ static void libsais_compact_and_place_cached_suffixes(
l += cache[l].symbol >= 0;
}
- libsais_place_cached_suffixes(SA, cache, omp_block_start,
- l - omp_block_start);
+ libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
}
-static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
fast_sint_t s;
@@ -369,8 +345,7 @@ static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00,
}
}
-static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -380,8 +355,7 @@ static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00,
}
}
-static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -392,8 +366,7 @@ static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00,
}
}
-static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -401,13 +374,11 @@ static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00,
sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
fast_sint_t s;
for (s = 0; s < bucket_size; s += 1) {
- bucket00[s] =
- bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
+ bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s];
}
}
-static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -416,13 +387,12 @@ static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00,
sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
fast_sint_t s;
for (s = 0; s < bucket_size; s += 1) {
- bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
- bucket04[s] + bucket05[s];
+ bucket00[s] =
+ bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s];
}
}
-static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -432,13 +402,12 @@ static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00,
sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
fast_sint_t s;
for (s = 0; s < bucket_size; s += 1) {
- bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
- bucket04[s] + bucket05[s] + bucket06[s];
+ bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+ bucket05[s] + bucket06[s];
}
}
-static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -449,13 +418,12 @@ static void libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00,
sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
fast_sint_t s;
for (s = 0; s < bucket_size; s += 1) {
- bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
- bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s];
+ bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+ bucket05[s] + bucket06[s] + bucket07[s];
}
}
-static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00,
- fast_sint_t bucket_size,
+static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size,
fast_sint_t bucket_stride) {
sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
@@ -467,20 +435,16 @@ static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00,
sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
fast_sint_t s;
for (s = 0; s < bucket_size; s += 1) {
- bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] +
- bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] +
- bucket08[s];
+ bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] +
+ bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s];
}
}
-static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets,
- fast_sint_t bucket_size,
- fast_sint_t bucket_stride,
- fast_sint_t num_buckets) {
+static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size,
+ fast_sint_t bucket_stride, fast_sint_t num_buckets) {
while (num_buckets >= 9) {
- libsais_accumulate_counts_s32_9(
- buckets - (num_buckets - 9) * bucket_stride, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size,
+ bucket_stride);
num_buckets -= 8;
}
@@ -488,42 +452,33 @@ static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets,
case 1:
break;
case 2:
- libsais_accumulate_counts_s32_2(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride);
break;
case 3:
- libsais_accumulate_counts_s32_3(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride);
break;
case 4:
- libsais_accumulate_counts_s32_4(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride);
break;
case 5:
- libsais_accumulate_counts_s32_5(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride);
break;
case 6:
- libsais_accumulate_counts_s32_6(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride);
break;
case 7:
- libsais_accumulate_counts_s32_7(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride);
break;
case 8:
- libsais_accumulate_counts_s32_8(buckets, bucket_size,
- bucket_stride);
+ libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride);
break;
}
}
#endif
-static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
- sa_sint_t * RESTRICT SA, sa_sint_t n,
- fast_sint_t m,
- fast_sint_t omp_block_start,
+static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
if (omp_block_size > 0) {
const fast_sint_t prefetch_distance = 128;
@@ -537,8 +492,7 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
fast_uint_t s = c0 >= c1;
- for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3;
- i >= j; i -= 4) {
+ for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) {
libsais_prefetch(&T[i - prefetch_distance]);
c1 = T[i - 0];
@@ -571,9 +525,9 @@ static void libsais_gather_lms_suffixes_8u(const u8 * RESTRICT T,
}
}
-static void libsais_gather_lms_suffixes_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_gather_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
omp_get_dynamic() == 0)
@@ -591,13 +545,12 @@ static void libsais_gather_lms_suffixes_8u_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1,
- omp_block_start, omp_block_size);
+ libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -606,15 +559,14 @@ static void libsais_gather_lms_suffixes_8u_omp(
m += thread_state[t].state.m;
}
- libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m,
- omp_block_start, omp_block_size);
+ libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start,
+ omp_block_size);
#pragma omp barrier
if (thread_state[omp_thread_num].state.m > 0) {
SA[(fast_sint_t)n - 1 - m] =
- (sa_sint_t)thread_state[omp_thread_num]
- .state.last_lms_suffix;
+ (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix;
}
}
#endif
@@ -622,8 +574,7 @@ static void libsais_gather_lms_suffixes_8u_omp(
}
static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
- sa_sint_t * RESTRICT SA,
- sa_sint_t n) {
+ sa_sint_t * RESTRICT SA, sa_sint_t n) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t i = n - 2;
@@ -664,8 +615,8 @@ static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
return n - 1 - m;
}
-static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t i = n - 2;
@@ -708,9 +659,8 @@ static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(
#if defined(_OPENMP)
-static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
- sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets) {
+static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
@@ -723,14 +673,10 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
for (; i >= prefetch_distance + 3; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -761,9 +707,8 @@ static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T,
#endif
-static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
- sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets) {
+static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
@@ -776,14 +721,10 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
for (; i >= prefetch_distance + 3; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -814,9 +755,8 @@ static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
#if defined(_OPENMP)
-static void libsais_count_compacted_lms_suffixes_32s_2k(
- const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
- sa_sint_t * RESTRICT buckets) {
+static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
@@ -829,14 +769,10 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(
for (; i >= prefetch_distance + 3; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -873,10 +809,11 @@ static void libsais_count_compacted_lms_suffixes_32s_2k(
#endif
-static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t * RESTRICT buckets,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -940,9 +877,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -962,23 +898,20 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_8u(
- T, SA, n, buckets, omp_block_start, omp_block_size);
+ m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
- thread_state[omp_thread_num].state.position =
- omp_block_start + omp_block_size;
- thread_state[omp_thread_num].state.m =
- libsais_count_and_gather_lms_suffixes_8u(
- T, SA, n, thread_state[omp_thread_num].state.buckets,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(
+ T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start,
+ omp_block_size);
if (thread_state[omp_thread_num].state.m > 0) {
thread_state[omp_thread_num].state.last_lms_suffix =
@@ -996,18 +929,14 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
for (t = omp_num_threads - 1; t >= 0; --t) {
m += (sa_sint_t)thread_state[t].state.m;
- if (t != omp_num_threads - 1 &&
- thread_state[t].state.m > 0) {
+ if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) {
memcpy(&SA[n - m],
- &SA[thread_state[t].state.position -
- thread_state[t].state.m],
- (size_t)thread_state[t].state.m *
- sizeof(sa_sint_t));
+ &SA[thread_state[t].state.position - thread_state[t].state.m],
+ (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
}
{
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t s;
for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) {
sa_sint_t A = buckets[s], B = temp_bucket[s];
@@ -1025,9 +954,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));
fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1043,18 +971,13 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
fast_uint_t s = c0 >= c1;
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
- i -= 4) {
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1101,9 +1024,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1119,18 +1041,13 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
fast_uint_t s = c0 >= c1;
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
- i -= 4) {
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
- libsais_prefetchw(
- &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
+ libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1177,9 +1094,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(
}
static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
fast_sint_t m = omp_block_start + omp_block_size - 1;
@@ -1195,18 +1111,17 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
fast_uint_t s = c0 >= c1;
- for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j;
- i -= 4) {
+ for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) {
libsais_prefetch(&T[i - 2 * prefetch_distance]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
- libsais_prefetchw(&buckets[BUCKETS_INDEX2(
- T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
+ libsais_prefetchw(
+ &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
+ libsais_prefetchw(
+ &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
+ libsais_prefetchw(
+ &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
+ libsais_prefetchw(
+ &buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);
c1 = T[i - 0];
s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1)));
@@ -1260,8 +1175,7 @@ static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
#if defined(_OPENMP)
-static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space,
- fast_sint_t bucket_size,
+static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size,
fast_sint_t num_buckets) {
fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024);
if (free_space / (num_buckets - 1) >= bucket_size_1024) {
@@ -1276,9 +1190,8 @@ static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space,
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -1297,27 +1210,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_4k(
- T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
fast_sint_t bucket_size = 4 * (fast_sint_t)k;
- fast_sint_t bucket_stride = libsais_get_bucket_stride(
- buckets - &SA[n], bucket_size, omp_num_threads);
+ fast_sint_t bucket_stride =
+ libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
{
- thread_state[omp_thread_num].state.position =
- omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
thread_state[omp_thread_num].state.count =
libsais_count_and_gather_lms_suffixes_32s_4k(
- T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
- omp_block_start, omp_block_size);
+ T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
+ omp_block_size);
}
#pragma omp barrier
@@ -1327,13 +1238,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
for (t = omp_num_threads - 1; t >= 0; --t) {
m += (sa_sint_t)thread_state[t].state.count;
- if (t != omp_num_threads - 1 &&
- thread_state[t].state.count > 0) {
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
memcpy(&SA[n - m],
- &SA[thread_state[t].state.position -
- thread_state[t].state.count],
- (size_t)thread_state[t].state.count *
- sizeof(sa_sint_t));
+ &SA[thread_state[t].state.position - thread_state[t].state.count],
+ (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
}
}
} else {
@@ -1344,9 +1252,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
? omp_block_stride
: bucket_size - omp_block_start;
- libsais_accumulate_counts_s32(buckets + omp_block_start,
- omp_block_size, bucket_stride,
- omp_num_threads + 1);
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+ bucket_stride, omp_num_threads + 1);
}
}
#endif
@@ -1356,9 +1263,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -1377,27 +1283,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_2k(
- T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
fast_sint_t bucket_size = 2 * (fast_sint_t)k;
- fast_sint_t bucket_stride = libsais_get_bucket_stride(
- buckets - &SA[n], bucket_size, omp_num_threads);
+ fast_sint_t bucket_stride =
+ libsais_get_bucket_stride(buckets - &SA[n], bucket_size, omp_num_threads);
{
- thread_state[omp_thread_num].state.position =
- omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
thread_state[omp_thread_num].state.count =
libsais_count_and_gather_lms_suffixes_32s_2k(
- T, SA, n, k, buckets - (omp_thread_num * bucket_stride),
- omp_block_start, omp_block_size);
+ T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start,
+ omp_block_size);
}
#pragma omp barrier
@@ -1407,13 +1311,10 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
for (t = omp_num_threads - 1; t >= 0; --t) {
m += (sa_sint_t)thread_state[t].state.count;
- if (t != omp_num_threads - 1 &&
- thread_state[t].state.count > 0) {
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
memcpy(&SA[n - m],
- &SA[thread_state[t].state.position -
- thread_state[t].state.count],
- (size_t)thread_state[t].state.count *
- sizeof(sa_sint_t));
+ &SA[thread_state[t].state.position - thread_state[t].state.count],
+ (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
}
}
} else {
@@ -1424,9 +1325,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
? omp_block_stride
: bucket_size - omp_block_start;
- libsais_accumulate_counts_s32(buckets + omp_block_start,
- omp_block_size, bucket_stride,
- omp_num_threads + 1);
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+ bucket_stride, omp_num_threads + 1);
}
}
#endif
@@ -1436,9 +1336,8 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
}
static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
#endif
@@ -1455,27 +1354,24 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
- T, SA, n, k, buckets, omp_block_start, omp_block_size);
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
fast_sint_t bucket_size = 2 * (fast_sint_t)k;
- fast_sint_t bucket_stride = libsais_get_bucket_stride(
- buckets - &SA[n + n], bucket_size, omp_num_threads);
+ fast_sint_t bucket_stride =
+ libsais_get_bucket_stride(buckets - &SA[n + n], bucket_size, omp_num_threads);
{
- thread_state[omp_thread_num].state.position =
- omp_block_start + omp_block_size;
+ thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
thread_state[omp_thread_num].state.count =
libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
- T, SA + n, n, k,
- buckets - (omp_thread_num * bucket_stride),
+ T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride),
omp_block_start, omp_block_size);
}
@@ -1491,8 +1387,7 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
memcpy(&SA[n - m],
&SA[n + thread_state[omp_thread_num].state.position -
thread_state[omp_thread_num].state.count],
- (size_t)thread_state[omp_thread_num].state.count *
- sizeof(sa_sint_t));
+ (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
}
}
@@ -1503,9 +1398,8 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
? omp_block_stride
: bucket_size - omp_block_start;
- libsais_accumulate_counts_s32(buckets + omp_block_start,
- omp_block_size, bucket_stride,
- omp_num_threads);
+ libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size,
+ bucket_stride, omp_num_threads);
}
}
#endif
@@ -1514,9 +1408,11 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
#endif
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -1532,8 +1428,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
fast_sint_t omp_num_threads = 1;
#endif
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k,
- buckets, 0, n);
+ m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
}
#if defined(_OPENMP)
else if (omp_thread_num == 0) {
@@ -1547,9 +1442,11 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
return m;
}
-static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -1565,8 +1462,7 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
fast_sint_t omp_num_threads = 1;
#endif
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k,
- buckets, 0, n);
+ m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
}
#if defined(_OPENMP)
else if (omp_thread_num == 0) {
@@ -1580,10 +1476,9 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
return m;
}
-static sa_sint_t
-libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads) {
sa_sint_t m = 0;
#if defined(_OPENMP)
@@ -1599,8 +1494,7 @@ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
fast_sint_t omp_num_threads = 1;
#endif
if (omp_num_threads == 1) {
- m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(
- T, SA, n, k, buckets, 0, n);
+ m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
}
#if defined(_OPENMP)
else if (omp_thread_num == 0) {
@@ -1615,14 +1509,12 @@ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t m;
#if defined(_OPENMP)
- sa_sint_t max_threads =
- (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((4 * (fast_sint_t)k + 15) & (-16)));
if (max_threads > threads) {
max_threads = threads;
}
@@ -1631,29 +1523,25 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(
max_threads = n / 16 / k;
}
m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(
- T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
- thread_state);
+ T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
} else
#else
UNUSED(thread_state);
#endif
{
- m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(
- T, SA, n, k, buckets, threads);
+ m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
}
return m;
}
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t m;
#if defined(_OPENMP)
- sa_sint_t max_threads =
- (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
+ sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
if (max_threads > threads) {
max_threads = threads;
}
@@ -1662,27 +1550,24 @@ static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(
max_threads = n / 8 / k;
}
m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(
- T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
- thread_state);
+ T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
} else
#else
UNUSED(thread_state);
#endif
{
- m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(
- T, SA, n, k, buckets, threads);
+ m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
}
return m;
}
static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- sa_sint_t max_threads = (sa_sint_t)((buckets - &SA[n + n]) /
- ((2 * (fast_sint_t)k + 15) & (-16)));
+ sa_sint_t max_threads =
+ (sa_sint_t)((buckets - &SA[n + n]) / ((2 * (fast_sint_t)k + 15) & (-16)));
if (max_threads > threads) {
max_threads = threads;
}
@@ -1691,20 +1576,18 @@ static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
max_threads = n / 8 / k;
}
libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(
- T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2,
- thread_state);
+ T, SA, n, k, buckets, max_threads > 2 ? max_threads : 2, thread_state);
} else
#else
UNUSED(thread_state);
#endif
{
- libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(
- T, SA, n, k, buckets, threads);
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets,
+ threads);
}
}
-static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T,
- sa_sint_t n, sa_sint_t k,
+static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k,
sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
@@ -1729,93 +1612,80 @@ static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T,
}
}
-static void libsais_initialize_buckets_start_and_end_8u(
- sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) {
+static void libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets,
+ sa_sint_t * RESTRICT freq) {
sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
if (freq != NULL) {
fast_sint_t i, j;
sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0;
- i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += 1) {
bucket_start[j] = sum;
- sum += (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] +
- buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] +
- buckets[i + BUCKETS_INDEX4(0, 3)]);
+ sum +=
+ (freq[j] = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+ buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]);
bucket_end[j] = sum;
}
} else {
fast_sint_t i, j;
sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0;
- i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += 1) {
bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
- buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] +
- buckets[i + BUCKETS_INDEX4(0, 3)];
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+ buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
bucket_end[j] = sum;
}
}
}
-static void libsais_initialize_buckets_start_and_end_32s_6k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k,
+ sa_sint_t * RESTRICT buckets) {
sa_sint_t * RESTRICT bucket_start = &buckets[4 * k];
sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
fast_sint_t i, j;
sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0;
- i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += 1) {
bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 0)] +
- buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 2)] +
- buckets[i + BUCKETS_INDEX4(0, 3)];
+ sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] +
+ buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
bucket_end[j] = sum;
}
}
-static void libsais_initialize_buckets_start_and_end_32s_4k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k,
+ sa_sint_t * RESTRICT buckets) {
sa_sint_t * RESTRICT bucket_start = &buckets[2 * k];
sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
fast_sint_t i, j;
sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX2(0, 0), j = 0;
- i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0), j += 1) {
bucket_start[j] = sum;
- sum += buckets[i + BUCKETS_INDEX2(0, 0)] +
- buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
bucket_end[j] = sum;
}
}
-static void libsais_initialize_buckets_end_32s_2k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
fast_sint_t i;
sa_sint_t sum0 = 0;
for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0)) {
- sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
- buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
}
}
-static void libsais_initialize_buckets_start_and_end_32s_2k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k,
+ sa_sint_t * RESTRICT buckets) {
fast_sint_t i, j;
- for (i = BUCKETS_INDEX2(0, 0), j = 0;
- i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0), j += 1) {
buckets[j] = buckets[i];
}
@@ -1824,8 +1694,7 @@ static void libsais_initialize_buckets_start_and_end_32s_2k(
memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
}
-static void libsais_initialize_buckets_start_32s_1k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
fast_sint_t i;
sa_sint_t sum = 0;
for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
@@ -1835,8 +1704,7 @@ static void libsais_initialize_buckets_start_32s_1k(
}
}
-static void libsais_initialize_buckets_end_32s_1k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) {
fast_sint_t i;
sa_sint_t sum = 0;
for (i = 0; i <= (fast_sint_t)k - 1; i += 1) {
@@ -1846,8 +1714,7 @@ static void libsais_initialize_buckets_end_32s_1k(
}
static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) {
{
fast_uint_t s = 0;
fast_sint_t c0 = T[first_lms_suffix];
@@ -1872,8 +1739,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum;
- sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 3)];
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
temp_bucket[j] = sum;
}
@@ -1891,8 +1757,7 @@ static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
sa_sint_t sum0 = 0, sum1 = 0;
for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0)) {
- sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] +
- buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];
buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
@@ -1923,11 +1788,9 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
fast_sint_t i, j;
sa_sint_t sum = 0;
- for (i = BUCKETS_INDEX4(0, 0), j = 0;
- i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
+ for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += 1) {
- sum += buckets[i + BUCKETS_INDEX4(0, 1)] +
- buckets[i + BUCKETS_INDEX4(0, 3)];
+ sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)];
temp_bucket[j] = sum;
}
@@ -1946,29 +1809,26 @@ static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
fast_sint_t i, j;
sa_sint_t sum0 = 0, sum1 = 0;
- for (i = BUCKETS_INDEX2(0, 0), j = 0;
- i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
+ for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0), j += 1) {
bucket_start[j] = sum1;
sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
- sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] +
- buckets[i + BUCKETS_INDEX2(0, 1)];
+ sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;
bucket_end[j] = sum1;
}
}
-static void libsais_radix_sort_lms_suffixes_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 3;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
i >= j; i -= 4) {
libsais_prefetch(&SA[i - 2 * prefetch_distance]);
@@ -1993,13 +1853,12 @@ static void libsais_radix_sort_lms_suffixes_8u(
}
}
-static void libsais_radix_sort_lms_suffixes_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_radix_sort_lms_suffixes_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t m,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && \
- m >= 65536 && \
+ #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536 && m >= 65536 && \
omp_get_dynamic() == 0)
#endif
{
@@ -2013,16 +1872,15 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
fast_sint_t omp_num_threads = 1;
#endif
if (omp_num_threads == 1) {
- libsais_radix_sort_lms_suffixes_8u(
- T, SA, &buckets[4 * ALPHABET_SIZE],
- (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
+ libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE],
+ (fast_sint_t)n - (fast_sint_t)m + 1,
+ (fast_sint_t)m - 1);
}
#if defined(_OPENMP)
else {
{
sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT dst_bucket =
- thread_state[omp_thread_num].state.buckets;
+ sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;
fast_sint_t i, j;
for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1);
@@ -2033,9 +1891,8 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
}
{
- fast_sint_t t,
- omp_block_start = 0,
- omp_block_size = thread_state[omp_thread_num].state.m;
+ fast_sint_t t, omp_block_start = 0,
+ omp_block_size = thread_state[omp_thread_num].state.m;
for (t = omp_num_threads - 1; t >= omp_thread_num; --t)
omp_block_start += thread_state[t].state.m;
@@ -2053,15 +1910,15 @@ static void libsais_radix_sort_lms_suffixes_8u_omp(
}
}
-static void libsais_radix_sort_lms_suffixes_32s_6k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 3;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
i >= j; i -= 4) {
libsais_prefetch(&SA[i - 3 * prefetch_distance]);
@@ -2091,15 +1948,15 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(
}
}
-static void libsais_radix_sort_lms_suffixes_32s_2k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 3;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3;
i >= j; i -= 4) {
libsais_prefetch(&SA[i - 3 * prefetch_distance]);
@@ -2108,14 +1965,10 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(
libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]);
libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- T[SA[i - prefetch_distance - 0]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- T[SA[i - prefetch_distance - 1]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- T[SA[i - prefetch_distance - 2]], 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- T[SA[i - prefetch_distance - 3]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
+ libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);
sa_sint_t p0 = SA[i - 0];
SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
@@ -2135,16 +1988,16 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(
#if defined(_OPENMP)
-static void libsais_radix_sort_lms_suffixes_32s_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&T[SA[i + prefetch_distance + 0]]);
@@ -2165,26 +2018,21 @@ static void libsais_radix_sort_lms_suffixes_32s_block_gather(
}
}
-static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
- sa_sint_t * RESTRICT induction_bucket,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 3;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
i >= j; i -= 4) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
- libsais_prefetchw(
- &induction_bucket[cache[i - prefetch_distance - 0].symbol]);
- libsais_prefetchw(
- &induction_bucket[cache[i - prefetch_distance - 1].symbol]);
- libsais_prefetchw(
- &induction_bucket[cache[i - prefetch_distance - 2].symbol]);
- libsais_prefetchw(
- &induction_bucket[cache[i - prefetch_distance - 3].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
+ libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);
cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
@@ -2197,51 +2045,43 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
}
}
-static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
- sa_sint_t * RESTRICT induction_bucket,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 3;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3;
i >= j; i -= 4) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- cache[i - prefetch_distance - 0].symbol, 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- cache[i - prefetch_distance - 1].symbol, 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- cache[i - prefetch_distance - 2].symbol, 0)]);
- libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(
- cache[i - prefetch_distance - 3].symbol, 0)]);
-
- cache[i - 0].symbol =
- --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
- cache[i - 1].symbol =
- --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
- cache[i - 2].symbol =
- --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
- cache[i - 3].symbol =
- --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
+ libsais_prefetchw(
+ &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
+ libsais_prefetchw(
+ &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
+ libsais_prefetchw(
+ &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
+ libsais_prefetchw(
+ &induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);
+
+ cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
+ cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
+ cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
+ cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
}
for (j -= prefetch_distance + 3; i >= j; i -= 1) {
- cache[i].symbol =
- --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
+ cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
}
}
static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -2256,22 +2096,20 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_radix_sort_lms_suffixes_32s_6k(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
- libsais_radix_sort_lms_suffixes_32s_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
+ omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -2279,15 +2117,14 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
#pragma omp master
{
libsais_radix_sort_lms_suffixes_32s_6k_block_sort(
- induction_bucket, cache - block_start, block_start,
- block_size);
+ induction_bucket, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_place_cached_suffixes(SA, cache - block_start,
- omp_block_start, omp_block_size);
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -2295,13 +2132,11 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
}
static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -2316,22 +2151,20 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_radix_sort_lms_suffixes_32s_2k(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
- libsais_radix_sort_lms_suffixes_32s_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start,
+ omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -2339,15 +2172,14 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
#pragma omp master
{
libsais_radix_sort_lms_suffixes_32s_2k_block_sort(
- induction_bucket, cache - block_start, block_start,
- block_size);
+ induction_bucket, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_place_cached_suffixes(SA, cache - block_start,
- omp_block_start, omp_block_size);
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -2357,28 +2189,25 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
#endif
static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+ sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || m < 65536) {
libsais_radix_sort_lms_suffixes_32s_6k(
- T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
- (fast_sint_t)m - 1);
+ T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = 0; block_start < (fast_sint_t)m - 1;
- block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end >= m) {
block_end = (fast_sint_t)m - 1;
}
libsais_radix_sort_lms_suffixes_32s_6k_block_omp(
- T, SA, induction_bucket, thread_state[0].state.cache,
- (fast_sint_t)n - block_end, block_end - block_start, threads);
+ T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
+ block_end - block_start, threads);
}
}
#else
@@ -2387,28 +2216,25 @@ static void libsais_radix_sort_lms_suffixes_32s_6k_omp(
}
static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
+ sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || m < 65536) {
libsais_radix_sort_lms_suffixes_32s_2k(
- T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1,
- (fast_sint_t)m - 1);
+ T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = 0; block_start < (fast_sint_t)m - 1;
- block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end >= m) {
block_end = (fast_sint_t)m - 1;
}
libsais_radix_sort_lms_suffixes_32s_2k_block_omp(
- T, SA, induction_bucket, thread_state[0].state.cache,
- (fast_sint_t)n - block_end, block_end - block_start, threads);
+ T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end,
+ block_end - block_start, threads);
}
}
#else
@@ -2416,9 +2242,9 @@ static void libsais_radix_sort_lms_suffixes_32s_2k_omp(
#endif
}
-static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets) {
+static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t i = n - 2;
@@ -2482,15 +2308,15 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(
return m;
}
-static void libsais_radix_sort_set_markers_32s_6k(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&induction_bucket[i + 2 * prefetch_distance]);
libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
@@ -2509,26 +2335,21 @@ static void libsais_radix_sort_set_markers_32s_6k(
}
}
-static void libsais_radix_sort_set_markers_32s_4k(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
- libsais_prefetch(
- &induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
-
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
- i + prefetch_distance + 0, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
- i + prefetch_distance + 1, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
- i + prefetch_distance + 2, 0)]]);
- libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(
- i + prefetch_distance + 3, 0)]]);
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
+ libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);
+
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
+ libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);
SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
@@ -2541,9 +2362,9 @@ static void libsais_radix_sort_set_markers_32s_4k(
}
}
-static void libsais_radix_sort_set_markers_32s_6k_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads) {
+static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+ sa_sint_t * RESTRICT induction_bucket,
+ sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
#endif
@@ -2551,8 +2372,7 @@ static void libsais_radix_sort_set_markers_32s_6k_omp(
#if defined(_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
- fast_sint_t omp_block_stride =
- (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
? omp_block_stride
@@ -2564,14 +2384,14 @@ static void libsais_radix_sort_set_markers_32s_6k_omp(
fast_sint_t omp_block_size = (fast_sint_t)k - 1;
#endif
- libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket,
- omp_block_start, omp_block_size);
+ libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
}
-static void libsais_radix_sort_set_markers_32s_4k_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads) {
+static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+ sa_sint_t * RESTRICT induction_bucket,
+ sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && k >= 65536)
#endif
@@ -2579,8 +2399,7 @@ static void libsais_radix_sort_set_markers_32s_4k_omp(
#if defined(_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
- fast_sint_t omp_block_stride =
- (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
? omp_block_stride
@@ -2592,14 +2411,15 @@ static void libsais_radix_sort_set_markers_32s_4k_omp(
fast_sint_t omp_block_size = (fast_sint_t)k - 1;
#endif
- libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket,
- omp_block_start, omp_block_size);
+ libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
}
-static void libsais_initialize_buckets_for_partial_sorting_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
+static void libsais_initialize_buckets_for_partial_sorting_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix,
+ sa_sint_t left_suffixes_count) {
sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;
@@ -2611,8 +2431,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(
i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
- sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] +
- buckets[i + BUCKETS_INDEX4(0, 2)];
+ sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];
buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
@@ -2620,15 +2439,16 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(
}
}
-static void libsais_initialize_buckets_for_partial_sorting_32s_6k(
- const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) {
+static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T,
+ sa_sint_t k,
+ sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix,
+ sa_sint_t left_suffixes_count) {
sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
fast_sint_t i, j;
sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
- for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0),
- j = BUCKETS_INDEX2(0, 0);
+ for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0);
i <= BUCKETS_INDEX4((fast_sint_t)first_lms_suffix - 1, 0);
i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) {
sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
@@ -2671,18 +2491,16 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2695,8 +2513,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
p0 &= SAINT_MAX;
sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
SA[induction_bucket[v0]++] =
- (p0 - 1) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
distinct_names[v0] = d;
sa_sint_t p1 = SA[i + 1];
@@ -2704,8 +2521,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
p1 &= SAINT_MAX;
sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
SA[induction_bucket[v1]++] =
- (p1 - 1) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
distinct_names[v1] = d;
}
@@ -2725,9 +2541,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(
#if defined(_OPENMP)
static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
LIBSAIS_THREAD_STATE * RESTRICT state) {
const fast_sint_t prefetch_distance = 32;
@@ -2738,9 +2553,8 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
fast_sint_t i, j, count = 0;
sa_sint_t d = 1;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -2751,15 +2565,13 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
sa_sint_t p0 = cache[count].index = SA[i + 0];
d += (p0 < 0);
p0 &= SAINT_MAX;
- sa_sint_t v0 = cache[count++].symbol =
- BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
+ sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
induction_bucket[v0]++;
distinct_names[v0] = d;
sa_sint_t p1 = cache[count].index = SA[i + 1];
d += (p1 < 0);
p1 &= SAINT_MAX;
- sa_sint_t v1 = cache[count++].symbol =
- BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
+ sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
induction_bucket[v1]++;
distinct_names[v1] = d;
}
@@ -2768,8 +2580,7 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
sa_sint_t p = cache[count].index = SA[i];
d += (p < 0);
p &= SAINT_MAX;
- sa_sint_t v = cache[count++].symbol =
- BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
induction_bucket[v]++;
distinct_names[v] = d;
}
@@ -2779,8 +2590,8 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
}
static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
+ sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t count, sa_sint_t d) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
@@ -2794,16 +2605,14 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
d += (p0 < 0);
sa_sint_t v0 = cache[i + 0].symbol;
SA[induction_bucket[v0]++] =
- (p0 - 1) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
distinct_names[v0] = d;
sa_sint_t p1 = cache[i + 1].index;
d += (p1 < 0);
sa_sint_t v1 = cache[i + 1].symbol;
SA[induction_bucket[v1]++] =
- (p1 - 1) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
distinct_names[v1] = d;
}
@@ -2818,14 +2627,13 @@ static void libsais_partial_sorting_scan_left_to_right_8u_block_place(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -2840,33 +2648,30 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_left_to_right_8u(
- T, SA, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_left_to_right_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache, omp_block_start,
- omp_block_size, &thread_state[omp_thread_num]);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
+ &thread_state[omp_thread_num]);
}
#pragma omp barrier
#pragma omp master
{
- sa_sint_t * RESTRICT induction_bucket =
- &buckets[4 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT distinct_names =
- &buckets[2 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
fast_sint_t t;
for (t = 0; t < omp_num_threads; ++t) {
@@ -2877,15 +2682,13 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
fast_sint_t c;
for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
- sa_sint_t A = induction_bucket[c],
- B = temp_induction_bucket[c];
+ sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
induction_bucket[c] = A + B;
temp_induction_bucket[c] = A;
}
for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
- sa_sint_t A = distinct_names[c],
- B = temp_distinct_names[c], D = B + d;
+ sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
distinct_names[c] = B > 0 ? D : A;
temp_distinct_names[c] = A;
}
@@ -2914,14 +2717,13 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(
#endif
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+ sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
- SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] =
- (n - 1) | SAINT_MIN;
+ SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
if (threads == 1 || left_suffixes_count < 65536) {
@@ -2936,9 +2738,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
block_start++;
} else {
fast_sint_t block_max_end =
- block_start +
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start + ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end > left_suffixes_count) {
block_max_end = left_suffixes_count;
}
@@ -2953,17 +2754,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
sa_sint_t p = SA[block_start];
d += (p < 0);
p &= SAINT_MAX;
- sa_sint_t v =
- BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
SA[induction_bucket[v]++] =
- (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
- << (SAINT_BIT - 1));
+ (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
distinct_names[v] = d;
}
} else {
d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(
- T, SA, buckets, d, block_start, block_size, threads,
- thread_state);
+ T, SA, buckets, d, block_start, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -2977,14 +2775,12 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
i < j; i += 2) {
libsais_prefetch(&SA[i + 3 * prefetch_distance]);
@@ -3004,16 +2800,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
d += (p2 < 0);
p2 &= SAINT_MAX;
sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
- SA[buckets[v2]++] =
- (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+ SA[buckets[v2]++] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
buckets[2 + v2] = d;
sa_sint_t p3 = SA[i + 1];
d += (p3 < 0);
p3 &= SAINT_MAX;
sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
- SA[buckets[v3]++] =
- (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+ SA[buckets[v3]++] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
buckets[2 + v3] = d;
}
@@ -3022,8 +2816,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(
d += (p < 0);
p &= SAINT_MAX;
sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
- SA[buckets[v]++] =
- (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ SA[buckets[v]++] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
buckets[2 + v] = d;
}
@@ -3040,8 +2833,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
i < j; i += 2) {
libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
@@ -3076,10 +2868,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
p0 &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
SA[induction_bucket[T[p0 - 1]]++] =
- (p0 - 1) |
- ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v0] = d;
}
@@ -3091,10 +2881,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
p1 &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
SA[induction_bucket[T[p1 - 1]]++] =
- (p1 - 1) |
- ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v1] = d;
}
}
@@ -3108,8 +2896,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
p &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
SA[induction_bucket[T[p - 1]]++] =
- (p - 1) |
- ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
+ (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) |
((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v] = d;
}
@@ -3118,15 +2905,15 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(
return d;
}
-static void libsais_partial_sorting_scan_left_to_right_32s_1k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
i < j; i += 2) {
libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
@@ -3152,16 +2939,14 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(
if (p0 > 0) {
SA[i + 0] = 0;
SA[induction_bucket[T[p0 - 1]]++] =
- (p0 - 1) |
- ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
}
sa_sint_t p1 = SA[i + 1];
SA[i + 1] = p1 & SAINT_MAX;
if (p1 > 0) {
SA[i + 1] = 0;
SA[induction_bucket[T[p1 - 1]]++] =
- (p1 - 1) |
- ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
}
}
@@ -3179,15 +2964,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k(
#if defined(_OPENMP)
static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -3225,15 +3008,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
}
static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3283,15 +3064,13 @@ static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
}
static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -3309,8 +3088,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
if (p0 > 0) {
- cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1])
- << (SAINT_BIT - 1));
+ cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1));
symbol0 = T[p0 - 1];
p0 = 0;
}
@@ -3318,8 +3096,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
SA[i + 0] = p0 & SAINT_MAX;
sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
if (p1 > 0) {
- cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1])
- << (SAINT_BIT - 1));
+ cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1));
symbol1 = T[p1 - 1];
p1 = 0;
}
@@ -3330,8 +3107,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
for (j += prefetch_distance + 1; i < j; i += 1) {
sa_sint_t symbol = SAINT_MIN, p = SA[i];
if (p > 0) {
- cache[i].index =
- (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
+ cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1));
symbol = T[p - 1];
p = 0;
}
@@ -3347,8 +3123,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
- for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
- i += 2) {
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
@@ -3357,8 +3132,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index;
d += (p0 < 0);
cache[i + 0].symbol = buckets[v0]++;
- cache[i + 0].index =
- (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+ cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
buckets[2 + v0] = d;
if (cache[i + 0].symbol < omp_block_end) {
sa_sint_t s = cache[i + 0].symbol,
@@ -3369,8 +3143,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index;
d += (p1 < 0);
cache[i + 1].symbol = buckets[v1]++;
- cache[i + 1].index =
- (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+ cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
buckets[2 + v1] = d;
if (cache[i + 1].symbol < omp_block_end) {
sa_sint_t s = cache[i + 1].symbol,
@@ -3383,12 +3156,10 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
sa_sint_t v = cache[i].symbol, p = cache[i].index;
d += (p < 0);
cache[i].symbol = buckets[v]++;
- cache[i].index =
- (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
buckets[2 + v] = d;
if (cache[i].symbol < omp_block_end) {
- sa_sint_t s = cache[i].symbol,
- q = (cache[s].index = cache[i].index) & SAINT_MAX;
+ sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]);
}
}
@@ -3397,17 +3168,16 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
- const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
- for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
- i += 2) {
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -3427,16 +3197,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
d += (p0 >> (SUFFIX_GROUP_BIT - 1));
cache[i + 0].symbol = induction_bucket[v0 >> 1]++;
cache[i + 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v0] = d;
if (cache[i + 0].symbol < omp_block_end) {
sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
if (np > 0) {
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
np = 0;
}
cache[i + 0].index = np & SAINT_MAX;
@@ -3449,16 +3217,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
d += (p1 >> (SUFFIX_GROUP_BIT - 1));
cache[i + 1].symbol = induction_bucket[v1 >> 1]++;
cache[i + 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v1] = d;
if (cache[i + 1].symbol < omp_block_end) {
sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
if (np > 0) {
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
np = 0;
}
cache[i + 1].index = np & SAINT_MAX;
@@ -3472,17 +3238,15 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
sa_sint_t p = cache[i].index;
d += (p >> (SUFFIX_GROUP_BIT - 1));
cache[i].symbol = induction_bucket[v >> 1]++;
- cache[i].index =
- (p - 1) | (v << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+ cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v] = d;
if (cache[i].symbol < omp_block_end) {
sa_sint_t ni = cache[i].symbol, np = cache[i].index;
if (np > 0) {
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]);
np = 0;
}
cache[i].index = np & SAINT_MAX;
@@ -3500,8 +3264,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
- for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
- i += 2) {
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -3518,8 +3281,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index;
if (np > 0) {
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
np = 0;
}
@@ -3534,8 +3296,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index;
if (np > 0) {
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
np = 0;
}
@@ -3552,8 +3313,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
sa_sint_t ni = cache[i].symbol, np = cache[i].index;
if (np > 0) {
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
np = 0;
}
@@ -3564,13 +3324,11 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
fast_sint_t block_size, sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -3585,22 +3343,20 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_left_to_right_32s_6k(
- T, SA, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -3608,15 +3364,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
#pragma omp master
{
d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(
- T, buckets, d, cache - block_start, block_start,
- block_size);
+ T, buckets, d, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_place_cached_suffixes(SA, cache - block_start,
- omp_block_start, omp_block_size);
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -3627,12 +3382,10 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads) {
+ sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -3647,22 +3400,20 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_left_to_right_32s_4k(
- T, SA, k, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -3670,15 +3421,14 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
#pragma omp master
{
d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(
- T, k, buckets, d, cache - block_start, block_start,
- block_size);
+ T, k, buckets, d, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -3688,12 +3438,11 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(
}
static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -3708,22 +3457,20 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_partial_sorting_scan_left_to_right_32s_1k(
- T, SA, buckets, omp_block_start, omp_block_size);
+ libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -3737,8 +3484,8 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -3749,23 +3496,20 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] =
- (n - 1) | SAINT_MIN;
+ sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;
if (threads == 1 || left_suffixes_count < 65536) {
- d = libsais_partial_sorting_scan_left_to_right_32s_6k(
- T, SA, buckets, d, 0, left_suffixes_count);
+ d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0,
+ left_suffixes_count);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = 0; block_start < left_suffixes_count;
- block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = 0; block_start < left_suffixes_count; block_start = block_end) {
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end > left_suffixes_count) {
block_end = left_suffixes_count;
}
@@ -3783,27 +3527,24 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
}
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t * RESTRICT induction_bucket = &buckets[2 * k];
sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
SA[induction_bucket[T[n - 1]]++] =
- (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) |
- SUFFIX_GROUP_MARKER;
+ (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;
if (threads == 1 || n < 65536) {
- d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets,
- d, 0, n);
+ d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
for (block_start = 0; block_start < n; block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end > n) {
block_end = n;
}
@@ -3822,10 +3563,8 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- SA[buckets[T[n - 1]]++] =
- (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ SA[buckets[T[n - 1]]++] = (n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
if (threads == 1 || n < 65536) {
libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
@@ -3834,15 +3573,14 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
else {
fast_sint_t block_start, block_end;
for (block_start = 0; block_start < n; block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end > n) {
block_end = n;
}
libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(
- T, SA, buckets, thread_state[0].state.cache, block_start,
- block_end - block_start, threads);
+ T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start,
+ threads);
}
}
#else
@@ -3850,9 +3588,9 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
#endif
}
-static void libsais_partial_sorting_shift_markers_8u_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets,
- sa_sint_t threads) {
+static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ const sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads) {
const fast_sint_t prefetch_distance = 32;
const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];
@@ -3860,8 +3598,7 @@ static void libsais_partial_sorting_shift_markers_8u_omp(
fast_sint_t c;
#if defined(_OPENMP)
- #pragma omp parallel for schedule(static, 1) \
- num_threads(threads) if (threads > 1 && n >= 65536)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
#else
UNUSED(threads);
UNUSED(n);
@@ -3897,9 +3634,9 @@ static void libsais_partial_sorting_shift_markers_8u_omp(
}
}
-static void libsais_partial_sorting_shift_markers_32s_6k_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets,
- sa_sint_t threads) {
+static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k,
+ const sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads) {
const fast_sint_t prefetch_distance = 32;
const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
@@ -3907,8 +3644,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(
fast_sint_t c;
#if defined(_OPENMP)
- #pragma omp parallel for schedule(static, 1) \
- num_threads(threads) if (threads > 1 && k >= 65536)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && k >= 65536)
#else
UNUSED(threads);
#endif
@@ -3942,8 +3678,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(
}
}
-static void libsais_partial_sorting_shift_markers_32s_4k(
- sa_sint_t * RESTRICT SA, sa_sint_t n) {
+static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i;
@@ -3951,63 +3686,54 @@ static void libsais_partial_sorting_shift_markers_32s_4k(
for (i = (fast_sint_t)n - 1; i >= 3; i -= 4) {
libsais_prefetchw(&SA[i - prefetch_distance]);
- sa_sint_t p0 = SA[i - 0],
- q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
- ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) &
+ ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1)));
s = s ^ q0;
SA[i - 0] = p0 ^ q0;
- sa_sint_t p1 = SA[i - 1],
- q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
- ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) &
+ ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1)));
s = s ^ q1;
SA[i - 1] = p1 ^ q1;
- sa_sint_t p2 = SA[i - 2],
- q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
- ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) &
+ ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1)));
s = s ^ q2;
SA[i - 2] = p2 ^ q2;
- sa_sint_t p3 = SA[i - 3],
- q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
- ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) &
+ ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1)));
s = s ^ q3;
SA[i - 3] = p3 ^ q3;
}
for (; i >= 0; i -= 1) {
- sa_sint_t p = SA[i],
- q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
- ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
+ sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) &
+ ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1)));
s = s ^ q;
SA[i] = p ^ q;
}
}
-static void libsais_partial_sorting_shift_buckets_32s_6k(
- sa_sint_t k, sa_sint_t * RESTRICT buckets) {
+static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k,
+ sa_sint_t * RESTRICT buckets) {
sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k];
fast_sint_t i;
for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0);
i += BUCKETS_INDEX2(1, 0)) {
- buckets[2 * i + BUCKETS_INDEX4(0, 0)] =
- temp_bucket[i + BUCKETS_INDEX2(0, 0)];
- buckets[2 * i + BUCKETS_INDEX4(0, 1)] =
- temp_bucket[i + BUCKETS_INDEX2(0, 1)];
+ buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
+ buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
}
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetch(&SA[i - 2 * prefetch_distance]);
@@ -4021,8 +3747,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
p0 &= SAINT_MAX;
sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
SA[--induction_bucket[v0]] =
- (p0 - 1) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
distinct_names[v0] = d;
sa_sint_t p1 = SA[i - 1];
@@ -4030,8 +3755,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
p1 &= SAINT_MAX;
sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
SA[--induction_bucket[v1]] =
- (p1 - 1) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
distinct_names[v1] = d;
}
@@ -4051,9 +3775,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(
#if defined(_OPENMP)
static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size,
LIBSAIS_THREAD_STATE * RESTRICT state) {
const fast_sint_t prefetch_distance = 32;
@@ -4064,8 +3787,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
fast_sint_t i, j, count = 0;
sa_sint_t d = 1;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetch(&SA[i - 2 * prefetch_distance]);
@@ -4077,15 +3799,13 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
sa_sint_t p0 = cache[count].index = SA[i - 0];
d += (p0 < 0);
p0 &= SAINT_MAX;
- sa_sint_t v0 = cache[count++].symbol =
- BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
+ sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
induction_bucket[v0]++;
distinct_names[v0] = d;
sa_sint_t p1 = cache[count].index = SA[i - 1];
d += (p1 < 0);
p1 &= SAINT_MAX;
- sa_sint_t v1 = cache[count++].symbol =
- BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
+ sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
induction_bucket[v1]++;
distinct_names[v1] = d;
}
@@ -4094,8 +3814,7 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
sa_sint_t p = cache[count].index = SA[i];
d += (p < 0);
p &= SAINT_MAX;
- sa_sint_t v = cache[count++].symbol =
- BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
induction_bucket[v]++;
distinct_names[v] = d;
}
@@ -4105,8 +3824,8 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
}
static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) {
+ sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t count, sa_sint_t d) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
@@ -4120,16 +3839,14 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
d += (p0 < 0);
sa_sint_t v0 = cache[i + 0].symbol;
SA[--induction_bucket[v0]] =
- (p0 - 1) |
- ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1));
distinct_names[v0] = d;
sa_sint_t p1 = cache[i + 1].index;
d += (p1 < 0);
sa_sint_t v1 = cache[i + 1].symbol;
SA[--induction_bucket[v1]] =
- (p1 - 1) |
- ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1));
distinct_names[v1] = d;
}
@@ -4144,14 +3861,13 @@ static void libsais_partial_sorting_scan_right_to_left_8u_block_place(
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -4166,33 +3882,30 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_right_to_left_8u(
- T, SA, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_right_to_left_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache, omp_block_start,
- omp_block_size, &thread_state[omp_thread_num]);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size,
+ &thread_state[omp_thread_num]);
}
#pragma omp barrier
#pragma omp master
{
- sa_sint_t * RESTRICT induction_bucket =
- &buckets[0 * ALPHABET_SIZE];
- sa_sint_t * RESTRICT distinct_names =
- &buckets[2 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
+ sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];
fast_sint_t t;
for (t = omp_num_threads - 1; t >= 0; --t) {
@@ -4203,15 +3916,13 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
fast_sint_t c;
for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
- sa_sint_t A = induction_bucket[c],
- B = temp_induction_bucket[c];
+ sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c];
induction_bucket[c] = A - B;
temp_induction_bucket[c] = A;
}
for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) {
- sa_sint_t A = distinct_names[c],
- B = temp_distinct_names[c], D = B + d;
+ sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d;
distinct_names[c] = B > 0 ? D : A;
temp_distinct_names[c] = A;
}
@@ -4240,16 +3951,15 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(
#endif
static void libsais_partial_sorting_scan_right_to_left_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
if (threads == 1 || (scan_end - scan_start) < 65536) {
- libsais_partial_sorting_scan_right_to_left_8u(
- T, SA, buckets, d, scan_start, scan_end - scan_start);
+ libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start,
+ scan_end - scan_start);
}
#if defined(_OPENMP)
else {
@@ -4262,9 +3972,8 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
block_start--;
} else {
fast_sint_t block_max_end =
- block_start -
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start - ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end < scan_start) {
block_max_end = scan_start - 1;
}
@@ -4279,17 +3988,14 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
sa_sint_t p = SA[block_start];
d += (p < 0);
p &= SAINT_MAX;
- sa_sint_t v =
- BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
+ sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
SA[--induction_bucket[v]] =
- (p - 1) | ((sa_sint_t)(distinct_names[v] != d)
- << (SAINT_BIT - 1));
+ (p - 1) | ((sa_sint_t)(distinct_names[v] != d) << (SAINT_BIT - 1));
distinct_names[v] = d;
}
} else {
d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(
- T, SA, buckets, d, block_end + 1, block_size, threads,
- thread_state);
+ T, SA, buckets, d, block_end + 1, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -4301,14 +4007,12 @@ static void libsais_partial_sorting_scan_right_to_left_8u_omp(
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetch(&SA[i - 3 * prefetch_distance]);
@@ -4328,16 +4032,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
d += (p2 < 0);
p2 &= SAINT_MAX;
sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
- SA[--buckets[v2]] =
- (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
+ SA[--buckets[v2]] = (p2 - 1) | ((sa_sint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1));
buckets[2 + v2] = d;
sa_sint_t p3 = SA[i - 1];
d += (p3 < 0);
p3 &= SAINT_MAX;
sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
- SA[--buckets[v3]] =
- (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
+ SA[--buckets[v3]] = (p3 - 1) | ((sa_sint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1));
buckets[2 + v3] = d;
}
@@ -4346,8 +4048,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(
d += (p < 0);
p &= SAINT_MAX;
sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
- SA[--buckets[v]] =
- (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ SA[--buckets[v]] = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
buckets[2 + v] = d;
}
@@ -4364,8 +4065,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
@@ -4399,10 +4099,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
p0 &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
SA[--induction_bucket[T[p0 - 1]]] =
- (p0 - 1) |
- ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v0] = d;
}
@@ -4413,10 +4111,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
p1 &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
SA[--induction_bucket[T[p1 - 1]]] =
- (p1 - 1) |
- ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v1] = d;
}
}
@@ -4429,8 +4125,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
p &= ~SUFFIX_GROUP_MARKER;
sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
SA[--induction_bucket[T[p - 1]]] =
- (p - 1) |
- ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
+ (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) |
((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v] = d;
}
@@ -4439,15 +4134,15 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k(
return d;
}
-static void libsais_partial_sorting_scan_right_to_left_32s_1k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
@@ -4472,15 +4167,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(
if (p0 > 0) {
SA[i - 0] = 0;
SA[--induction_bucket[T[p0 - 1]]] =
- (p0 - 1) |
- ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
+ (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
}
sa_sint_t p1 = SA[i - 1];
if (p1 > 0) {
SA[i - 1] = 0;
SA[--induction_bucket[T[p1 - 1]]] =
- (p1 - 1) |
- ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
+ (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
}
}
@@ -4497,15 +4190,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k(
#if defined(_OPENMP)
static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
@@ -4543,15 +4234,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
}
static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -4598,15 +4287,13 @@ static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
}
static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -4625,16 +4312,14 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0];
if (p0 > 0) {
SA[i + 0] = 0;
- cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1])
- << (SAINT_BIT - 1));
+ cache[i + 0].index = (p0 - 1) | ((sa_sint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1));
symbol0 = T[p0 - 1];
}
cache[i + 0].symbol = symbol0;
sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1];
if (p1 > 0) {
SA[i + 1] = 0;
- cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1])
- << (SAINT_BIT - 1));
+ cache[i + 1].index = (p1 - 1) | ((sa_sint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1));
symbol1 = T[p1 - 1];
}
cache[i + 1].symbol = symbol1;
@@ -4644,8 +4329,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
sa_sint_t symbol = SAINT_MIN, p = SA[i];
if (p > 0) {
SA[i] = 0;
- cache[i].index =
- (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
+ cache[i].index = (p - 1) | ((sa_sint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1));
symbol = T[p - 1];
}
cache[i].symbol = symbol;
@@ -4659,8 +4343,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
@@ -4670,8 +4353,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index;
d += (p0 < 0);
cache[i - 0].symbol = --buckets[v0];
- cache[i - 0].index =
- (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
+ cache[i - 0].index = (p0 - 1) | ((sa_sint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1));
buckets[2 + v0] = d;
if (cache[i - 0].symbol >= omp_block_start) {
sa_sint_t s = cache[i - 0].symbol,
@@ -4682,8 +4364,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index;
d += (p1 < 0);
cache[i - 1].symbol = --buckets[v1];
- cache[i - 1].index =
- (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
+ cache[i - 1].index = (p1 - 1) | ((sa_sint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1));
buckets[2 + v1] = d;
if (cache[i - 1].symbol >= omp_block_start) {
sa_sint_t s = cache[i - 1].symbol,
@@ -4696,12 +4377,10 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
sa_sint_t v = cache[i].symbol, p = cache[i].index;
d += (p < 0);
cache[i].symbol = --buckets[v];
- cache[i].index =
- (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
+ cache[i].index = (p - 1) | ((sa_sint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1));
buckets[2 + v] = d;
if (cache[i].symbol >= omp_block_start) {
- sa_sint_t s = cache[i].symbol,
- q = (cache[s].index = cache[i].index) & SAINT_MAX;
+ sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX;
cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]);
}
}
@@ -4710,17 +4389,16 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
- const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets,
- sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT induction_bucket = &buckets[3 * k];
sa_sint_t * RESTRICT distinct_names = &buckets[0 * k];
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
@@ -4741,8 +4419,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
d += (p0 >> (SUFFIX_GROUP_BIT - 1));
cache[i - 0].symbol = --induction_bucket[v0 >> 1];
cache[i - 0].index = (p0 - 1) | (v0 << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v0] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v0] = d;
if (cache[i - 0].symbol >= omp_block_start) {
sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
@@ -4750,8 +4427,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
cache[i - 0].index = 0;
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
}
}
}
@@ -4762,8 +4438,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
d += (p1 >> (SUFFIX_GROUP_BIT - 1));
cache[i - 1].symbol = --induction_bucket[v1 >> 1];
cache[i - 1].index = (p1 - 1) | (v1 << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v1] != d)
- << (SUFFIX_GROUP_BIT - 1));
+ ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v1] = d;
if (cache[i - 1].symbol >= omp_block_start) {
sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
@@ -4771,8 +4446,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
cache[i - 1].index = 0;
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
}
}
}
@@ -4784,9 +4458,8 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
sa_sint_t p = cache[i].index;
d += (p >> (SUFFIX_GROUP_BIT - 1));
cache[i].symbol = --induction_bucket[v >> 1];
- cache[i].index =
- (p - 1) | (v << (SAINT_BIT - 1)) |
- ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
+ cache[i].index = (p - 1) | (v << (SAINT_BIT - 1)) |
+ ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1));
distinct_names[v] = d;
if (cache[i].symbol >= omp_block_start) {
sa_sint_t ni = cache[i].symbol, np = cache[i].index;
@@ -4794,8 +4467,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
cache[i].index = 0;
cache[ni].index = np;
np &= ~SUFFIX_GROUP_MARKER;
- cache[ni].symbol =
- BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
+ cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
}
}
}
@@ -4811,8 +4483,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
@@ -4831,8 +4502,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
if (np > 0) {
cache[i - 0].index = 0;
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
}
}
@@ -4846,8 +4516,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
if (np > 0) {
cache[i - 1].index = 0;
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
}
}
@@ -4863,8 +4532,7 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
if (np > 0) {
cache[i].index = 0;
cache[ni].index =
- (np - 1) |
- ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
+ (np - 1) | ((sa_sint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np - 1];
}
}
@@ -4873,13 +4541,11 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
fast_sint_t block_size, sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -4894,22 +4560,20 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_right_to_left_32s_6k(
- T, SA, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -4917,15 +4581,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
#pragma omp master
{
d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(
- T, buckets, d, cache - block_start, block_start,
- block_size);
+ T, buckets, d, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_place_cached_suffixes(SA, cache - block_start,
- omp_block_start, omp_block_size);
+ libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -4936,12 +4599,10 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t * RESTRICT buckets, sa_sint_t d,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads) {
+ sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -4956,22 +4617,20 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- d = libsais_partial_sorting_scan_right_to_left_32s_4k(
- T, SA, k, buckets, d, omp_block_start, omp_block_size);
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -4979,15 +4638,14 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
#pragma omp master
{
d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(
- T, k, buckets, d, cache - block_start, block_start,
- block_size);
+ T, k, buckets, d, cache - block_start, block_start, block_size);
}
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -4997,12 +4655,11 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
}
static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -5017,22 +4674,20 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_partial_sorting_scan_right_to_left_32s_1k(
- T, SA, buckets, omp_block_start, omp_block_size);
+ libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -5046,8 +4701,8 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -5058,23 +4713,20 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+ sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;
if (threads == 1 || (scan_end - scan_start) < 65536) {
- d = libsais_partial_sorting_scan_right_to_left_32s_6k(
- T, SA, buckets, d, scan_start, scan_end - scan_start);
+ d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start,
+ scan_end - scan_start);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = scan_end - 1; block_start >= scan_start;
- block_start = block_end) {
- block_end = block_start -
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end < scan_start) {
block_end = scan_start - 1;
}
@@ -5092,27 +4744,24 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
}
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || n < 65536) {
- d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets,
- d, 0, n);
+ d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = (fast_sint_t)n - 1; block_start >= 0;
- block_start = block_end) {
- block_end = block_start -
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end < 0) {
block_end = -1;
}
d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(
- T, SA, k, buckets, d, thread_state[0].state.cache,
- block_end + 1, block_start - block_end, threads);
+ T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1,
+ block_start - block_end, threads);
}
}
#else
@@ -5124,25 +4773,22 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || n < 65536) {
libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = (fast_sint_t)n - 1; block_start >= 0;
- block_start = block_end) {
- block_end = block_start -
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end < 0) {
block_end = -1;
}
libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(
- T, SA, buckets, thread_state[0].state.cache, block_end + 1,
- block_start - block_end, threads);
+ T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end,
+ threads);
}
}
#else
@@ -5150,15 +4796,14 @@ static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
#endif
}
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(
- sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, l;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
- l = omp_block_start;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+ i += 4) {
libsais_prefetch(&SA[i + prefetch_distance]);
sa_sint_t s0 = SA[i + 0];
@@ -5184,15 +4829,14 @@ static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(
return l;
}
-static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(
- sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, l;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3,
- l = omp_block_start;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j;
+ i += 4) {
libsais_prefetch(&SA[i + prefetch_distance]);
sa_sint_t s0 = SA[i + 0];
@@ -5237,21 +4881,19 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- libsais_partial_sorting_gather_lms_suffixes_32s_4k(
- SA, omp_block_start, omp_block_size);
+ libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.position = omp_block_start;
thread_state[omp_thread_num].state.count =
- libsais_partial_sorting_gather_lms_suffixes_32s_4k(
- SA, omp_block_start, omp_block_size) -
+ libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start,
+ omp_block_size) -
omp_block_start;
}
@@ -5262,10 +4904,8 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(
fast_sint_t t, position = 0;
for (t = 0; t < omp_num_threads; ++t) {
if (t > 0 && thread_state[t].state.count > 0) {
- memmove(&SA[position],
- &SA[thread_state[t].state.position],
- (size_t)thread_state[t].state.count *
- sizeof(sa_sint_t));
+ memmove(&SA[position], &SA[thread_state[t].state.position],
+ (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
}
position += thread_state[t].state.count;
@@ -5295,21 +4935,19 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- libsais_partial_sorting_gather_lms_suffixes_32s_1k(
- SA, omp_block_start, omp_block_size);
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.position = omp_block_start;
thread_state[omp_thread_num].state.count =
- libsais_partial_sorting_gather_lms_suffixes_32s_1k(
- SA, omp_block_start, omp_block_size) -
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start,
+ omp_block_size) -
omp_block_start;
}
@@ -5320,10 +4958,8 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
fast_sint_t t, position = 0;
for (t = 0; t < omp_num_threads; ++t) {
if (t > 0 && thread_state[t].state.count > 0) {
- memmove(&SA[position],
- &SA[thread_state[t].state.position],
- (size_t)thread_state[t].state.count *
- sizeof(sa_sint_t));
+ memmove(&SA[position], &SA[thread_state[t].state.position],
+ (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
}
position += thread_state[t].state.count;
@@ -5334,103 +4970,91 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(
}
}
-static void libsais_induce_partial_order_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- memset(&buckets[2 * ALPHABET_SIZE], 0,
- 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
+static void libsais_induce_partial_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t * RESTRICT buckets,
+ sa_sint_t first_lms_suffix,
+ sa_sint_t left_suffixes_count, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(sa_sint_t));
sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(
T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads);
libsais_partial_sorting_scan_right_to_left_8u_omp(
- T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
- thread_state);
+ T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
}
static void libsais_induce_partial_order_32s_6k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix,
- sa_sint_t left_suffixes_count, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(
T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state);
libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads);
libsais_partial_sorting_shift_buckets_32s_6k(k, buckets);
libsais_partial_sorting_scan_right_to_left_32s_6k_omp(
- T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads,
- thread_state);
+ T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state);
}
-static void libsais_induce_partial_order_32s_4k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));
- sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(
- T, SA, n, k, buckets, 0, threads, thread_state);
+ sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0,
+ threads, thread_state);
libsais_partial_sorting_shift_markers_32s_4k(SA, n);
- libsais_partial_sorting_scan_right_to_left_32s_4k_omp(
- T, SA, n, k, buckets, d, threads, thread_state);
- libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads,
- thread_state);
-}
-
-static void libsais_induce_partial_order_32s_2k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
- T, SA, n, &buckets[1 * k], threads, thread_state);
- libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
- T, SA, n, &buckets[0 * k], threads, thread_state);
- libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
- thread_state);
-}
-
-static void libsais_induce_partial_order_32s_1k_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads,
+ thread_state);
+ libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * k], threads,
+ thread_state);
+ libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * k], threads,
+ thread_state);
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
+}
+
+static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t * RESTRICT buckets,
+ sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
libsais_count_suffixes_32s(T, n, k, buckets);
libsais_initialize_buckets_start_32s_1k(k, buckets);
- libsais_partial_sorting_scan_left_to_right_32s_1k_omp(
- T, SA, n, buckets, threads, thread_state);
+ libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
libsais_count_suffixes_32s(T, n, k, buckets);
libsais_initialize_buckets_end_32s_1k(k, buckets);
- libsais_partial_sorting_scan_right_to_left_32s_1k_omp(
- T, SA, n, buckets, threads, thread_state);
+ libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);
- libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads,
- thread_state);
+ libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}
-static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA,
- sa_sint_t m, sa_sint_t name,
- fast_sint_t omp_block_start,
+static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+ sa_sint_t name, fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT SAm = &SA[m];
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
sa_sint_t p0 = SA[i + 0];
SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN;
@@ -5455,9 +5079,9 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA,
return name;
}
-static fast_sint_t libsais_gather_marked_suffixes_8u(
- sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static fast_sint_t libsais_gather_marked_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m,
+ fast_sint_t l, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
l -= 1;
@@ -5514,20 +5138,17 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : m - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
if (omp_num_threads == 1) {
- name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start,
- omp_block_size);
+ name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.count =
- libsais_count_negative_marked_suffixes(SA, omp_block_start,
- omp_block_size);
+ libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -5539,13 +5160,11 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
}
if (omp_thread_num == omp_num_threads - 1) {
- name =
- (sa_sint_t)(count +
- thread_state[omp_thread_num].state.count);
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
}
- libsais_renumber_lms_suffixes_8u(
- SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -5555,8 +5174,8 @@ static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(
}
static void libsais_gather_marked_lms_suffixes_8u_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
#endif
@@ -5571,36 +5190,29 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
fast_sint_t omp_thread_num = 0;
fast_sint_t omp_num_threads = 1;
#endif
- fast_sint_t omp_block_stride =
- (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : ((fast_sint_t)n >> 1) - omp_block_start;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+ ? omp_block_stride
+ : ((fast_sint_t)n >> 1) - omp_block_start;
if (omp_num_threads == 1) {
- libsais_gather_marked_suffixes_8u(SA, m,
- (fast_sint_t)n + (fast_sint_t)fs,
+ libsais_gather_marked_suffixes_8u(SA, m, (fast_sint_t)n + (fast_sint_t)fs,
omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
if (omp_thread_num < omp_num_threads - 1) {
- thread_state[omp_thread_num].state.position =
- libsais_gather_marked_suffixes_8u(
- SA, m,
- (fast_sint_t)m + omp_block_start + omp_block_size,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
+ SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start,
+ omp_block_size);
thread_state[omp_thread_num].state.count =
(fast_sint_t)m + omp_block_start + omp_block_size -
thread_state[omp_thread_num].state.position;
} else {
- thread_state[omp_thread_num].state.position =
- libsais_gather_marked_suffixes_8u(
- SA, m, (fast_sint_t)n + (fast_sint_t)fs,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.position = libsais_gather_marked_suffixes_8u(
+ SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
thread_state[omp_thread_num].state.count =
(fast_sint_t)n + (fast_sint_t)fs -
thread_state[omp_thread_num].state.position;
@@ -5615,12 +5227,9 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
for (t = omp_num_threads - 1; t >= 0; --t) {
position -= thread_state[t].state.count;
- if (t != omp_num_threads - 1 &&
- thread_state[t].state.count > 0) {
- memmove(&SA[position],
- &SA[thread_state[t].state.position],
- (size_t)thread_state[t].state.count *
- sizeof(sa_sint_t));
+ if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) {
+ memmove(&SA[position], &SA[thread_state[t].state.position],
+ (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
}
}
}
@@ -5630,15 +5239,13 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(
}
static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
- sa_sint_t name =
- libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
+ sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
if (name < m) {
- libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads,
- thread_state);
+ libsais_gather_marked_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
} else {
fast_sint_t i;
for (i = 0; i < m; i += 1) {
@@ -5649,28 +5256,24 @@ static sa_sint_t libsais_renumber_and_gather_lms_suffixes_8u_omp(
return name;
}
-static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(
- sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m,
+ sa_sint_t name,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT SAm = &SA[m];
fast_sint_t i, j;
sa_sint_t p0, p1, p2, p3 = 0;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
- libsais_prefetchw(
- &SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
+ libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);
p0 = SA[i + 0];
SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN);
@@ -5696,8 +5299,7 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(
return name;
}
-static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
- sa_sint_t m,
+static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
@@ -5731,8 +5333,7 @@ static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
}
}
-static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA,
- sa_sint_t m,
+static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m,
fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
@@ -5740,8 +5341,7 @@ static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA,
sa_sint_t * RESTRICT SAm = &SA[m];
fast_sint_t i, j;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
- i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
libsais_prefetchw(&SAm[i + prefetch_distance]);
SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
@@ -5776,20 +5376,18 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : m - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
if (omp_num_threads == 1) {
- name = libsais_renumber_distinct_lms_suffixes_32s_4k(
- SA, m, 1, omp_block_start, omp_block_size);
+ name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.count =
- libsais_count_negative_marked_suffixes(SA, omp_block_start,
- omp_block_size);
+ libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -5801,13 +5399,11 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
}
if (omp_thread_num == omp_num_threads - 1) {
- name =
- (sa_sint_t)(count +
- thread_state[omp_thread_num].state.count);
+ name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
}
- libsais_renumber_distinct_lms_suffixes_32s_4k(
- SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
+ libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count,
+ omp_block_start, omp_block_size);
}
}
#endif
@@ -5816,9 +5412,8 @@ static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
return name - 1;
}
-static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m,
- sa_sint_t threads) {
+static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m, sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
#endif
@@ -5826,27 +5421,23 @@ static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA,
#if defined(_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
- fast_sint_t omp_block_stride =
- (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : ((fast_sint_t)n >> 1) - omp_block_start;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+ ? omp_block_stride
+ : ((fast_sint_t)n >> 1) - omp_block_start;
#else
UNUSED(threads);
fast_sint_t omp_block_start = 0;
fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
- libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start,
- omp_block_size);
+ libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
}
}
-static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m,
- sa_sint_t threads) {
+static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m, sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072)
#endif
@@ -5854,21 +5445,18 @@ static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA,
#if defined(_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
- fast_sint_t omp_block_stride =
- (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : ((fast_sint_t)n >> 1) - omp_block_start;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+ ? omp_block_stride
+ : ((fast_sint_t)n >> 1) - omp_block_start;
#else
UNUSED(threads);
fast_sint_t omp_block_start = 0;
fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
- libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start,
- omp_block_size);
+ libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
}
}
@@ -5877,8 +5465,8 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));
- sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(
- SA, m, threads, thread_state);
+ sa_sint_t name =
+ libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
if (name < m) {
libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
}
@@ -5887,8 +5475,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
}
static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t threads) {
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT SAm = &SA[m];
@@ -5896,32 +5483,22 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
{
libsais_gather_lms_suffixes_32s(T, SA, n);
- memset(&SA[m], 0,
- ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
+ memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));
fast_sint_t i, j;
- for (i = (fast_sint_t)n - (fast_sint_t)m,
- j = (fast_sint_t)n - 1 - prefetch_distance - 3;
+ for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3;
i < j; i += 4) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);
- SAm[((sa_uint_t)SA[i + 0]) >> 1] =
- SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 1]) >> 1] =
- SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 2]) >> 1] =
- SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
- SAm[((sa_uint_t)SA[i + 3]) >> 1] =
- SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
+ SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
}
for (j += prefetch_distance + 3; i < j; i += 1) {
@@ -5941,11 +5518,9 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);
fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1];
@@ -6009,8 +5584,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
return name - 1;
}
-static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m,
+static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
@@ -6018,9 +5592,8 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
const sa_sint_t * RESTRICT SAnm = &SA[n - m];
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]);
@@ -6039,8 +5612,7 @@ static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA,
}
}
-static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
- sa_sint_t n, sa_sint_t m,
+static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
@@ -6050,10 +5622,9 @@ static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
- fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : m - omp_block_start;
+ fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
#else
UNUSED(threads);
@@ -6061,68 +5632,63 @@ static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA,
fast_sint_t omp_block_size = m;
#endif
- libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start,
- omp_block_size);
+ libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
}
}
-static void libsais_place_lms_suffixes_interval_8u(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];
fast_sint_t c, j = n;
for (c = ALPHABET_SIZE - 2; c >= 0; --c) {
- fast_sint_t l =
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+ (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
if (l > 0) {
fast_sint_t i = bucket_end[c];
if (j - i > 0) {
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_interval_32s_4k(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
fast_sint_t c, j = n;
for (c = (fast_sint_t)k - 2; c >= 0; --c) {
- fast_sint_t l =
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
- (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
+ fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] -
+ (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
if (l > 0) {
fast_sint_t i = bucket_end[c];
if (j - i > 0) {
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_interval_32s_2k(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
fast_sint_t j = n;
if (k > 1) {
fast_sint_t c;
- for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
- c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+ for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+ c -= BUCKETS_INDEX2(1, 0)) {
fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] -
(fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
if (l > 0) {
@@ -6131,8 +5697,7 @@ static void libsais_place_lms_suffixes_interval_32s_2k(
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
}
@@ -6140,9 +5705,9 @@ static void libsais_place_lms_suffixes_interval_32s_2k(
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_interval_32s_1k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k,
- sa_sint_t m, sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t k,
+ sa_sint_t m, sa_sint_t * RESTRICT buckets) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t c = k - 1;
@@ -6158,32 +5723,28 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
sa_sint_t p0 = SA[i - 0];
if (T[p0] != c) {
c = T[p0];
- memset(&SA[buckets[c]], 0,
- (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
l = buckets[c];
}
SA[--l] = p0;
sa_sint_t p1 = SA[i - 1];
if (T[p1] != c) {
c = T[p1];
- memset(&SA[buckets[c]], 0,
- (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
l = buckets[c];
}
SA[--l] = p1;
sa_sint_t p2 = SA[i - 2];
if (T[p2] != c) {
c = T[p2];
- memset(&SA[buckets[c]], 0,
- (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
l = buckets[c];
}
SA[--l] = p2;
sa_sint_t p3 = SA[i - 3];
if (T[p3] != c) {
c = T[p3];
- memset(&SA[buckets[c]], 0,
- (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
l = buckets[c];
}
SA[--l] = p3;
@@ -6193,8 +5754,7 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
sa_sint_t p = SA[i];
if (T[p] != c) {
c = T[p];
- memset(&SA[buckets[c]], 0,
- (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
+ memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t));
l = buckets[c];
}
SA[--l] = p;
@@ -6203,9 +5763,9 @@ static void libsais_place_lms_suffixes_interval_32s_1k(
memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_histogram_32s_6k(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
const sa_sint_t * RESTRICT bucket_end = &buckets[5 * k];
fast_sint_t c, j = n;
@@ -6217,17 +5777,16 @@ static void libsais_place_lms_suffixes_histogram_32s_6k(
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_histogram_32s_4k(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
const sa_sint_t * RESTRICT bucket_end = &buckets[3 * k];
fast_sint_t c, j = n;
@@ -6239,23 +5798,22 @@ static void libsais_place_lms_suffixes_histogram_32s_4k(
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_place_lms_suffixes_histogram_32s_2k(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
- const sa_sint_t * RESTRICT buckets) {
+static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t m,
+ const sa_sint_t * RESTRICT buckets) {
fast_sint_t j = n;
if (k > 1) {
fast_sint_t c;
- for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0);
- c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) {
+ for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0);
+ c -= BUCKETS_INDEX2(1, 0)) {
fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
if (l > 0) {
fast_sint_t i = buckets[c];
@@ -6263,8 +5821,7 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(
memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
}
- memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l],
- (size_t)l * sizeof(sa_sint_t));
+ memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
}
}
}
@@ -6272,16 +5829,15 @@ static void libsais_place_lms_suffixes_histogram_32s_2k(
memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}
-static void libsais_final_bwt_scan_left_to_right_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_final_bwt_scan_left_to_right_8u(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6325,16 +5881,17 @@ static void libsais_final_bwt_scan_left_to_right_8u(
}
}
-static void libsais_final_bwt_aux_scan_left_to_right_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_left_to_right_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t rm,
+ sa_sint_t * RESTRICT I,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6387,16 +5944,16 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u(
}
}
-static void libsais_final_sorting_scan_left_to_right_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_left_to_right_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6437,15 +5994,15 @@ static void libsais_final_sorting_scan_left_to_right_8u(
}
}
-static void libsais_final_sorting_scan_left_to_right_32s(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1;
i < j; i += 2) {
libsais_prefetchw(&SA[i + 3 * prefetch_distance]);
@@ -6496,17 +6053,16 @@ static void libsais_final_sorting_scan_left_to_right_32s(
#if defined(_OPENMP)
static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t i, j, count = 0;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6526,8 +6082,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
p0--;
SA[i + 0] = T[p0] | SAINT_MIN;
buckets[cache[count].symbol = T[p0]]++;
- cache[count++].index =
- p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
}
sa_sint_t p1 = SA[i + 1];
SA[i + 1] = p1 & SAINT_MAX;
@@ -6535,8 +6090,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
p1--;
SA[i + 1] = T[p1] | SAINT_MIN;
buckets[cache[count].symbol = T[p1]]++;
- cache[count++].index =
- p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
}
}
@@ -6547,8 +6101,7 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
p--;
SA[i] = T[p] | SAINT_MIN;
buckets[cache[count].symbol = T[p]]++;
- cache[count++].index =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
}
}
@@ -6556,17 +6109,16 @@ static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(
}
static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t i, j, count = 0;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6585,16 +6137,14 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
if (p0 > 0) {
p0--;
buckets[cache[count].symbol = T[p0]]++;
- cache[count++].index =
- p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
}
sa_sint_t p1 = SA[i + 1];
SA[i + 1] = p1 ^ SAINT_MIN;
if (p1 > 0) {
p1--;
buckets[cache[count].symbol = T[p1]]++;
- cache[count++].index =
- p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
}
}
@@ -6604,8 +6154,7 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
if (p > 0) {
p--;
buckets[cache[count].symbol = T[p]]++;
- cache[count++].index =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
}
}
@@ -6613,8 +6162,8 @@ static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(
}
static void libsais_final_order_scan_left_to_right_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
+ sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t count) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
@@ -6633,9 +6182,8 @@ static void libsais_final_order_scan_left_to_right_8u_block_place(
}
static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t count) {
+ sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
@@ -6644,45 +6192,38 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(
SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
if ((cache[i + 0].index & rm) == 0) {
- I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i + 0].symbol];
+ I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol];
}
SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
if ((cache[i + 1].index & rm) == 0) {
- I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i + 1].symbol];
+ I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol];
}
SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
if ((cache[i + 2].index & rm) == 0) {
- I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i + 2].symbol];
+ I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol];
}
SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
if ((cache[i + 3].index & rm) == 0) {
- I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i + 3].symbol];
+ I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol];
}
}
for (j += 3; i < j; i += 1) {
SA[buckets[cache[i].symbol]++] = cache[i].index;
if ((cache[i].index & rm) == 0) {
- I[(cache[i].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i].symbol];
+ I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol];
}
}
}
static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -6702,8 +6243,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
SA[i + 0] = p0 ^ SAINT_MIN;
if (p0 > 0) {
p0--;
- cache[i + 0].index =
- p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
+ cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1));
symbol0 = T[p0];
}
cache[i + 0].symbol = symbol0;
@@ -6711,8 +6251,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
SA[i + 1] = p1 ^ SAINT_MIN;
if (p1 > 0) {
p1--;
- cache[i + 1].index =
- p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
+ cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1));
symbol1 = T[p1];
}
cache[i + 1].symbol = symbol1;
@@ -6723,8 +6262,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_gather(
SA[i] = p ^ SAINT_MIN;
if (p > 0) {
p--;
- cache[i].index =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
+ cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
symbol = T[p];
}
cache[i].symbol = symbol;
@@ -6738,8 +6276,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
- for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j;
- i += 2) {
+ for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) {
libsais_prefetchw(&cache[i + 2 * prefetch_distance]);
sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol;
@@ -6758,8 +6295,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -6774,8 +6310,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -6792,8 +6327,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] < T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -6802,14 +6336,13 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(
}
static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -6824,15 +6357,14 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_bwt_scan_left_to_right_8u(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -6840,8 +6372,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_bwt_scan_left_to_right_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -6850,8 +6381,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
{
fast_sint_t t;
for (t = 0; t < omp_num_threads; ++t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -6875,14 +6405,13 @@ static void libsais_final_bwt_scan_left_to_right_8u_block_omp(
}
static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -6897,16 +6426,14 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_bwt_aux_scan_left_to_right_8u(
- T, SA, rm, I, induction_bucket, omp_block_start,
- omp_block_size);
+ libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -6914,8 +6441,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_bwt_scan_left_to_right_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -6924,8 +6450,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
{
fast_sint_t t;
for (t = 0; t < omp_num_threads; ++t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -6949,14 +6474,13 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
}
static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -6971,15 +6495,14 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_sorting_scan_left_to_right_8u(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -6987,8 +6510,7 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_sorting_scan_left_to_right_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -6997,8 +6519,7 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
{
fast_sint_t t;
for (t = 0; t < omp_num_threads; ++t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -7022,12 +6543,11 @@ static void libsais_final_sorting_scan_left_to_right_8u_block_omp(
}
static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -7042,22 +6562,20 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_sorting_scan_left_to_right_32s(
- T, SA, buckets, omp_block_start, omp_block_size);
+ libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_final_sorting_scan_left_to_right_32s_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -7071,8 +6589,8 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_omp(
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -7087,8 +6605,7 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
((sa_sint_t)n - 1) |
- ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
- << (SAINT_BIT - 1));
+ ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
if (threads == 1 || n < 65536) {
libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
@@ -7101,9 +6618,8 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
block_start++;
} else {
fast_sint_t block_max_end =
- block_start +
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start + ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end > n) {
block_max_end = n;
}
@@ -7121,14 +6637,12 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
p--;
SA[block_start] = T[p] | SAINT_MIN;
SA[induction_bucket[T[p]]++] =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
- << (SAINT_BIT - 1));
+ p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
}
}
} else {
libsais_final_bwt_scan_left_to_right_8u_block_omp(
- T, SA, induction_bucket, block_start, block_size,
- threads, thread_state);
+ T, SA, induction_bucket, block_start, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -7141,21 +6655,18 @@ static void libsais_final_bwt_scan_left_to_right_8u_omp(
static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
((sa_sint_t)n - 1) |
- ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
- << (SAINT_BIT - 1));
+ ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
if ((((sa_sint_t)n - 1) & rm) == 0) {
- I[((sa_sint_t)n - 1) / (rm + 1)] =
- induction_bucket[T[(sa_sint_t)n - 1]];
+ I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]];
}
if (threads == 1 || n < 65536) {
- libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I,
- induction_bucket, 0, n);
+ libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
@@ -7165,9 +6676,8 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
block_start++;
} else {
fast_sint_t block_max_end =
- block_start +
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start + ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end > n) {
block_max_end = n;
}
@@ -7185,8 +6695,7 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
p--;
SA[block_start] = T[p] | SAINT_MIN;
SA[induction_bucket[T[p]]++] =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
- << (SAINT_BIT - 1));
+ p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
if ((p & rm) == 0) {
I[p / (rm + 1)] = induction_bucket[T[p]];
}
@@ -7194,8 +6703,8 @@ static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(
}
} else {
libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(
- T, SA, rm, I, induction_bucket, block_start, block_size,
- threads, thread_state);
+ T, SA, rm, I, induction_bucket, block_start, block_size, threads,
+ thread_state);
block_start = block_end;
}
}
@@ -7212,12 +6721,10 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
SA[induction_bucket[T[(sa_sint_t)n - 1]]++] =
((sa_sint_t)n - 1) |
- ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1])
- << (SAINT_BIT - 1));
+ ((sa_sint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));
if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0,
- n);
+ libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
@@ -7227,9 +6734,8 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
block_start++;
} else {
fast_sint_t block_max_end =
- block_start +
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start + ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end > n) {
block_max_end = n;
}
@@ -7246,14 +6752,12 @@ static void libsais_final_sorting_scan_left_to_right_8u_omp(
if (p > 0) {
p--;
SA[induction_bucket[T[p]]++] =
- p | ((sa_sint_t)(T[p - (p > 0)] < T[p])
- << (SAINT_BIT - 1));
+ p | ((sa_sint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1));
}
}
} else {
libsais_final_sorting_scan_left_to_right_8u_block_omp(
- T, SA, induction_bucket, block_start, block_size,
- threads, thread_state);
+ T, SA, induction_bucket, block_start, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -7272,22 +6776,20 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(
(n - 1) | ((sa_sint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));
if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0,
- n);
+ libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
for (block_start = 0; block_start < n; block_start = block_end) {
- block_end = block_start +
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end > n) {
block_end = n;
}
libsais_final_sorting_scan_left_to_right_32s_block_omp(
- T, SA, induction_bucket, thread_state[0].state.cache,
- block_start, block_end - block_start, threads);
+ T, SA, induction_bucket, thread_state[0].state.cache, block_start,
+ block_end - block_start, threads);
}
}
#else
@@ -7295,16 +6797,16 @@ static void libsais_final_sorting_scan_left_to_right_32s_omp(
#endif
}
-static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
sa_sint_t index = -1;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7358,15 +6860,16 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(
return index;
}
-static void libsais_final_bwt_aux_scan_right_to_left_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_final_bwt_aux_scan_right_to_left_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA, sa_sint_t rm,
+ sa_sint_t * RESTRICT I,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7424,15 +6927,15 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u(
}
}
-static void libsais_final_sorting_scan_right_to_left_8u(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_8u(const u8 * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7474,15 +6977,15 @@ static void libsais_final_sorting_scan_right_to_left_8u(
}
}
-static void libsais_final_sorting_scan_right_to_left_32s(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T,
+ sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + 2 * prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 3 * prefetch_distance]);
@@ -7533,16 +7036,15 @@ static void libsais_final_sorting_scan_right_to_left_32s(
#if defined(_OPENMP)
static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t i, j, count = 0;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7596,16 +7098,15 @@ static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(
}
static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t i, j, count = 0;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7665,16 +7166,15 @@ static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
}
static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
memset(buckets, 0, ALPHABET_SIZE * sizeof(sa_sint_t));
fast_sint_t i, j, count = 0;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&SA[i - 2 * prefetch_distance]);
@@ -7694,16 +7194,14 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
if (p0 > 0) {
p0--;
buckets[cache[count].symbol = T[p0]]++;
- cache[count++].index =
- p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+ cache[count++].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
}
sa_sint_t p1 = SA[i - 1];
SA[i - 1] = p1 & SAINT_MAX;
if (p1 > 0) {
p1--;
buckets[cache[count].symbol = T[p1]]++;
- cache[count++].index =
- p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+ cache[count++].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
}
}
@@ -7713,8 +7211,7 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
if (p > 0) {
p--;
buckets[cache[count].symbol = T[p]]++;
- cache[count++].index =
- p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+ cache[count++].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
}
}
@@ -7722,8 +7219,8 @@ static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(
}
static void libsais_final_order_scan_right_to_left_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
+ sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t count) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
@@ -7742,9 +7239,8 @@ static void libsais_final_order_scan_right_to_left_8u_block_place(
}
static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
- sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t count) {
+ sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
@@ -7772,22 +7268,19 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(
for (j += 6; i < j; i += 2) {
SA[--buckets[cache[i].symbol]] = cache[i].index;
if ((cache[i + 1].index & rm) == 0) {
- I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] =
- buckets[cache[i].symbol] + 1;
+ I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1;
}
}
}
static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 1;
- i < j; i += 2) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j;
+ i += 2) {
libsais_prefetchw(&SA[i + 2 * prefetch_distance]);
sa_sint_t s0 = SA[i + prefetch_distance + 0];
@@ -7807,8 +7300,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
SA[i + 0] = p0 & SAINT_MAX;
if (p0 > 0) {
p0--;
- cache[i + 0].index =
- p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
+ cache[i + 0].index = p0 | ((sa_sint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1));
symbol0 = T[p0];
}
cache[i + 0].symbol = symbol0;
@@ -7816,8 +7308,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
SA[i + 1] = p1 & SAINT_MAX;
if (p1 > 0) {
p1--;
- cache[i + 1].index =
- p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
+ cache[i + 1].index = p1 | ((sa_sint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1));
symbol1 = T[p1];
}
cache[i + 1].symbol = symbol1;
@@ -7828,8 +7319,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_gather(
SA[i] = p & SAINT_MAX;
if (p > 0) {
p--;
- cache[i].index =
- p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
+ cache[i].index = p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
symbol = T[p];
}
cache[i].symbol = symbol;
@@ -7843,8 +7333,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start + omp_block_size - 1,
- j = omp_block_start + prefetch_distance + 1;
+ for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1;
i >= j; i -= 2) {
libsais_prefetchw(&cache[i - 2 * prefetch_distance]);
@@ -7864,8 +7353,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -7880,8 +7368,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -7898,8 +7385,7 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
if (np > 0) {
np--;
cache[ni].index =
- np | ((sa_sint_t)(T[np - (np > 0)] > T[np])
- << (SAINT_BIT - 1));
+ np | ((sa_sint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1));
cache[ni].symbol = T[np];
}
}
@@ -7908,14 +7394,13 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_sort(
}
static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -7930,15 +7415,14 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_bwt_scan_right_to_left_8u(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -7946,8 +7430,7 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_bwt_scan_right_to_left_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -7956,8 +7439,7 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
{
fast_sint_t t;
for (t = omp_num_threads - 1; t >= 0; --t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -7981,14 +7463,13 @@ static void libsais_final_bwt_scan_right_to_left_8u_block_omp(
}
static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I,
+ sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -8003,16 +7484,14 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_bwt_aux_scan_right_to_left_8u(
- T, SA, rm, I, induction_bucket, omp_block_start,
- omp_block_size);
+ libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket,
+ omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -8020,8 +7499,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8030,8 +7508,7 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
{
fast_sint_t t;
for (t = omp_num_threads - 1; t >= 0; --t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -8055,14 +7532,13 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
}
static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start,
- fast_sint_t block_size, sa_sint_t threads,
+ const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket,
+ fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads( \
- threads) if (threads > 1 && block_size >= 64 * ALPHABET_SIZE && \
- omp_get_dynamic() == 0)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && \
+ block_size >= 64 * ALPHABET_SIZE && \
+ omp_get_dynamic() == 0)
#endif
{
#if defined(_OPENMP)
@@ -8077,15 +7553,14 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_sorting_scan_right_to_left_8u(
- T, SA, induction_bucket, omp_block_start, omp_block_size);
+ libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
@@ -8093,8 +7568,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
thread_state[omp_thread_num].state.count =
libsais_final_sorting_scan_right_to_left_8u_block_prepare(
T, SA, thread_state[omp_thread_num].state.buckets,
- thread_state[omp_thread_num].state.cache,
- omp_block_start, omp_block_size);
+ thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8103,8 +7577,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
{
fast_sint_t t;
for (t = omp_num_threads - 1; t >= 0; --t) {
- sa_sint_t * RESTRICT temp_bucket =
- thread_state[t].state.buckets;
+ sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_sint_t A = induction_bucket[c], B = temp_bucket[c];
@@ -8128,12 +7601,11 @@ static void libsais_final_sorting_scan_right_to_left_8u_block_omp(
}
static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache,
- fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) {
+ const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
+ LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size,
+ sa_sint_t threads) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- block_size >= 16384)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && block_size >= 16384)
#endif
{
#if defined(_OPENMP)
@@ -8148,22 +7620,20 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
#endif
fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : block_size - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;
omp_block_start += block_start;
if (omp_num_threads == 1) {
- libsais_final_sorting_scan_right_to_left_32s(
- T, SA, buckets, omp_block_start, omp_block_size);
+ libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
libsais_final_sorting_scan_right_to_left_32s_block_gather(
- T, SA, cache - block_start, omp_block_start,
- omp_block_size);
+ T, SA, cache - block_start, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8177,8 +7647,8 @@ static void libsais_final_sorting_scan_right_to_left_32s_block_omp(
#pragma omp barrier
{
- libsais_compact_and_place_cached_suffixes(
- SA, cache - block_start, omp_block_start, omp_block_size);
+ libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -8194,8 +7664,7 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
sa_sint_t index = -1;
if (threads == 1 || n < 65536) {
- index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket,
- 0, n);
+ index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
@@ -8205,9 +7674,8 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
index = (sa_sint_t)block_start--;
} else {
fast_sint_t block_max_end =
- block_start -
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start - ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end < 0) {
block_max_end = -1;
}
@@ -8231,8 +7699,7 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
}
} else {
libsais_final_bwt_scan_right_to_left_8u_block_omp(
- T, SA, induction_bucket, block_end + 1, block_size,
- threads, thread_state);
+ T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -8247,11 +7714,10 @@ static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(
static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t rm,
- sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || n < 65536) {
- libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I,
- induction_bucket, 0, n);
+ libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
@@ -8262,9 +7728,8 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
} else {
fast_sint_t block_max_end =
block_start -
- ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads) /
- 2);
+ ((fast_sint_t)threads) *
+ ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2);
if (block_max_end < 0) {
block_max_end = -1;
}
@@ -8291,8 +7756,8 @@ static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(
}
} else {
libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(
- T, SA, rm, I, induction_bucket, block_end + 1,
- block_size, threads, thread_state);
+ T, SA, rm, I, induction_bucket, block_end + 1, block_size, threads,
+ thread_state);
block_start = block_end;
}
}
@@ -8308,8 +7773,7 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0,
- n);
+ libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
@@ -8319,9 +7783,8 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
block_start--;
} else {
fast_sint_t block_max_end =
- block_start -
- ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE -
- 16 * (fast_sint_t)threads);
+ block_start - ((fast_sint_t)threads) *
+ (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads);
if (block_max_end < -1) {
block_max_end = -1;
}
@@ -8338,14 +7801,12 @@ static void libsais_final_sorting_scan_right_to_left_8u_omp(
if (p > 0) {
p--;
SA[--induction_bucket[T[p]]] =
- p | ((sa_sint_t)(T[p - (p > 0)] > T[p])
- << (SAINT_BIT - 1));
+ p | ((sa_sint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1));
}
}
} else {
libsais_final_sorting_scan_right_to_left_8u_block_omp(
- T, SA, induction_bucket, block_end + 1, block_size,
- threads, thread_state);
+ T, SA, induction_bucket, block_end + 1, block_size, threads, thread_state);
block_start = block_end;
}
}
@@ -8361,23 +7822,20 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (threads == 1 || n < 65536) {
- libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0,
- n);
+ libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
}
#if defined(_OPENMP)
else {
fast_sint_t block_start, block_end;
- for (block_start = (fast_sint_t)n - 1; block_start >= 0;
- block_start = block_end) {
- block_end = block_start -
- (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
+ for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) {
+ block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE;
if (block_end < 0) {
block_end = -1;
}
libsais_final_sorting_scan_right_to_left_32s_block_omp(
- T, SA, induction_bucket, thread_state[0].state.cache,
- block_end + 1, block_start - block_end, threads);
+ T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1,
+ block_start - block_end, threads);
}
}
#else
@@ -8385,16 +7843,13 @@ static void libsais_final_sorting_scan_right_to_left_32s_omp(
#endif
}
-static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k,
+static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
sa_sint_t * RESTRICT bucket_start,
- sa_sint_t * RESTRICT bucket_end,
- sa_sint_t threads) {
+ sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) {
fast_sint_t c;
#if defined(_OPENMP)
- #pragma omp parallel for schedule(static, 1) \
- num_threads(threads) if (threads > 1 && n >= 65536)
+ #pragma omp parallel for schedule(static, 1) num_threads(threads) if (threads > 1 && n >= 65536)
#else
UNUSED(threads);
UNUSED(n);
@@ -8402,96 +7857,89 @@ static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n,
for (c = 0; c < k; ++c) {
if (bucket_end[c] > bucket_start[c]) {
memset(&SA[bucket_start[c]], 0,
- ((size_t)bucket_end[c] - (size_t)bucket_start[c]) *
- sizeof(sa_sint_t));
+ ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
}
}
}
-static sa_sint_t libsais_induce_final_order_8u_omp(
- const u8 * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t bwt,
- sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_induce_final_order_8u_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
+ sa_sint_t * RESTRICT I,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (!bwt) {
- libsais_final_sorting_scan_left_to_right_8u_omp(
- T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+ libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE],
+ threads, thread_state);
if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(
- SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
+ libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+ &buckets[7 * ALPHABET_SIZE], threads);
}
- libsais_final_sorting_scan_right_to_left_8u_omp(
- T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+ libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
+ threads, thread_state);
return 0;
} else if (I != NULL) {
libsais_final_bwt_aux_scan_left_to_right_8u_omp(
- T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads,
- thread_state);
+ T, SA, n, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(
- SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
+ libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+ &buckets[7 * ALPHABET_SIZE], threads);
}
libsais_final_bwt_aux_scan_right_to_left_8u_omp(
- T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads,
- thread_state);
+ T, SA, n, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
return 0;
} else {
- libsais_final_bwt_scan_left_to_right_8u_omp(
- T, SA, n, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
+ libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, &buckets[6 * ALPHABET_SIZE], threads,
+ thread_state);
if (threads > 1 && n >= 65536) {
- libsais_clear_lms_suffixes_omp(
- SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
- &buckets[7 * ALPHABET_SIZE], threads);
+ libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE],
+ &buckets[7 * ALPHABET_SIZE], threads);
}
- return libsais_final_bwt_scan_right_to_left_8u_omp(
- T, SA, n, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
+ return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, &buckets[7 * ALPHABET_SIZE],
+ threads, thread_state);
}
}
-static void libsais_induce_final_order_32s_6k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k],
- threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k],
- threads, thread_state);
+static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * k], threads,
+ thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * k], threads,
+ thread_state);
}
-static void libsais_induce_final_order_32s_4k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k],
- threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k],
- threads, thread_state);
+static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * k], threads,
+ thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * k], threads,
+ thread_state);
}
-static void libsais_induce_final_order_32s_2k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k],
- threads, thread_state);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k],
- threads, thread_state);
+static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * k], threads,
+ thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * k], threads,
+ thread_state);
}
-static void libsais_induce_final_order_32s_1k(
- const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t k,
+ sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
libsais_count_suffixes_32s(T, n, k, buckets);
libsais_initialize_buckets_start_32s_1k(k, buckets);
- libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads,
- thread_state);
+ libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
libsais_count_suffixes_32s(T, n, k, buckets);
libsais_initialize_buckets_end_32s_1k(k, buckets);
- libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads,
- thread_state);
+ libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
}
static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
@@ -8502,20 +7950,16 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
sa_sint_t * RESTRICT SAm = &SA[m];
sa_sint_t i, j;
- for (i = (sa_sint_t)omp_block_start,
- j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size -
- 2 * (sa_sint_t)prefetch_distance - 3;
+ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start +
+ (sa_sint_t)omp_block_size -
+ 2 * (sa_sint_t)prefetch_distance - 3;
i < j; i += 4) {
libsais_prefetch(&SA[i + 3 * prefetch_distance]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
- libsais_prefetchw(
- &SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
+ libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);
sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0];
const sa_sint_t * Tq0 = &T[q0];
@@ -8578,9 +8022,11 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
return f;
}
-static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(
- sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
+ sa_sint_t m, fast_sint_t * pl,
+ fast_sint_t * pr,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
sa_sint_t * RESTRICT SAl = &SA[0];
@@ -8628,8 +8074,7 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(
#if defined(_OPENMP)
-static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
- sa_sint_t m,
+static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m,
fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
@@ -8638,9 +8083,8 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
fast_sint_t i, j;
sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&SA[i + 2 * prefetch_distance]);
libsais_prefetch(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
@@ -8664,8 +8108,8 @@ static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA,
#endif
static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
sa_sint_t f = 0;
#if defined(_OPENMP)
@@ -8684,20 +8128,18 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : m - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
if (omp_num_threads == 1) {
- f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
- T, SA, m, 0, omp_block_start, omp_block_size);
+ f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.count =
- libsais_count_unique_suffixes(SA, m, omp_block_start,
- omp_block_size);
+ libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8709,13 +8151,11 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
}
if (omp_thread_num == omp_num_threads - 1) {
- f = (sa_sint_t)(count +
- thread_state[omp_thread_num].state.count);
+ f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
}
libsais_renumber_unique_and_nonunique_lms_suffixes_32s(
- T, SA, m, (sa_sint_t)count, omp_block_start,
- omp_block_size);
+ T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
}
}
#endif
@@ -8725,12 +8165,10 @@ static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
}
static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
- sa_sint_t f, sa_sint_t threads,
+ sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
- #pragma omp parallel num_threads(threads) if (threads > 1 && \
- n >= 131072 && m < fs)
+ #pragma omp parallel num_threads(threads) if (threads > 1 && n >= 131072 && m < fs)
#endif
{
#if defined(_OPENMP)
@@ -8743,32 +8181,28 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
fast_sint_t omp_thread_num = 0;
fast_sint_t omp_num_threads = 1;
#endif
- fast_sint_t omp_block_stride =
- (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size =
- omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : ((fast_sint_t)n >> 1) - omp_block_start;
+ fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
+ ? omp_block_stride
+ : ((fast_sint_t)n >> 1) - omp_block_start;
if (omp_num_threads == 1) {
fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
- libsais_compact_unique_and_nonunique_lms_suffixes_32s(
- SA, m, &l, &r, omp_block_start, omp_block_size);
+ libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start,
+ omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.position =
- (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start +
- omp_block_size;
+ (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
thread_state[omp_thread_num].state.count =
(fast_sint_t)m + omp_block_start + omp_block_size;
libsais_compact_unique_and_nonunique_lms_suffixes_32s(
SA, m, &thread_state[omp_thread_num].state.position,
- &thread_state[omp_thread_num].state.count, omp_block_start,
- omp_block_size);
+ &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8778,29 +8212,24 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
fast_sint_t t, position;
for (position = m, t = omp_num_threads - 1; t >= 0; --t) {
- fast_sint_t omp_block_end = t < omp_num_threads - 1
- ? omp_block_stride * (t + 1)
- : ((fast_sint_t)n >> 1);
- fast_sint_t count =
- ((fast_sint_t)m + ((fast_sint_t)n >> 1) +
- omp_block_end - thread_state[t].state.position);
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
+ : ((fast_sint_t)n >> 1);
+ fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end -
+ thread_state[t].state.position);
if (count > 0) {
position -= count;
- memcpy(&SA[position],
- &SA[thread_state[t].state.position],
+ memcpy(&SA[position], &SA[thread_state[t].state.position],
(size_t)count * sizeof(sa_sint_t));
}
}
- for (position = (fast_sint_t)n + (fast_sint_t)fs,
- t = omp_num_threads - 1;
- t >= 0; --t) {
- fast_sint_t omp_block_end = t < omp_num_threads - 1
- ? omp_block_stride * (t + 1)
- : ((fast_sint_t)n >> 1);
- fast_sint_t count = ((fast_sint_t)m + omp_block_end -
- thread_state[t].state.count);
+ for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0;
+ --t) {
+ fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1)
+ : ((fast_sint_t)n >> 1);
+ fast_sint_t count =
+ ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);
if (count > 0) {
position -= count;
@@ -8818,24 +8247,23 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
}
static sa_sint_t libsais_compact_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t fs, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(
- T, SA, m, threads, thread_state);
- libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(
- SA, n, m, fs, f, threads, thread_state);
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t f =
+ libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
+ libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads,
+ thread_state);
return f;
}
-static void libsais_merge_unique_lms_suffixes_32s(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
+ sa_sint_t n, sa_sint_t m, fast_sint_t l,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
- const sa_sint_t * RESTRICT SAnm =
- &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
sa_sint_t i, j;
fast_sint_t tmp = *SAnm++;
@@ -8885,18 +8313,17 @@ static void libsais_merge_unique_lms_suffixes_32s(
}
}
-static void libsais_merge_nonunique_lms_suffixes_32s(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l,
- fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
+static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t m, fast_sint_t l,
+ fast_sint_t omp_block_start,
+ fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
- const sa_sint_t * RESTRICT SAnm =
- &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
+ const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];
fast_sint_t i, j;
sa_sint_t tmp = *SAnm++;
- for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j;
- i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) {
libsais_prefetch(&SA[i + prefetch_distance]);
if (SA[i + 0] == 0) {
@@ -8926,8 +8353,8 @@ static void libsais_merge_nonunique_lms_suffixes_32s(
}
static void libsais_merge_unique_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
#endif
@@ -8944,20 +8371,17 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
if (omp_num_threads == 1) {
- libsais_merge_unique_lms_suffixes_32s(
- T, SA, n, m, 0, omp_block_start, omp_block_size);
+ libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.count =
- libsais_count_negative_marked_suffixes(T, omp_block_start,
- omp_block_size);
+ libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -8968,8 +8392,8 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
count += thread_state[t].state.count;
}
- libsais_merge_unique_lms_suffixes_32s(
- T, SA, n, m, count, omp_block_start, omp_block_size);
+ libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -8977,8 +8401,8 @@ static void libsais_merge_unique_lms_suffixes_32s_omp(
}
static void libsais_merge_nonunique_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && m >= 65536)
#endif
@@ -8995,20 +8419,17 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : m - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;
if (omp_num_threads == 1) {
- libsais_merge_nonunique_lms_suffixes_32s(
- SA, n, m, f, omp_block_start, omp_block_size);
+ libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
}
#if defined(_OPENMP)
else {
{
thread_state[omp_thread_num].state.count =
- libsais_count_zero_marked_suffixes(SA, omp_block_start,
- omp_block_size);
+ libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -9019,8 +8440,8 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
count += thread_state[t].state.count;
}
- libsais_merge_nonunique_lms_suffixes_32s(
- SA, n, m, count, omp_block_start, omp_block_size);
+ libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start,
+ omp_block_size);
}
}
#endif
@@ -9028,112 +8449,96 @@ static void libsais_merge_nonunique_lms_suffixes_32s_omp(
}
static void libsais_merge_compacted_lms_suffixes_32s_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t f, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
- libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads,
- thread_state);
- libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads,
- thread_state);
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
+ libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
}
static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
- sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets,
- sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m,
+ sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (f > 0) {
memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
- libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(
- T, SA, n, k, buckets, threads, thread_state);
+ libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, threads,
+ thread_state);
libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
- memcpy(&SA[n - m - 1 + f], &SA[0],
- ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+ memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
- libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
- thread_state);
+ libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
} else {
- libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0,
- n);
+ libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
}
}
static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m,
- sa_sint_t fs, sa_sint_t f, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs,
+ sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
if (f > 0) {
memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));
libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);
- memcpy(&SA[n - m - 1 + f], &SA[0],
- ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
+ memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));
- libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads,
- thread_state);
+ libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
} else {
libsais_gather_lms_suffixes_32s(T, SA, n);
libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
}
}
-static sa_sint_t libsais_main_32s(
- sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
- sa_sint_t fs, sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
+ sa_sint_t k, sa_sint_t fs, sa_sint_t threads,
+ LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
if (k > 0 && fs / k >= 6) {
sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16;
sa_sint_t * RESTRICT buckets =
(fs - alignment) / k >= 6
- ? (sa_sint_t *)libsais_align_up(
- &SA[n + fs - 6 * k - alignment],
- (size_t)alignment * sizeof(sa_sint_t))
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * k - alignment],
+ (size_t)alignment * sizeof(sa_sint_t))
: &SA[n + fs - 6 * k];
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(
- T, SA, n, k, buckets, threads, thread_state);
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets,
+ threads, thread_state);
if (m > 1) {
memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
sa_sint_t first_lms_suffix = SA[n - m];
sa_sint_t left_suffixes_count =
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(
- T, k, buckets, first_lms_suffix);
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets,
+ first_lms_suffix);
- libsais_radix_sort_lms_suffixes_32s_6k_omp(
- T, SA, n, m, &buckets[4 * k], threads, thread_state);
- libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k],
- threads);
+ libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * k], threads,
+ thread_state);
+ libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * k], threads);
if (threads > 1 && n >= 65536) {
- memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
- (size_t)m * sizeof(sa_sint_t));
+ memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
}
- libsais_initialize_buckets_for_partial_sorting_32s_6k(
- T, k, buckets, first_lms_suffix, left_suffixes_count);
- libsais_induce_partial_order_32s_6k_omp(
- T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count,
- threads, thread_state);
+ libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix,
+ left_suffixes_count);
+ libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix,
+ left_suffixes_count, threads, thread_state);
- sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
- SA, n, m, threads, thread_state);
+ sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+ SA, n, m, threads, thread_state);
if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
- T, SA, n, m, fs, threads, thread_state);
+ sa_sint_t f =
+ libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
- fs + n - 2 * m + f, threads,
- thread_state) != 0) {
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+ threads, thread_state) != 0) {
return -2;
}
@@ -9145,15 +8550,13 @@ static sa_sint_t libsais_main_32s(
libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
- thread_state);
+ libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
} else {
SA[0] = SA[n - 1];
libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads,
- thread_state);
+ libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
}
return 0;
@@ -9161,37 +8564,31 @@ static sa_sint_t libsais_main_32s(
sa_sint_t alignment = (fs - 1024) / k >= 4 ? 1024 : 16;
sa_sint_t * RESTRICT buckets =
(fs - alignment) / k >= 4
- ? (sa_sint_t *)libsais_align_up(
- &SA[n + fs - 4 * k - alignment],
- (size_t)alignment * sizeof(sa_sint_t))
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * k - alignment],
+ (size_t)alignment * sizeof(sa_sint_t))
: &SA[n + fs - 4 * k];
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
- T, SA, n, k, buckets, threads, thread_state);
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
+ threads, thread_state);
if (m > 1) {
- libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(
- T, k, buckets, SA[n - m]);
+ libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets,
+ SA[n - m]);
- libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
- threads, thread_state);
- libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1],
- threads);
+ libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
+ thread_state);
+ libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);
- libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1,
- buckets);
- libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets,
- threads, thread_state);
+ libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
+ libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);
- sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
- SA, n, m, threads, thread_state);
+ sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(
+ SA, n, m, threads, thread_state);
if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
- T, SA, n, m, fs, threads, thread_state);
+ sa_sint_t f =
+ libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
- fs + n - 2 * m + f, threads,
- thread_state) != 0) {
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+ threads, thread_state) != 0) {
return -2;
}
@@ -9206,44 +8603,37 @@ static sa_sint_t libsais_main_32s(
libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
- libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads,
- thread_state);
+ libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
return 0;
} else if (k > 0 && fs / k >= 2) {
sa_sint_t alignment = (fs - 1024) / k >= 2 ? 1024 : 16;
sa_sint_t * RESTRICT buckets =
(fs - alignment) / k >= 2
- ? (sa_sint_t *)libsais_align_up(
- &SA[n + fs - 2 * k - alignment],
- (size_t)alignment * sizeof(sa_sint_t))
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * k - alignment],
+ (size_t)alignment * sizeof(sa_sint_t))
: &SA[n + fs - 2 * k];
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(
- T, SA, n, k, buckets, threads, thread_state);
+ sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets,
+ threads, thread_state);
if (m > 1) {
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(
- T, k, buckets, SA[n - m]);
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);
- libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1],
- threads, thread_state);
- libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1,
- buckets);
+ libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads,
+ thread_state);
+ libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);
libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
- libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets,
- threads, thread_state);
+ libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);
sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
- T, SA, n, m, threads);
+ libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
if (names < m) {
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
- T, SA, n, m, fs, threads, thread_state);
+ sa_sint_t f =
+ libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
- fs + n - 2 * m + f, threads,
- thread_state) != 0) {
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+ threads, thread_state) != 0) {
return -2;
}
@@ -9260,20 +8650,19 @@ static sa_sint_t libsais_main_32s(
libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);
libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
- libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads,
- thread_state);
+ libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);
return 0;
} else {
- sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned(
- (size_t)k * sizeof(sa_sint_t), 4096)
- : (sa_sint_t *)NULL;
+ sa_sint_t * buffer =
+ fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096)
+ : (sa_sint_t *)NULL;
sa_sint_t alignment = fs - 1024 >= k ? 1024 : 16;
sa_sint_t * RESTRICT buckets =
- fs - alignment >= k ? (sa_sint_t *)libsais_align_up(
- &SA[n + fs - k - alignment],
- (size_t)alignment * sizeof(sa_sint_t))
+ fs - alignment >= k
+ ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment],
+ (size_t)alignment * sizeof(sa_sint_t))
: fs >= k ? &SA[n + fs - k]
: buffer;
@@ -9288,33 +8677,30 @@ static sa_sint_t libsais_main_32s(
sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
if (m > 1) {
- libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets,
- threads, thread_state);
+ libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);
sa_sint_t names =
- libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(
- T, SA, n, m, threads);
+ libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
if (names < m) {
if (buffer != NULL) {
libsais_free_aligned(buffer);
buckets = NULL;
}
- sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(
- T, SA, n, m, fs, threads, thread_state);
+ sa_sint_t f =
+ libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);
- if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f,
- fs + n - 2 * m + f, threads,
- thread_state) != 0) {
+ if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f,
+ threads, thread_state) != 0) {
return -2;
}
- libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(
- T, SA, n, m, fs, f, threads, thread_state);
+ libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads,
+ thread_state);
if (buckets == NULL) {
- buckets = buffer = (sa_sint_t *)libsais_alloc_aligned(
- (size_t)k * sizeof(sa_sint_t), 4096);
+ buckets = buffer =
+ (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096);
}
if (buckets == NULL) {
return -2;
@@ -9326,8 +8712,7 @@ static sa_sint_t libsais_main_32s(
libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
}
- libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads,
- thread_state);
+ libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
libsais_free_aligned(buffer);
return 0;
@@ -9335,45 +8720,39 @@ static sa_sint_t libsais_main_32s(
}
static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
- sa_sint_t * RESTRICT buckets, sa_sint_t bwt,
- sa_sint_t r, sa_sint_t * RESTRICT I,
- sa_sint_t fs, sa_sint_t * freq,
- sa_sint_t threads,
- LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
+ sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r,
+ sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq,
+ sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);
- sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(
- T, SA, n, buckets, threads, thread_state);
+ sa_sint_t m =
+ libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state);
libsais_initialize_buckets_start_and_end_8u(buckets, freq);
if (m > 0) {
sa_sint_t first_lms_suffix = SA[n - m];
sa_sint_t left_suffixes_count =
- libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(
- T, buckets, first_lms_suffix);
+ libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix);
if (threads > 1 && n >= 65536) {
memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));
}
- libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads,
- thread_state);
+ libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, buckets, threads, thread_state);
if (threads > 1 && n >= 65536) {
- memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0,
- (size_t)m * sizeof(sa_sint_t));
+ memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t));
}
- libsais_initialize_buckets_for_partial_sorting_8u(
- T, buckets, first_lms_suffix, left_suffixes_count);
+ libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix,
+ left_suffixes_count);
libsais_induce_partial_order_8u_omp(T, SA, n, buckets, first_lms_suffix,
- left_suffixes_count, threads,
- thread_state);
+ left_suffixes_count, threads, thread_state);
- sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_8u_omp(
- SA, n, m, fs, threads, thread_state);
+ sa_sint_t names =
+ libsais_renumber_and_gather_lms_suffixes_8u_omp(SA, n, m, fs, threads, thread_state);
if (names < m) {
- if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m,
- threads, thread_state) != 0) {
+ if (libsais_main_32s(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads,
+ thread_state) != 0) {
return -2;
}
@@ -9386,23 +8765,20 @@ static sa_sint_t libsais_main_8u(const u8 * T, sa_sint_t * SA, sa_sint_t n,
memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
}
- return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets,
- threads, thread_state);
+ return libsais_induce_final_order_8u_omp(T, SA, n, bwt, r, I, buckets, threads, thread_state);
}
-static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n,
- sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
- sa_sint_t fs, sa_sint_t * freq,
- sa_sint_t threads) {
+static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r,
+ sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) {
LIBSAIS_THREAD_STATE * RESTRICT thread_state =
threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
- sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned(
- 8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
+ sa_sint_t * RESTRICT buckets =
+ (sa_sint_t *)libsais_alloc_aligned(8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
- sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
- ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs,
- freq, threads, thread_state)
- : -2;
+ sa_sint_t index =
+ buckets != NULL && (thread_state != NULL || threads == 1)
+ ? libsais_main_8u(T, SA, n, buckets, bwt, r, I, fs, freq, threads, thread_state)
+ : -2;
libsais_free_aligned(buckets);
libsais_free_thread_state(thread_state);
@@ -9410,34 +8786,30 @@ static sa_sint_t libsais_main(const u8 * T, sa_sint_t * SA, sa_sint_t n,
return index;
}
-static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n,
- sa_sint_t k, sa_sint_t fs, sa_sint_t threads) {
+static s32 libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs,
+ sa_sint_t threads) {
LIBSAIS_THREAD_STATE * RESTRICT thread_state =
threads > 1 ? libsais_alloc_thread_state(threads) : NULL;
- sa_sint_t index =
- thread_state != NULL || threads == 1
- ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
- : -2;
+ sa_sint_t index = thread_state != NULL || threads == 1
+ ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state)
+ : -2;
libsais_free_thread_state(thread_state);
return index;
}
-static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T,
- sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt,
- sa_sint_t r, sa_sint_t * I, sa_sint_t fs,
- sa_sint_t * freq) {
- return ctx != NULL && (ctx->buckets != NULL &&
- (ctx->thread_state != NULL || ctx->threads == 1))
+static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const u8 * T, sa_sint_t * SA,
+ sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I,
+ sa_sint_t fs, sa_sint_t * freq) {
+ return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
? libsais_main_8u(T, SA, n, ctx->buckets, bwt, r, I, fs, freq,
(sa_sint_t)ctx->threads, ctx->thread_state)
: -2;
}
-static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
- sa_sint_t n) {
+static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
@@ -9461,8 +8833,8 @@ static void libsais_bwt_copy_8u(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
#if defined(_OPENMP)
-static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
- sa_sint_t n, sa_sint_t threads) {
+static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n,
+ sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
#endif
@@ -9470,8 +8842,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
#if defined(_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
- fast_sint_t omp_block_stride =
- ((fast_sint_t)n / omp_num_threads) & (-16);
+ fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
? omp_block_stride
@@ -9483,8 +8854,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
fast_sint_t omp_block_size = (fast_sint_t)n;
#endif
- libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start,
- (sa_sint_t)omp_block_size);
+ libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size);
}
}
@@ -9492,9 +8862,7 @@ static void libsais_bwt_copy_8u_omp(u8 * RESTRICT U, sa_sint_t * RESTRICT A,
void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); }
-void libsais_free_ctx(void * ctx) {
- libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx);
-}
+void libsais_free_ctx(void * ctx) { libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); }
s32 libsais(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
@@ -9528,8 +8896,7 @@ s32 libsais_int(s32 * T, s32 * SA, s32 n, s32 k, s32 fs) {
return libsais_main_int(T, SA, n, k, fs, 1);
}
-s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
- s32 * freq) {
+s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq) {
if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) {
return -1;
} else if (n < 2) {
@@ -9545,8 +8912,7 @@ s32 libsais_ctx(const void * ctx, const u8 * T, s32 * SA, s32 n, s32 fs,
return 0;
}
- return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL,
- fs, freq);
+ return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, 0, 0, NULL, fs, freq);
}
s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
@@ -9577,10 +8943,9 @@ s32 libsais_bwt(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
return index;
}
-s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
- s32 r, s32 * I) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
- (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I) {
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+ ((r & (r - 1)) != 0) || (I == NULL)) {
return -1;
} else if (n <= 1) {
if (freq != NULL) {
@@ -9607,10 +8972,8 @@ s32 libsais_bwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
return 0;
}
-s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- s32 fs, s32 * freq) {
- if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
- (fs < 0)) {
+s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq) {
+ if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) {
return -1;
} else if (n <= 1) {
if (freq != NULL) {
@@ -9625,20 +8988,17 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
return n;
}
- sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1,
- 0, NULL, fs, freq);
+ sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq);
if (index >= 0) {
index++;
U[0] = T[n - 1];
#if defined(_OPENMP)
- libsais_bwt_copy_8u_omp(
- U + 1, A, index - 1,
- (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
- libsais_bwt_copy_8u_omp(
- U + index, A + index, n - index,
- (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+ libsais_bwt_copy_8u_omp(U + 1, A, index - 1,
+ (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+ libsais_bwt_copy_8u_omp(U + index, A + index, n - index,
+ (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
#else
libsais_bwt_copy_8u(U + 1, A, index - 1);
libsais_bwt_copy_8u(U + index, A + index, n - index);
@@ -9648,10 +9008,10 @@ s32 libsais_bwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
return index;
}
-s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- s32 fs, s32 * freq, s32 r, s32 * I) {
- if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
- (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
+s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
+ s32 r, s32 * I) {
+ if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
+ (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) {
return -1;
} else if (n <= 1) {
if (freq != NULL) {
@@ -9667,16 +9027,14 @@ s32 libsais_bwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
return 0;
}
- if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs,
- freq) != 0) {
+ if (libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, r, I, fs, freq) != 0) {
return -2;
}
U[0] = T[n - 1];
#if defined(_OPENMP)
- libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1,
- (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
+ libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0],
(sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads);
#else
@@ -9698,8 +9056,7 @@ void * libsais_create_ctx_omp(s32 threads) {
return (void *)libsais_create_ctx_main(threads);
}
-s32 libsais_omp(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq,
- s32 threads) {
+s32 libsais_omp(const u8 * T, s32 * SA, s32 n, s32 fs, s32 * freq, s32 threads) {
if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
return -1;
} else if (n < 2) {
@@ -9735,10 +9092,8 @@ s32 libsais_int_omp(s32 * T, s32 * SA, s32 n, s32 k, s32 fs, s32 threads) {
return libsais_main_int(T, SA, n, k, fs, threads);
}
-s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
- s32 threads) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
- (threads < 0)) {
+s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 threads) {
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) {
return -1;
} else if (n <= 1) {
if (freq != NULL) {
@@ -9767,10 +9122,10 @@ s32 libsais_bwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq,
return index;
}
-s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs,
- s32 * freq, s32 r, s32 * I, s32 threads) {
- if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) ||
- (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
+s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs, s32 * freq, s32 r, s32 * I,
+ s32 threads) {
+ if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) ||
+ ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) {
return -1;
} else if (n <= 1) {
if (freq != NULL) {
@@ -9801,26 +9156,21 @@ s32 libsais_bwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, s32 fs,
#endif
-static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(
- sa_sint_t threads) {
+static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) {
LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx =
- (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(
- sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
- sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
- ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
- u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(
- (1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
+ (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64);
+ sa_uint_t * RESTRICT bucket2 =
+ (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+ u16 * RESTRICT fastbits =
+ (u16 *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(u16), 4096);
sa_uint_t * RESTRICT buckets =
- threads > 1
- ? (sa_uint_t *)libsais_alloc_aligned(
- (size_t)threads *
- (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
- sizeof(sa_uint_t),
- 4096)
- : NULL;
+ threads > 1 ? (sa_uint_t *)libsais_alloc_aligned(
+ (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+ sizeof(sa_uint_t),
+ 4096)
+ : NULL;
- if (ctx != NULL && bucket2 != NULL && fastbits != NULL &&
- (buckets != NULL || threads == 1)) {
+ if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) {
ctx->bucket2 = bucket2;
ctx->fastbits = fastbits;
ctx->buckets = buckets;
@@ -9846,8 +9196,7 @@ static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) {
}
}
-static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T,
- fast_sint_t n,
+static void libsais_unbwt_compute_histogram(const u8 * RESTRICT T, fast_sint_t n,
sa_uint_t * RESTRICT count) {
const fast_sint_t prefetch_distance = 256;
@@ -10107,9 +9456,10 @@ static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2) {
}
}
-static void libsais_unbwt_compute_bigram_histogram_single(
- const u8 * RESTRICT T, sa_uint_t * RESTRICT bucket1,
- sa_uint_t * RESTRICT bucket2, fast_uint_t index) {
+static void libsais_unbwt_compute_bigram_histogram_single(const u8 * RESTRICT T,
+ sa_uint_t * RESTRICT bucket1,
+ sa_uint_t * RESTRICT bucket2,
+ fast_uint_t index) {
fast_uint_t sum, c;
for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
fast_uint_t prev = sum;
@@ -10123,8 +9473,7 @@ static void libsais_unbwt_compute_bigram_histogram_single(
if (sum < hi) {
hi = sum;
}
- libsais_unbwt_compute_histogram(
- &T[prev], (fast_sint_t)(hi - prev), bucket2_p);
+ libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
}
{
@@ -10132,8 +9481,7 @@ static void libsais_unbwt_compute_bigram_histogram_single(
if (prev > lo) {
lo = prev;
}
- libsais_unbwt_compute_histogram(
- &T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
+ libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
}
}
}
@@ -10141,10 +9489,8 @@ static void libsais_unbwt_compute_bigram_histogram_single(
libsais_unbwt_transpose_bucket2(bucket2);
}
-static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits,
- fast_uint_t lastc,
- fast_uint_t shift) {
+static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t lastc, fast_uint_t shift) {
fast_uint_t v, w, sum, c, d;
for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c) {
if (c == lastc) {
@@ -10164,10 +9510,10 @@ static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2,
}
}
-static void libsais_unbwt_calculate_biPSI(
- const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1,
- sa_uint_t * RESTRICT bucket2, fast_uint_t index,
- fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
+static void libsais_unbwt_calculate_biPSI(const u8 * RESTRICT T, sa_uint_t * RESTRICT P,
+ sa_uint_t * RESTRICT bucket1,
+ sa_uint_t * RESTRICT bucket2, fast_uint_t index,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_end) {
{
fast_sint_t i = omp_block_start, j = (fast_sint_t)index;
if (omp_block_end < j) {
@@ -10180,9 +9526,7 @@ static void libsais_unbwt_calculate_biPSI(
if (t != 0) {
fast_uint_t w =
- (((fast_uint_t)
- T[p + (fast_uint_t)(t >>
- ((sizeof(fast_sint_t) * 8) - 1))])
+ (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
<< 8) +
c;
P[bucket2[w]++] = (sa_uint_t)i;
@@ -10202,9 +9546,7 @@ static void libsais_unbwt_calculate_biPSI(
if (t != 0) {
fast_uint_t w =
- (((fast_uint_t)
- T[p + (fast_uint_t)(t >>
- ((sizeof(fast_sint_t) * 8) - 1))])
+ (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
<< 8) +
c;
P[bucket2[w]++] = (sa_uint_t)i;
@@ -10213,12 +9555,9 @@ static void libsais_unbwt_calculate_biPSI(
}
}
-static void libsais_unbwt_init_single(const u8 * RESTRICT T,
- sa_uint_t * RESTRICT P, sa_sint_t n,
- const sa_sint_t * freq,
- const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits) {
+static void libsais_unbwt_init_single(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+ const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits) {
sa_uint_t bucket1[ALPHABET_SIZE];
fast_uint_t index = I[0];
@@ -10246,8 +9585,7 @@ static void libsais_unbwt_init_single(const u8 * RESTRICT T,
static void libsais_unbwt_compute_bigram_histogram_parallel(
const u8 * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1,
- sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+ sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
fast_sint_t i;
for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) {
fast_uint_t c = T[i];
@@ -10256,21 +9594,17 @@ static void libsais_unbwt_compute_bigram_histogram_parallel(
if (t != 0) {
fast_uint_t w =
- (((fast_uint_t)
- T[p +
- (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))])
- << 8) +
+ (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) +
c;
bucket2[w]++;
}
}
}
-static void libsais_unbwt_init_parallel(
- const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
- const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
- sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
+static void libsais_unbwt_init_parallel(const u8 * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n,
+ const sa_sint_t * freq, const sa_uint_t * RESTRICT I,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
sa_uint_t bucket1[ALPHABET_SIZE];
fast_uint_t index = I[0];
@@ -10292,20 +9626,17 @@ static void libsais_unbwt_init_parallel(
libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
} else {
sa_uint_t * RESTRICT bucket1_local =
- buckets + omp_thread_num *
- (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+ buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
{
memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
- libsais_unbwt_compute_histogram(T + omp_block_start,
- omp_block_size, bucket1_local);
+ libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
}
#pragma omp barrier
@@ -10317,8 +9648,7 @@ static void libsais_unbwt_init_parallel(
fast_sint_t t;
for (t = 0; t < omp_num_threads;
- ++t, bucket1_temp +=
- ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+ ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
fast_sint_t c;
for (c = 0; c < ALPHABET_SIZE; c += 1) {
sa_uint_t A = bucket1[c], B = bucket1_temp[c];
@@ -10347,11 +9677,9 @@ static void libsais_unbwt_init_parallel(
bucket1_local[c] = A + B;
}
- memset(bucket2_local, 0,
- ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
+ memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
libsais_unbwt_compute_bigram_histogram_parallel(
- T, index, bucket1_local, bucket2_local, omp_block_start,
- omp_block_size);
+ T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
}
#pragma omp barrier
@@ -10359,8 +9687,7 @@ static void libsais_unbwt_init_parallel(
{
fast_sint_t omp_bucket2_stride =
((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
- fast_sint_t omp_bucket2_start =
- omp_thread_num * omp_bucket2_stride;
+ fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride;
fast_sint_t omp_bucket2_size =
omp_thread_num < omp_num_threads - 1
? omp_bucket2_stride
@@ -10370,11 +9697,9 @@ static void libsais_unbwt_init_parallel(
fast_sint_t t;
for (t = 0; t < omp_num_threads;
- ++t, bucket2_temp +=
- ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
+ ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) {
fast_sint_t c;
- for (c = omp_bucket2_start;
- c < omp_bucket2_start + omp_bucket2_size; c += 1) {
+ for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) {
sa_uint_t A = bucket2[c], B = bucket2_temp[c];
bucket2[c] = A + B;
bucket2_temp[c] = A;
@@ -10386,21 +9711,17 @@ static void libsais_unbwt_init_parallel(
#pragma omp master
{
- libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc,
- shift);
+ libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
{
fast_sint_t t;
for (t = omp_num_threads - 1; t >= 1; --t) {
sa_uint_t * RESTRICT dst_bucket1 =
- buckets + t * (ALPHABET_SIZE +
- (ALPHABET_SIZE * ALPHABET_SIZE));
+ buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
sa_uint_t * RESTRICT src_bucket1 =
- dst_bucket1 -
- (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
+ dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
- memcpy(dst_bucket1, src_bucket1,
- ALPHABET_SIZE * sizeof(sa_uint_t));
+ memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
}
memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
@@ -10416,9 +9737,8 @@ static void libsais_unbwt_init_parallel(
bucket2_local[c] = A + B;
}
- libsais_unbwt_calculate_biPSI(
- T, P, bucket1_local, bucket2_local, index, omp_block_start,
- omp_block_start + omp_block_size);
+ libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index,
+ omp_block_start, omp_block_start + omp_block_size);
}
#pragma omp barrier
@@ -10428,8 +9748,7 @@ static void libsais_unbwt_init_parallel(
memcpy(
bucket2,
buckets + ALPHABET_SIZE +
- (omp_num_threads - 1) *
- (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
+ (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)),
ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
}
}
@@ -10439,9 +9758,8 @@ static void libsais_unbwt_init_parallel(
#endif
static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t * i0, fast_uint_t k) {
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
fast_uint_t i, p0 = *i0;
@@ -10461,9 +9779,8 @@ static void libsais_unbwt_decode_1(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
}
static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t r, fast_uint_t * i0,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
fast_uint_t * i1, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
@@ -10494,11 +9811,9 @@ static void libsais_unbwt_decode_2(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
}
static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2,
- fast_uint_t k) {
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10538,11 +9853,10 @@ static void libsais_unbwt_decode_3(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
}
static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2,
- fast_uint_t * i3, fast_uint_t k) {
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+ fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10592,12 +9906,10 @@ static void libsais_unbwt_decode_4(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
}
static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2,
- fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t k) {
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+ fast_uint_t * i4, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10657,12 +9969,10 @@ static void libsais_unbwt_decode_5(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
}
static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift,
- fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2,
- fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t k) {
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+ fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10731,11 +10041,12 @@ static void libsais_unbwt_decode_6(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
*i5 = p5;
}
-static void libsais_unbwt_decode_7(
- u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) {
+static void libsais_unbwt_decode_7(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+ fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
+ fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10744,8 +10055,7 @@ static void libsais_unbwt_decode_7(
u16 * RESTRICT U5 = (u16 *)(void *)(((u8 *)U4) + r);
u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
- p6 = *i6;
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;
for (i = 0; i != k; ++i) {
u16 c0 = fastbits[p0 >> shift];
@@ -10815,11 +10125,12 @@ static void libsais_unbwt_decode_7(
*i6 = p6;
}
-static void libsais_unbwt_decode_8(
- u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
- fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4,
- fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) {
+static void libsais_unbwt_decode_8(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
+ fast_uint_t shift, fast_uint_t r, fast_uint_t * i0,
+ fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3,
+ fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6,
+ fast_uint_t * i7, fast_uint_t k) {
u16 * RESTRICT U0 = (u16 *)(void *)U;
u16 * RESTRICT U1 = (u16 *)(void *)(((u8 *)U0) + r);
u16 * RESTRICT U2 = (u16 *)(void *)(((u8 *)U1) + r);
@@ -10829,8 +10140,7 @@ static void libsais_unbwt_decode_8(
u16 * RESTRICT U6 = (u16 *)(void *)(((u8 *)U5) + r);
u16 * RESTRICT U7 = (u16 *)(void *)(((u8 *)U6) + r);
- fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5,
- p6 = *i6, p7 = *i7;
+ fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;
for (i = 0; i != k; ++i) {
u16 c0 = fastbits[p0 >> shift];
@@ -10909,10 +10219,8 @@ static void libsais_unbwt_decode_8(
*i7 = p7;
}
-static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
- sa_sint_t n, sa_sint_t r,
- const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2,
+static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r,
+ const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
u16 * RESTRICT fastbits, fast_sint_t blocks,
fast_uint_t reminder) {
fast_uint_t shift = 0;
@@ -10922,11 +10230,10 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
fast_uint_t offset = 0;
while (blocks > 8) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
- i5 = I[5], i6 = I[6], i7 = I[7];
- libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, &i7, (fast_uint_t)r >> 1);
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
+ i7 = I[7];
+ libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
I += 8;
blocks -= 8;
offset += 8 * (fast_uint_t)r;
@@ -10934,86 +10241,69 @@ static void libsais_unbwt_decode(u8 * RESTRICT U, sa_uint_t * RESTRICT P,
if (blocks == 1) {
fast_uint_t i0 = I[0];
- libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0,
- reminder >> 1);
+ libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, reminder >> 1);
} else if (blocks == 2) {
fast_uint_t i0 = I[0], i1 = I[1];
- libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, reminder >> 1);
- libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, &i0,
+ libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ reminder >> 1);
+ libsais_unbwt_decode_1(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift, &i0,
((fast_uint_t)r >> 1) - (reminder >> 1));
} else if (blocks == 3) {
fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
- libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, reminder >> 1);
- libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1,
- ((fast_uint_t)r >> 1) - (reminder >> 1));
+ libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, reminder >> 1);
+ libsais_unbwt_decode_2(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+ (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (reminder >> 1));
} else if (blocks == 4) {
fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
- libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, &i3,
- reminder >> 1);
- libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
+ libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, reminder >> 1);
+ libsais_unbwt_decode_3(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+ (fast_uint_t)r, &i0, &i1, &i2,
((fast_uint_t)r >> 1) - (reminder >> 1));
} else if (blocks == 5) {
fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
- libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
- reminder >> 1);
- libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
- &i3, ((fast_uint_t)r >> 1) - (reminder >> 1));
+ libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, &i4, reminder >> 1);
+ libsais_unbwt_decode_4(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+ (fast_uint_t)r, &i0, &i1, &i2, &i3,
+ ((fast_uint_t)r >> 1) - (reminder >> 1));
} else if (blocks == 6) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
- i5 = I[5];
- libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- reminder >> 1);
- libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
- &i3, &i4,
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
+ libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, &i4, &i5, reminder >> 1);
+ libsais_unbwt_decode_5(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+ (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4,
((fast_uint_t)r >> 1) - (reminder >> 1));
} else if (blocks == 7) {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
- i5 = I[5], i6 = I[6];
- libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift,
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
+ libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, &i4, &i5, &i6, reminder >> 1);
+ libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
(fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, reminder >> 1);
- libsais_unbwt_decode_6(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
- &i3, &i4, &i5,
((fast_uint_t)r >> 1) - (reminder >> 1));
} else {
- fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4],
- i5 = I[5], i6 = I[6], i7 = I[7];
- libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift,
- (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5,
- &i6, &i7, reminder >> 1);
- libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2,
- fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2,
- &i3, &i4, &i5, &i6,
+ fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6],
+ i7 = I[7];
+ libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1,
+ &i2, &i3, &i4, &i5, &i6, &i7, reminder >> 1);
+ libsais_unbwt_decode_7(U + offset + 2 * (reminder >> 1), P, bucket2, fastbits, shift,
+ (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6,
((fast_uint_t)r >> 1) - (reminder >> 1));
}
}
-static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U,
- sa_uint_t * RESTRICT P, sa_sint_t n,
- sa_sint_t r, const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2,
- u16 * RESTRICT fastbits,
+static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+ sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I,
+ sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
sa_sint_t threads) {
fast_uint_t lastc = T[0];
fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
- fast_uint_t reminder =
- (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
+ fast_uint_t reminder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));
#if defined(_OPENMP)
fast_sint_t max_threads = blocks < threads ? blocks : threads;
- #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && \
- n >= 65536)
+ #pragma omp parallel num_threads(max_threads) if (max_threads > 1 && n >= 65536)
#endif
{
#if defined(_OPENMP)
@@ -11028,31 +10318,27 @@ static void libsais_unbwt_decode_omp(const u8 * RESTRICT T, u8 * RESTRICT U,
fast_sint_t omp_block_stride = blocks / omp_num_threads;
fast_sint_t omp_block_reminder = blocks % omp_num_threads;
- fast_sint_t omp_block_size =
- omp_block_stride + (omp_thread_num < omp_block_reminder);
+ fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_reminder);
fast_sint_t omp_block_start =
omp_block_stride * omp_thread_num +
- (omp_thread_num < omp_block_reminder ? omp_thread_num
- : omp_block_reminder);
+ (omp_thread_num < omp_block_reminder ? omp_thread_num : omp_block_reminder);
- libsais_unbwt_decode(
- U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
- fastbits, omp_block_size,
- omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
+ libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2,
+ fastbits, omp_block_size,
+ omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : reminder);
}
U[n - 1] = (u8)lastc;
}
-static sa_sint_t libsais_unbwt_core(
- const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n,
- const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I,
- sa_uint_t * RESTRICT bucket2, u16 * RESTRICT fastbits,
- sa_uint_t * RESTRICT buckets, sa_sint_t threads) {
+static sa_sint_t libsais_unbwt_core(const u8 * RESTRICT T, u8 * RESTRICT U, sa_uint_t * RESTRICT P,
+ sa_sint_t n, const sa_sint_t * freq, sa_sint_t r,
+ const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2,
+ u16 * RESTRICT fastbits, sa_uint_t * RESTRICT buckets,
+ sa_sint_t threads) {
#if defined(_OPENMP)
if (threads > 1 && n >= 262144) {
- libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits,
- buckets, threads);
+ libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
} else
#else
UNUSED(buckets);
@@ -11065,33 +10351,30 @@ static sa_sint_t libsais_unbwt_core(
return 0;
}
-static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P,
- sa_sint_t n, const sa_sint_t * freq,
- sa_sint_t r, const sa_uint_t * I,
+static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P, sa_sint_t n,
+ const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I,
sa_sint_t threads) {
fast_uint_t shift = 0;
while ((n >> shift) > (1 << UNBWT_FASTBITS)) {
shift++;
}
- sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(
- ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
- u16 * RESTRICT fastbits = (u16 *)libsais_alloc_aligned(
- ((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
+ sa_uint_t * RESTRICT bucket2 =
+ (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
+ u16 * RESTRICT fastbits =
+ (u16 *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(u16), 4096);
sa_uint_t * RESTRICT buckets =
threads > 1 && n >= 262144
? (sa_uint_t *)libsais_alloc_aligned(
- (size_t)threads *
- (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
+ (size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) *
sizeof(sa_uint_t),
4096)
: NULL;
- sa_sint_t index = bucket2 != NULL && fastbits != NULL &&
- (buckets != NULL || threads == 1 || n < 262144)
- ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2,
- fastbits, buckets, threads)
- : -2;
+ sa_sint_t index =
+ bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144)
+ ? libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads)
+ : -2;
libsais_free_aligned(buckets);
libsais_free_aligned(fastbits);
@@ -11100,38 +10383,33 @@ static sa_sint_t libsais_unbwt_main(const u8 * T, u8 * U, sa_uint_t * P,
return index;
}
-static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx,
- const u8 * T, u8 * U, sa_uint_t * P,
- sa_sint_t n, const sa_sint_t * freq,
+static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const u8 * T, u8 * U,
+ sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq,
sa_sint_t r, const sa_uint_t * I) {
return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL &&
(ctx->buckets != NULL || ctx->threads == 1)
- ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2,
- ctx->fastbits, ctx->buckets,
- (sa_sint_t)ctx->threads)
+ ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits,
+ ctx->buckets, (sa_sint_t)ctx->threads)
: -2;
}
-void * libsais_unbwt_create_ctx(void) {
- return (void *)libsais_unbwt_create_ctx_main(1);
-}
+void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); }
void libsais_unbwt_free_ctx(void * ctx) {
libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx);
}
-s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
- s32 i) {
+s32 libsais_unbwt(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i) {
return libsais_unbwt_aux(T, U, A, n, freq, n, &i);
}
-s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n,
- const s32 * freq, s32 i) {
+s32 libsais_unbwt_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+ s32 i) {
return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i);
}
-s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
- s32 r, const s32 * I) {
+s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+ const s32 * I) {
if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
return -1;
@@ -11152,12 +10430,11 @@ s32 libsais_unbwt_aux(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
}
}
- return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
- (const sa_uint_t *)I, 1);
+ return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1);
}
-s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
- s32 n, const s32 * freq, s32 r, const s32 * I) {
+s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
+ s32 r, const s32 * I) {
if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) {
return -1;
@@ -11178,9 +10455,8 @@ s32 libsais_unbwt_aux_ctx(const void * ctx, const u8 * T, u8 * U, s32 * A,
}
}
- return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U,
- (sa_uint_t *)A, n, freq, r,
- (const sa_uint_t *)I);
+ return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq,
+ r, (const sa_uint_t *)I);
}
#if defined(_OPENMP)
@@ -11194,16 +10470,14 @@ void * libsais_unbwt_create_ctx_omp(s32 threads) {
return (void *)libsais_unbwt_create_ctx_main(threads);
}
-s32 libsais_unbwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq,
- s32 i, s32 threads) {
+s32 libsais_unbwt_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 i, s32 threads) {
return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads);
}
-s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n,
- const s32 * freq, s32 r, const s32 * I, s32 threads) {
+s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n, const s32 * freq, s32 r,
+ const s32 * I, s32 threads) {
if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) ||
- ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) ||
- (threads < 0)) {
+ ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) {
return -1;
} else if (n <= 1) {
if (I[0] != n) {
@@ -11223,23 +10497,20 @@ s32 libsais_unbwt_aux_omp(const u8 * T, u8 * U, s32 * A, s32 n,
}
threads = threads > 0 ? threads : omp_get_max_threads();
- return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r,
- (const sa_uint_t *)I, threads);
+ return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
}
#endif
-static void libsais_compute_phi(const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT PLCP, sa_sint_t n,
- fast_sint_t omp_block_start,
+static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
+ sa_sint_t n, fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
sa_sint_t k = omp_block_start > 0 ? SA[omp_block_start - 1] : n;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);
@@ -11263,9 +10534,8 @@ static void libsais_compute_phi(const sa_sint_t * RESTRICT SA,
}
}
-static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT PLCP, sa_sint_t n,
- sa_sint_t threads) {
+static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP,
+ sa_sint_t n, sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
#endif
@@ -11281,24 +10551,20 @@ static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA,
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
}
}
-static void libsais_compute_plcp(const u8 * RESTRICT T,
- sa_sint_t * RESTRICT PLCP, fast_sint_t n,
- fast_sint_t omp_block_start,
- fast_sint_t omp_block_size) {
+static void libsais_compute_plcp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n,
+ fast_sint_t omp_block_start, fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j, l = 0;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance;
- i < j; i += 1) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j;
+ i += 1) {
libsais_prefetch(&T[PLCP[i + prefetch_distance] + l]);
fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
@@ -11321,8 +10587,7 @@ static void libsais_compute_plcp(const u8 * RESTRICT T,
}
}
-static void libsais_compute_plcp_omp(const u8 * RESTRICT T,
- sa_sint_t * RESTRICT PLCP, sa_sint_t n,
+static void libsais_compute_plcp_omp(const u8 * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n,
sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
@@ -11339,25 +10604,21 @@ static void libsais_compute_plcp_omp(const u8 * RESTRICT T,
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
}
}
-static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP,
- const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT LCP,
- fast_sint_t omp_block_start,
+static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start,
fast_sint_t omp_block_size) {
const fast_sint_t prefetch_distance = 32;
fast_sint_t i, j;
- for (i = omp_block_start,
- j = omp_block_start + omp_block_size - prefetch_distance - 3;
- i < j; i += 4) {
+ for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j;
+ i += 4) {
libsais_prefetch(&PLCP[SA[i + prefetch_distance + 0]]);
libsais_prefetch(&PLCP[SA[i + prefetch_distance + 1]]);
@@ -11376,10 +10637,8 @@ static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP,
}
}
-static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP,
- const sa_sint_t * RESTRICT SA,
- sa_sint_t * RESTRICT LCP, sa_sint_t n,
- sa_sint_t threads) {
+static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA,
+ sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(threads) if (threads > 1 && n >= 65536)
#endif
@@ -11395,9 +10654,8 @@ static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP,
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
- fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
- ? omp_block_stride
- : n - omp_block_start;
+ fast_sint_t omp_block_size =
+ omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;
libsais_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
}
@@ -11436,10 +10694,8 @@ s32 libsais_lcp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n) {
#if defined(_OPENMP)
-s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n,
- s32 threads) {
- if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) ||
- (threads < 0)) {
+s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n, s32 threads) {
+ if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) {
return -1;
} else if (n <= 1) {
if (n == 1) {
@@ -11456,10 +10712,8 @@ s32 libsais_plcp_omp(const u8 * T, const s32 * SA, s32 * PLCP, s32 n,
return 0;
}
-s32 libsais_lcp_omp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n,
- s32 threads) {
- if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) ||
- (threads < 0)) {
+s32 libsais_lcp_omp(const s32 * PLCP, const s32 * SA, s32 * LCP, s32 n, s32 threads) {
+ if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) {
return -1;
} else if (n <= 1) {
if (n == 1) {
diff --git a/src/lzp.c b/src/lzp.c
new file mode 100644
index 0000000..e49428e
--- /dev/null
+++ b/src/lzp.c
@@ -0,0 +1,304 @@
+
+// Lempel Ziv Prediction code.
+// TODO: Move the LUT allocation out of block coding routine to save some clock cycles.
+
+#include <memory.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+
+// Marker byte. In the coded stream it is followed either by a match length (a run of 254s
+// closed by a byte below 254) or by 255, which stands for a literal MATCH byte.
+#define MATCH 0xf2
+
+// Largest value an s32 can hold; used to reject sizes that would overflow while decoding.
+#define LZP_S32_MAX 0x7fffffff
+
+// Number of independently coded sub-blocks used for an input of n bytes.
+static inline s32 num_blocks(s32 n) {
+    if (n < KiB(256)) return 1;
+    if (n < MiB(4)) return 2;
+    if (n < MiB(16)) return 4;
+    return 8;
+}
+
+// Packs the four bytes preceding p into a 32-bit context, with p[-1] in the low byte.
+// Each byte is widened to u32 before shifting so that p[-4] << 24 cannot overflow an int.
+static inline u32 lzp_context(const u8 * p) {
+    return (u32)p[-1] | ((u32)p[-2] << 8) | ((u32)p[-3] << 16) | ((u32)p[-4] << 24);
+}
+
+// Maps a context to a slot of the prediction table; mask is the table size minus one.
+static inline u32 lzp_hash(u32 ctx, u32 mask) { return (ctx >> 15 ^ ctx ^ ctx >> 3) & mask; }
+
+// Encodes the bytes in [in, in_end) into [out, out_end).
+//
+// lut is the prediction table (mask + 1 entries): it maps a hashed context to the input offset
+// at which that context was last seen, and both callers pass it zero-filled. m_len is the
+// minimum match length; shorter matches are emitted as literals.
+//
+// Returns the number of bytes written, or -1 once the output comes within 8 bytes of out_end.
+// The first four bytes are copied before any bounds check, so the caller must provide at least
+// four bytes of input and of output space.
+static s32 lzp_encode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out,
+                            u8 * out_end, s32 * restrict lut, s32 mask, s32 m_len) {
+    const u8 *ins = in, *outs = out;
+    const u8 * out_eob = out_end - 8;
+    const u8 * heur = in;
+
+    u32 ctx;
+
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;
+
+    ctx = lzp_context(in);
+
+    while (in < in_end - m_len - 32 && out < out_eob) {
+        u32 idx = lzp_hash(ctx, (u32)mask);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+        if (val > 0) {
+            const u8 * restrict ref = ins + val;
+            // Cheap filter: the last and the first four bytes of a minimal match must agree.
+            if (memcmp(in + m_len - 4, ref + m_len - 4, sizeof(u32)) == 0 &&
+                memcmp(in, ref, sizeof(u32)) == 0) {
+                // heur marks where an earlier, too short match stopped; if this candidate
+                // differs there as well, skip the full comparison.
+                if (heur > in && memcmp(heur, ref + (heur - in), sizeof(u32)) != 0)
+                    goto not_found;
+
+                // Extend four bytes at a time; memcmp avoids unaligned u32 loads.
+                s32 len = 4;
+                for (; in + len < in_end - m_len - 32; len += sizeof(u32)) {
+                    if (memcmp(in + len, ref + len, sizeof(u32)) != 0) break;
+                }
+
+                if (len < m_len) {
+                    if (heur < in + len) heur = in + len;
+                    goto not_found;
+                }
+
+                // Pick up as many as three more matching bytes past the last whole word.
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+                len += in[len] == ref[len];
+
+                in += len;
+                ctx = lzp_context(in);
+
+                *out++ = MATCH;
+
+                // The length is stored minus m_len, in chunks of 254 plus a final byte.
+                len -= m_len;
+                while (len >= 254) {
+                    len -= 254;
+                    *out++ = 254;
+                    if (out >= out_eob) break;
+                }
+
+                *out++ = len;
+            } else {
+            not_found:;
+                u8 next = *out++ = *in++;
+                ctx = ctx << 8 | next;
+                // A literal MATCH byte is escaped so the decoder does not read a length.
+                if (next == MATCH) *out++ = 255;
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+
+    ctx = lzp_context(in);
+
+    // Tail: too close to the end to match, so only literals (and escapes) are emitted.
+    while (in < in_end && out < out_eob) {
+        u32 idx = lzp_hash(ctx, (u32)mask);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(in - ins);
+
+        u8 next = *out++ = *in++;
+        ctx = ctx << 8 | next;
+        if (next == MATCH && val > 0) *out++ = 255;
+    }
+
+    return out >= out_eob ? -1 : (s32)(out - outs);
+}
+
+// Decodes the stream in [in, in_end) produced by lzp_encode_block into out, using a table of
+// 1 << hash entries and the same m_len the encoder used.
+//
+// Returns the number of bytes written, or -1 if the input is shorter than four bytes, the
+// table cannot be allocated, or the stream ends in the middle of a match token. The size of
+// out is not known here, so the caller must make it large enough for the decoded data.
+static s32 lzp_decode_block(const u8 * restrict in, const u8 * in_end, u8 * restrict out, s32 hash,
+                            s32 m_len) {
+    if (in_end - in < 4) return -1;
+
+    s32 * restrict lut = calloc((size_t)1 << hash, sizeof(s32));
+    if (!lut) return -1;
+
+    u32 mask = ((u32)1 << hash) - 1;
+    const u8 * outs = out;
+
+    for (s32 i = 0; i < 4; ++i) *out++ = *in++;
+
+    u32 ctx = lzp_context(out);
+
+    while (in < in_end) {
+        u32 idx = lzp_hash(ctx, mask);
+        s32 val = lut[idx];
+        lut[idx] = (s32)(out - outs);
+        if (*in == MATCH && val > 0) {
+            in++;
+            // A MATCH byte must be followed by an escape or a length.
+            if (in >= in_end) goto corrupt;
+            if (*in != 255) {
+                s32 len = m_len;
+                u8 part;
+                do {
+                    if (in >= in_end || len > LZP_S32_MAX - 255) goto corrupt;
+                    part = *in++;
+                    len += part;
+                } while (part == 254);
+
+                const u8 * ref = outs + val;
+                u8 * out_end = out + len;
+
+                // Byte-wise on purpose: the source may overlap the bytes being written.
+                while (out < out_end) *out++ = *ref++;
+
+                ctx = lzp_context(out);
+            } else {
+                in++;
+                ctx = (ctx << 8) | (*out++ = MATCH);
+            }
+        } else {
+            ctx = (ctx << 8) | (*out++ = *in++);
+        }
+    }
+
+    free(lut);
+
+    return (s32)(out - outs);
+
+corrupt:
+    free(lut);
+    return -1;
+}
+
+// Compresses n bytes from in, writing fewer than n bytes to out. hash is the log2 of the
+// prediction table size and m_len the minimum match length.
+//
+// Layout: out[0] holds the block count. With one block the coded data follows directly.
+// Otherwise eight header bytes per block follow (original size, then stored size, both s32 in
+// host byte order) and after them the block payloads; a block that could not be coded is
+// stored verbatim, which the decoder recognises by the two sizes being equal.
+//
+// Returns the total number of bytes written, or -1 if the input is too short or the output
+// would not fit.
+s32 lzp_compress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
+    s32 nblk = num_blocks(n);
+
+    if (nblk == 1) {
+        if (n - m_len < 32) return -1;
+
+        s32 * lut = calloc((size_t)1 << hash, sizeof(s32));
+
+        if (!lut) return -1;
+
+        s32 r =
+            lzp_encode_block(in, in + n, out + 1, out + n - 1, lut, (s32)(1 << hash) - 1, m_len);
+
+        free(lut);
+
+        if (r >= 0) {
+            out[0] = 1;
+            r++;
+        }
+
+        return r;
+    }
+
+    s32 out_ptr = 1 + 8 * nblk;
+
+    out[0] = nblk;
+    for (s32 b_id = 0; b_id < nblk; ++b_id) {
+        s32 ins = b_id * (n / nblk);
+        s32 insz = b_id != nblk - 1 ? n / nblk : n - ins;
+        s32 outsz = insz;
+        if (outsz > n - out_ptr) outsz = n - out_ptr;
+
+        s32 r;
+
+        if (insz - m_len < 32)
+            r = -1;
+        else {
+            s32 * lut = calloc((size_t)1 << hash, sizeof(s32));
+
+            if (!lut)
+                r = -1;
+            else
+                r = lzp_encode_block(in + ins, in + ins + insz, out + out_ptr,
+                                     out + out_ptr + outsz, lut, (s32)(1 << hash) - 1, m_len);
+
+            free(lut);
+        }
+
+        // Coding failed or did not pay off: store the block verbatim if it still fits.
+        if (r < 0) {
+            if (out_ptr + insz >= n) return -1;
+            r = insz;
+            memcpy(out + out_ptr, in + ins, insz);
+        }
+        memcpy(out + 1 + 8 * b_id + 0, &insz, sizeof(s32));
+        memcpy(out + 1 + 8 * b_id + 4, &r, sizeof(s32));
+
+        out_ptr += r;
+    }
+
+    return out_ptr;
+}
+
+// Reverses lzp_compress: decodes the n bytes at in into out and returns the decoded size, or
+// a negative value if the data is malformed or a table cannot be allocated. hash and m_len
+// must match the values used for compression. The capacity of out is not known here.
+s32 lzp_decompress(const u8 * in, u8 * out, s32 n, s32 hash, s32 m_len) {
+    if (n < 1) return -1;
+
+    s32 nblk = in[0];
+
+    if (nblk == 1) return lzp_decode_block(in + 1, in + n, out, hash, m_len);
+
+    // Payloads start right after the per-block headers, which must themselves fit in n bytes.
+    s32 in_ptr = 1 + 8 * nblk;
+    if (in_ptr > n) return -1;
+
+    s32 out_ptr = 0, data_size = 0;
+    for (s32 b_id = 0; b_id < nblk; ++b_id) {
+        // The header is byte-packed, so read the fields with memcpy, not a cast to s32 *.
+        s32 outsz, insz;
+        memcpy(&outsz, in + 1 + 8 * b_id + 0, sizeof(s32));
+        memcpy(&insz, in + 1 + 8 * b_id + 4, sizeof(s32));
+
+        if (insz < 0 || outsz < 0) return -1;
+        if (insz > n - in_ptr || outsz > LZP_S32_MAX - out_ptr) return -1;
+
+        s32 dec;
+        if (insz != outsz) {
+            dec = lzp_decode_block(in + in_ptr, in + in_ptr + insz, out + out_ptr, hash, m_len);
+            if (dec < 0) return dec;
+        } else {
+            // Equal sizes mark a block that was stored verbatim.
+            dec = insz;
+            memcpy(out + out_ptr, in + in_ptr, insz);
+        }
+        if (dec > LZP_S32_MAX - data_size) return -1;
+
+        data_size += dec;
+        in_ptr += insz;
+        out_ptr += outsz;
+    }
+
+    return data_size;
+}
diff --git a/src/main.c b/src/main.c
index cce43cc..2af0d42 100644
--- a/src/main.c
+++ b/src/main.c
@@ -58,8 +58,7 @@ int main(int argc, char * argv[]) {
}
if (mode == 0) {
- fprintf(stderr, "Usage: %s [-e/-d/-t] [-b block_size] input output\n",
- argv[0]);
+ fprintf(stderr, "Usage: %s [-e/-d/-t] [-b block_size] input output\n", argv[0]);
fprintf(stderr,
"If input or output are not specified, they default to stdin "
"and stdout.\n");
@@ -121,8 +120,7 @@ int main(int argc, char * argv[]) {
}
}
- struct block_encoder_state * block_encoder_state =
- new_block_encoder_state(block_size);
+ struct block_encoder_state * block_encoder_state = new_block_encoder_state(block_size);
if (block_encoder_state == NULL) {
fprintf(stderr, "Failed to create a block encoder state.\n");
@@ -131,11 +129,9 @@ int main(int argc, char * argv[]) {
if (mode == 1)
while (commit_read(block_encoder_state,
- read(input_des, get_buffer(block_encoder_state),
- block_size)) > 0) {
+ read(input_des, get_buffer(block_encoder_state), block_size)) > 0) {
if (get_last_error(block_encoder_state) != BZ3_OK) {
- fprintf(stderr, "Failed to read data: %s\n",
- str_last_error(block_encoder_state));
+ fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
return 1;
}
struct encoding_result r = encode_block(block_encoder_state);
@@ -150,8 +146,7 @@ int main(int argc, char * argv[]) {
s32 read_size;
while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
if (get_last_error(block_encoder_state) != BZ3_OK) {
- fprintf(stderr, "Failed to read data: %s\n",
- str_last_error(block_encoder_state));
+ fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
return 1;
}
struct encoding_result r = decode_block(block_encoder_state);
@@ -166,22 +161,19 @@ int main(int argc, char * argv[]) {
s32 read_size;
while ((read_size = read_block(input_des, block_encoder_state)) > 0) {
if (get_last_error(block_encoder_state) != BZ3_OK) {
- fprintf(stderr, "Failed to read data: %s\n",
- str_last_error(block_encoder_state));
+ fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
return 1;
}
decode_block(block_encoder_state);
if (get_last_error(block_encoder_state) != BZ3_OK) {
- fprintf(stderr, "Failed to decode data: %s\n",
- str_last_error(block_encoder_state));
+ fprintf(stderr, "Failed to decode data: %s\n", str_last_error(block_encoder_state));
return 1;
}
}
}
if (get_last_error(block_encoder_state) != BZ3_OK) {
- fprintf(stderr, "Failed to read data: %s\n",
- str_last_error(block_encoder_state));
+ fprintf(stderr, "Failed to read data: %s\n", str_last_error(block_encoder_state));
return 1;
}
diff --git a/src/mtf.c b/src/mtf.c
index e18f646..a2f6cf6 100644
--- a/src/mtf.c
+++ b/src/mtf.c
@@ -55,8 +55,7 @@ void mtf_decode(struct mtf_state * mtf, u8 * src, u8 * dst, u32 count) {
mtf->prev[c] = mtf->curr[c] = i;
- for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--)
- mtf->ranks[r] = mtf->ranks[r - 1];
+ for (; r > 0 && mtf->curr[mtf->ranks[r - 1]] <= i; r--) mtf->ranks[r] = mtf->ranks[r - 1];
mtf->ranks[r] = c;
}
diff --git a/src/srt.c b/src/srt.c
index c5c1e38..d292abe 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -32,7 +32,8 @@ static s32 preprocess(const u32 * freqs, u8 * symbols) {
for (u32 i = h; i < nb_symbols; i++) {
const s32 t = symbols[i];
s32 b = i - h;
- while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
+ while ((b >= 0) && (freqs[symbols[b]] < freqs[t] ||
+ (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
symbols[b + h] = symbols[b];
b -= h;
}
diff --git a/src/txt.c b/src/txt.c
index 7968733..316d04f 100644
--- a/src/txt.c
+++ b/src/txt.c
@@ -4,7 +4,7 @@
#include <math.h>
#ifdef STANDALONE
-#include <stdio.h>
+ #include <stdio.h>
#endif
int is_text(const u8 * data, s32 len) {
@@ -13,9 +13,8 @@ int is_text(const u8 * data, s32 len) {
#endif
s32 histogram[256] = { 0 };
- for(s32 i = 0; i < len; i++)
- histogram[data[i]]++;
-
+ for (s32 i = 0; i < len; i++) histogram[data[i]]++;
+
// Text criterions:
// 1. Shannon entropy is between 4.5 and 5.2.
// 2. Majority of the document must be uppercase/lowercase numbers.
@@ -24,9 +23,9 @@ int is_text(const u8 * data, s32 len) {
// Step 1
double entropy = 0;
- for(s32 i = 0; i < 256; i++) {
+ for (s32 i = 0; i < 256; i++) {
double p = (double)histogram[i] / len;
- if(p == 0) continue;
+ if (p == 0) continue;
entropy += p * log2(p);
}
entropy = -entropy;
@@ -35,34 +34,32 @@ int is_text(const u8 * data, s32 len) {
printf("Shannon entropy: %lf\n", entropy);
#endif
- if(entropy > 5.4 || entropy < 4.5)
- return 0;
-
+ if (entropy > 5.4 || entropy < 4.5) return 0;
+
// Step 2
s32 letters = 0;
s32 whitespace = 0;
- for(s32 i = 0; i < 256; i++) {
- if(i >= 'A' && i <= 'Z')
+ for (s32 i = 0; i < 256; i++) {
+ if (i >= 'A' && i <= 'Z')
letters += histogram[i];
- else if(i >= 'a' && i <= 'z')
+ else if (i >= 'a' && i <= 'z')
letters += histogram[i];
- else if(i >= '0' && i <= '9')
+ else if (i >= '0' && i <= '9')
letters += histogram[i];
- else if(i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
+ else if (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
whitespace += histogram[i];
}
#ifdef STANDALONE
- printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace, (double)(letters+whitespace) / len);
+ printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace,
+ (double)(letters + whitespace) / len);
#endif
- if((double)(letters+whitespace) / len < 0.6)
- return 0;
+ if ((double)(letters + whitespace) / len < 0.6) return 0;
// Step 3
double letters_ratio = (double)letters / whitespace;
- if(letters_ratio < 2 || letters_ratio > 9)
- return 0;
+ if (letters_ratio < 2 || letters_ratio > 9) return 0;
#ifdef STANDALONE
printf("Letter to whitespace ratio: %lf\n", letters_ratio);
