From c9e5f4c542553af029517d7849d252cfc64580f8 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sat, 9 Apr 2016 13:27:36 +0300 Subject: [PATCH 01/34] Create README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..af1a435 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +BCM
+ +DESCRIPTION
+BCM is a high-performance file compressor that utilizes advanced context modeling +techniques to achieve a very high compression ratio. All in all, it's like a big +brother of the BZIP2. + +AUTHORS
+Ilya Muravyov
+The libdivsufsort-lite library is developed by Yuta Mori + +THANKS
+Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Przemysław Skibiński +and LovePimple. From a9051c9ebf13ae521dbadf104792a98a5d099705 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sat, 9 Apr 2016 13:42:25 +0300 Subject: [PATCH 02/34] Update bcm.cpp --- bcm.cpp | 440 +++++++++++++ divsufsort.c | 1782 ++++++++++++++++++++++++++++++++++++++++++++++++++ divsufsort.h | 63 ++ 3 files changed, 2285 insertions(+) create mode 100644 bcm.cpp create mode 100644 divsufsort.c create mode 100644 divsufsort.h diff --git a/bcm.cpp b/bcm.cpp new file mode 100644 index 0000000..1174bb7 --- /dev/null +++ b/bcm.cpp @@ -0,0 +1,440 @@ +/* + +BCM - A BWT-based file compressor +Written and placed in the public domain by Ilya Muravyov + +*/ + +#ifdef __GNUC__ +#define _FILE_OFFSET_BITS 64 +#define _fseeki64 fseeko64 +#define _ftelli64 ftello64 +#endif + +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_DISABLE_PERFCRIT_LOCKS +#include +#include +#include +#include +#include "divsufsort.h" // libdivsufsort-lite + +typedef unsigned char byte; +typedef unsigned int uint; +typedef unsigned long long ulonglong; + +const char magic[]="BCM1"; + +FILE* in; +FILE* out; + +class Encoder +{ +public: + uint code; + uint low; + uint high; + + Encoder() + : code(0), low(0), high(-1) + {} + + void Encode(int bit, uint p) + { + const uint mid=low+((ulonglong(high-low)*(p<<14))>>32); + + if (bit) + high=mid; + else + low=mid+1; + + while ((low^high)<(1<<24)) + { + putc(low>>24, out); + low<<=8; + high=(high<<8)|255; + } + } + + void Flush() + { + for (int i=0; i<4; ++i) + { + putc(low>>24, out); + low<<=8; + } + } + + void Init() + { + for (int i=0; i<4; ++i) + code=(code<<8)|getc(in); + } + + int Decode(uint p) + { + const uint mid=low+((ulonglong(high-low)*(p<<14))>>32); + + const int bit=(code<=mid); + if (bit) + high=mid; + else + low=mid+1; + + while ((low^high)<(1<<24)) + { + code=(code<<8)|getc(in); + low<<=8; + high=(high<<8)|255; + } + + return bit; + } +}; + +template +class Counter 
+{ +public: + int p; + + Counter() + : p(1<<15) + {} + + void Update(int bit) + { + if (bit) + p+=(p^65535)>>RATE; + else + p-=p>>RATE; + } +}; + +class CM: public Encoder +{ +public: + Counter<2> counter0[256]; + Counter<4> counter1[256][256]; + Counter<6> counter2[2][256][17]; + int c1; + int c2; + int run; + + CM() + : c1(0), c2(0), run(0) + { + for (int i=0; i<2; ++i) + { + for (int j=0; j<256; ++j) + { + for (int k=0; k<17; ++k) + counter2[i][j][k].p=(k-(k==16))<<12; + } + } + } + + void Put(int c) + { + if (c1==c2) + ++run; + else + run=0; + const int f=(run>2); + + int ctx=1; + while (ctx<256) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; + + const int idx=p>>12; + const int x1=counter2[f][ctx][idx].p; + const int x2=counter2[f][ctx][idx+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + const int bit=((c&128)!=0); + c+=c; + Encoder::Encode(bit, p+ssep+ssep+ssep); + + counter0[ctx].Update(bit); + counter1[c1][ctx].Update(bit); + counter2[f][ctx][idx].Update(bit); + counter2[f][ctx][idx+1].Update(bit); + + ctx+=ctx+bit; + } + + c2=c1; + c1=byte(ctx); + } + + int Get() + { + if (c1==c2) + ++run; + else + run=0; + const int f=(run>2); + + int ctx=1; + while (ctx<256) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; + + const int idx=p>>12; + const int x1=counter2[f][ctx][idx].p; + const int x2=counter2[f][ctx][idx+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + const int bit=Encoder::Decode(p+ssep+ssep+ssep); + + counter0[ctx].Update(bit); + counter1[c1][ctx].Update(bit); + counter2[f][ctx][idx].Update(bit); + counter2[f][ctx][idx+1].Update(bit); + + ctx+=ctx+bit; + } + + c2=c1; + return c1=byte(ctx); + } +} cm; + +byte* buf; + +void compress(int b) +{ + if (_fseeki64(in, 0, SEEK_END)!=0) + { + perror("Fseek failed"); + exit(1); + } + const 
long long flen=_ftelli64(in); + if (flen<0) + { + perror("Ftell failed"); + exit(1); + } + if (b>flen) + b=int(flen); + rewind(in); + + buf=(byte*)calloc(b, 5); + if (!buf) + { + fprintf(stderr, "Out of memory\n"); + exit(1); + } + + putc(magic[0], out); + putc(magic[1], out); + putc(magic[2], out); + putc(magic[3], out); + + int n; + while ((n=fread(buf, 1, b, in))>0) + { + const int p=divbwt(buf, buf, (int*)&buf[b], n); + if (p<1) + { + perror("Divbwt failed"); + exit(1); + } + + cm.Put(n>>24); + cm.Put(n>>16); + cm.Put(n>>8); + cm.Put(n); + cm.Put(p>>24); + cm.Put(p>>16); + cm.Put(p>>8); + cm.Put(p); + + for (int i=0; ib)||(p<1)||(p>n)) + { + fprintf(stderr, "File corrupted\n"); + exit(1); + } + // Inverse BWT + int t[257]={0}; + for (int i=0; i=p); + for (int i=p; i!=0;) + { + i=next[i-1]; + putc(buf[i-(i>=p)], out); + } + } +} + +int main(int argc, char* argv[]) +{ + const clock_t start=clock(); + + int block_size=20<<20; // 20 MB + bool do_decomp=false; + bool overwrite=false; + + while ((argc>1)&&(argv[1][0]=='-')) + { + switch (argv[1][1]) + { + case 'b': + block_size=atoi(&argv[1][2]) + <<(argv[1][strlen(argv[1])-1]=='k'?10:20); + if (block_size<1) + { + fprintf(stderr, "Invalid block size\n"); + exit(1); + } + break; + case 'd': + do_decomp=true; + break; + case 'f': + overwrite=true; + break; + default: + fprintf(stderr, "Unknown option: %s\n", argv[1]); + exit(1); + } + --argc; + ++argv; + } + + if (argc<2) + { + fprintf(stderr, + "BCM - A BWT-based file compressor, v1.00\n" + "\n" + "Usage: BCM [options] infile [outfile]\n" + "\n" + "Options:\n" + " -b[k] Set block size to N MB or KB (default is 20 MB)\n" + " -d Decompress\n" + " -f Force overwrite of output file\n"); + exit(1); + } + + in=fopen(argv[1], "rb"); + if (!in) + { + perror(argv[1]); + exit(1); + } + + char ofname[FILENAME_MAX]; + if (argc<3) + { + strcpy(ofname, argv[1]); + if (do_decomp) + { + const int p=strlen(ofname)-4; + if ((p>0)&&(strcmp(&ofname[p], ".bcm")==0)) + ofname[p]='\0'; + 
else + strcat(ofname, ".out"); + } + else + strcat(ofname, ".bcm"); + } + else + strcpy(ofname, argv[2]); + + if (!overwrite) + { + FILE* f=fopen(ofname, "rb"); + if (f) + { + fclose(f); + fprintf(stderr, "%s already exists\n", ofname); + exit(1); + } + } + + out=fopen(ofname, "wb"); + if (!out) + { + perror(ofname); + exit(1); + } + + fprintf(stdout, "%s: ", argv[1]); + fflush(stdout); + + if (do_decomp) + decompress(); + else + compress(block_size); + + fprintf(stdout, "%lld -> %lld in %.3fs\n", + _ftelli64(in), _ftelli64(out), + double(clock()-start)/CLOCKS_PER_SEC); + + fclose(in); + fclose(out); + + free(buf); + + return 0; +} diff --git a/divsufsort.c b/divsufsort.c new file mode 100644 index 0000000..9bbac45 --- /dev/null +++ b/divsufsort.c @@ -0,0 +1,1782 @@ +/* + * divsufsort.c for libdivsufsort-lite + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#ifdef _OPENMP +# include +#endif +#include "divsufsort.h" + + +/*- Constants -*/ +#define INLINE __inline +#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) +# undef ALPHABET_SIZE +#endif +#if !defined(ALPHABET_SIZE) +# define ALPHABET_SIZE (256) +#endif +#define BUCKET_A_SIZE (ALPHABET_SIZE) +#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) +#if defined(SS_INSERTIONSORT_THRESHOLD) +# if SS_INSERTIONSORT_THRESHOLD < 1 +# undef SS_INSERTIONSORT_THRESHOLD +# define SS_INSERTIONSORT_THRESHOLD (1) +# endif +#else +# define SS_INSERTIONSORT_THRESHOLD (8) +#endif +#if defined(SS_BLOCKSIZE) +# if SS_BLOCKSIZE < 0 +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (0) +# elif 32768 <= SS_BLOCKSIZE +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (32767) +# endif +#else +# define SS_BLOCKSIZE (1024) +#endif +/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ +#if SS_BLOCKSIZE == 0 +# define SS_MISORT_STACKSIZE (96) +#elif SS_BLOCKSIZE <= 4096 +# define SS_MISORT_STACKSIZE (16) +#else +# define SS_MISORT_STACKSIZE (24) +#endif +#define SS_SMERGE_STACKSIZE (32) +#define TR_INSERTIONSORT_THRESHOLD (8) +#define TR_STACKSIZE (64) + + +/*- Macros -*/ +#ifndef SWAP +# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) +#endif /* SWAP */ +#ifndef MIN +# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) +#endif /* MIN */ +#ifndef MAX +# define MAX(_a, _b) (((_a) > (_b)) ? 
(_a) : (_b)) +#endif /* MAX */ +#define STACK_PUSH(_a, _b, _c, _d)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize++].d = (_d);\ + } while(0) +#define STACK_PUSH5(_a, _b, _c, _d, _e)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ + } while(0) +#define STACK_POP(_a, _b, _c, _d)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ + } while(0) +#define STACK_POP5(_a, _b, _c, _d, _e)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ + } while(0) +#define BUCKET_A(_c0) bucket_A[(_c0)] +#if ALPHABET_SIZE == 256 +#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) +#else +#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) +#endif + + +/*- Private Functions -*/ + +static const int lg_table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +int +ss_ilg(int n) { +#if SS_BLOCKSIZE == 0 + return (n & 
0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +#elif SS_BLOCKSIZE < 256 + return lg_table[n]; +#else + return (n & 0xff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]; +#endif +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + +#if SS_BLOCKSIZE != 0 + +static const int sqq_table[256] = { + 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, + 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, + 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, +110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, +128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, +143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, +156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, +169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, +181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, +192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, +202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, +212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, +221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, +230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, +239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, +247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 +}; + +static INLINE +int +ss_isqrt(int x) { + int y, e; + + if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } + e = (x & 0xffff0000) ? + ((x & 0xff000000) ? 
+ 24 + lg_table[(x >> 24) & 0xff] : + 16 + lg_table[(x >> 16) & 0xff]) : + ((x & 0x0000ff00) ? + 8 + lg_table[(x >> 8) & 0xff] : + 0 + lg_table[(x >> 0) & 0xff]); + + if(e >= 16) { + y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); + if(e >= 24) { y = (y + 1 + x / y) >> 1; } + y = (y + 1 + x / y) >> 1; + } else if(e >= 8) { + y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; + } else { + return sqq_table[x] >> 4; + } + + return (x < (y * y)) ? y - 1 : y; +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/* Compares two suffixes. */ +static INLINE +int +ss_compare(const unsigned char *T, + const int *p1, const int *p2, + int depth) { + const unsigned char *U1, *U2, *U1n, *U2n; + + for(U1 = T + depth + *p1, + U2 = T + depth + *p2, + U1n = T + *(p1 + 1) + 2, + U2n = T + *(p2 + 1) + 2; + (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); + ++U1, ++U2) { + } + + return U1 < U1n ? + (U2 < U2n ? *U1 - *U2 : 1) : + (U2 < U2n ? 
-1 : 0); +} + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) + +/* Insertionsort for small size groups */ +static +void +ss_insertionsort(const unsigned char *T, const int *PA, + int *first, int *last, int depth) { + int *i, *j; + int t; + int r; + + for(i = last - 2; first <= i; --i) { + for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { + do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); + if(last <= j) { break; } + } + if(r == 0) { *j = ~*j; } + *(j - 1) = t; + } +} + +#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +void +ss_fixdown(const unsigned char *Td, const int *PA, + int *SA, int i, int size) { + int j, k; + int v; + int c, d, e; + + for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = Td[PA[SA[k = j++]]]; + if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) { + int i, m; + int t; + + m = size; + if((size % 2) == 0) { + m--; + if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + ss_fixdown(Td, PA, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. 
*/ +static INLINE +int * +ss_median3(const unsigned char *Td, const int *PA, + int *v1, int *v2, int *v3) { + int *t; + if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } + if(Td[PA[*v2]] > Td[PA[*v3]]) { + if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static INLINE +int * +ss_median5(const unsigned char *Td, const int *PA, + int *v1, int *v2, int *v3, int *v4, int *v5) { + int *t; + if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } + if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } + if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } + if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } + if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } + if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +int * +ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) { + int *middle; + int t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return ss_median3(Td, PA, first, middle, last - 1); + } else { + t >>= 2; + return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = ss_median3(Td, PA, first, first + t, first + (t << 1)); + middle = ss_median3(Td, PA, middle - t, middle, middle + t); + last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); + return ss_median3(Td, PA, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +/* Binary partition for substrings. 
*/ +static INLINE +int * +ss_partition(const int *PA, + int *first, int *last, int depth) { + int *a, *b; + int t; + for(a = first - 1, b = last;;) { + for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } + for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } + if(b <= a) { break; } + t = ~*b; + *b = *a; + *a = t; + } + if(first < a) { *first = ~*first; } + return a; +} + +/* Multikey introsort for medium size groups. */ +static +void +ss_mintrosort(const unsigned char *T, const int *PA, + int *first, int *last, + int depth) { +#define STACK_SIZE SS_MISORT_STACKSIZE + struct { int *a, *b, c; int d; } stack[STACK_SIZE]; + const unsigned char *Td; + int *a, *b, *c, *d, *e, *f; + int s, t; + int ssize; + int limit; + int v, x = 0; + + for(ssize = 0, limit = ss_ilg(last - first);;) { + + if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { +#if 1 < SS_INSERTIONSORT_THRESHOLD + if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } +#endif + STACK_POP(first, last, depth, limit); + continue; + } + + Td = T + depth; + if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } + if(limit < 0) { + for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { + if((x = Td[PA[*a]]) != v) { + if(1 < (a - first)) { break; } + v = x; + first = a; + } + } + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, a, depth); + } + if((a - first) <= (last - a)) { + if(1 < (a - first)) { + STACK_PUSH(a, last, depth, -1); + last = a, depth += 1, limit = ss_ilg(a - first); + } else { + first = a, limit = -1; + } + } else { + if(1 < (last - a)) { + STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); + first = a, limit = -1; + } else { + last = a, depth += 1, limit = ss_ilg(a - first); + } + } + continue; + } + + /* choose pivot */ + a = ss_pivot(Td, PA, first, last); + v = Td[PA[*a]]; + SWAP(*first, *a); + + /* partition */ + for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < 
last) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), c = last - (d - c); + b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth); + + if((a - first) <= (last - c)) { + if((last - c) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(c, last, depth, limit); + last = a; + } else if((a - first) <= (c - b)) { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + last = a; + } else { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(first, a, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } else { + if((a - first) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(first, a, depth, limit); + first = c; + } else if((last - c) <= (c - b)) { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + first = c; + } else { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(c, last, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } + } else { + limit += 1; + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, last, depth); + limit = ss_ilg(last - first); + } + depth += 1; + } + } +#undef STACK_SIZE +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + + 
+/*---------------------------------------------------------------------------*/ + +#if SS_BLOCKSIZE != 0 + +static INLINE +void +ss_blockswap(int *a, int *b, int n) { + int t; + for(; 0 < n; --n, ++a, ++b) { + t = *a, *a = *b, *b = t; + } +} + +static INLINE +void +ss_rotate(int *first, int *middle, int *last) { + int *a, *b, t; + int l, r; + l = middle - first, r = last - middle; + for(; (0 < l) && (0 < r);) { + if(l == r) { ss_blockswap(first, middle, l); break; } + if(l < r) { + a = last - 1, b = middle - 1; + t = *a; + do { + *a-- = *b, *b-- = *a; + if(b < first) { + *a = t; + last = a; + if((r -= l + 1) <= l) { break; } + a -= 1, b = middle - 1; + t = *a; + } + } while(1); + } else { + a = first, b = middle; + t = *a; + do { + *a++ = *b, *b++ = *a; + if(last <= b) { + *a = t; + first = a + 1; + if((l -= r + 1) <= r) { break; } + a += 1, b = middle; + t = *a; + } + } while(1); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +static +void +ss_inplacemerge(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int depth) { + const int *p; + int *a, *b; + int len, half; + int q, r; + int x; + + for(;;) { + if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } + else { x = 0; p = PA + *(last - 1); } + for(a = first, len = middle - first, half = len >> 1, r = -1; + 0 < len; + len = half, half >>= 1) { + b = a + half; + q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); + if(q < 0) { + a = b + 1; + half -= (len & 1) ^ 1; + } else { + r = q; + } + } + if(a < middle) { + if(r == 0) { *a = ~*a; } + ss_rotate(a, middle, last); + last -= middle - a; + middle = a; + if(first == middle) { break; } + } + --last; + if(x != 0) { while(*--last < 0) { } } + if(middle == last) { break; } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Merge-forward with internal buffer. 
*/ +static +void +ss_mergeforward(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int depth) { + int *a, *b, *c, *bufend; + int t; + int r; + + bufend = buf + (middle - first) - 1; + ss_blockswap(buf, first, middle - first); + + for(t = *(a = first), b = buf, c = middle;;) { + r = ss_compare(T, PA + *b, PA + *c, depth); + if(r < 0) { + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + } else if(r > 0) { + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } else { + *c = ~*c; + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } + } +} + +/* Merge-backward with internal buffer. */ +static +void +ss_mergebackward(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int depth) { + const int *p1, *p2; + int *a, *b, *c, *bufend; + int t; + int r; + int x; + + bufend = buf + (last - middle) - 1; + ss_blockswap(buf, middle, last - middle); + + x = 0; + if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } + else { p1 = PA + *bufend; } + if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } + else { p2 = PA + *(middle - 1); } + for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { + r = ss_compare(T, p1, p2, depth); + if(0 < r) { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = *b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + } else if(r < 0) { + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + 
else { p2 = PA + *c; } + } else { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = ~*b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + else { p2 = PA + *c; } + } + } +} + +/* D&C based merge. */ +static +void +ss_swapmerge(const unsigned char *T, const int *PA, + int *first, int *middle, int *last, + int *buf, int bufsize, int depth) { +#define STACK_SIZE SS_SMERGE_STACKSIZE +#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) +#define MERGE_CHECK(a, b, c)\ + do {\ + if(((c) & 1) ||\ + (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ + *(a) = ~*(a);\ + }\ + if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ + *(b) = ~*(b);\ + }\ + } while(0) + struct { int *a, *b, *c; int d; } stack[STACK_SIZE]; + int *l, *r, *lm, *rm; + int m, len, half; + int ssize; + int check, next; + + for(check = 0, ssize = 0;;) { + if((last - middle) <= bufsize) { + if((first < middle) && (middle < last)) { + ss_mergebackward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + if((middle - first) <= bufsize) { + if(first < middle) { + ss_mergeforward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; + 0 < len; + len = half, half >>= 1) { + if(ss_compare(T, PA + GETIDX(*(middle + m + half)), + PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { + m += half + 1; + half -= (len & 1) ^ 1; + } + } + + if(0 < m) { + lm = middle - m, rm = middle + m; + ss_blockswap(lm, 
middle, m); + l = r = middle, next = 0; + if(rm < last) { + if(*rm < 0) { + *rm = ~*rm; + if(first < lm) { for(; *--l < 0;) { } next |= 4; } + next |= 1; + } else if(first < lm) { + for(; *r < 0; ++r) { } + next |= 2; + } + } + + if((l - first) <= (last - r)) { + STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); + middle = lm, last = l, check = (check & 3) | (next & 4); + } else { + if((next & 2) && (r == middle)) { next ^= 6; } + STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); + first = r, middle = rm, check = (next & 3) | (check & 4); + } + } else { + if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { + *middle = ~*middle; + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + } + } +#undef STACK_SIZE +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/* Substring sort */ +static +void +sssort(const unsigned char *T, const int *PA, + int *first, int *last, + int *buf, int bufsize, + int depth, int n, int lastsuffix) { + int *a; +#if SS_BLOCKSIZE != 0 + int *b, *middle, *curbuf; + int j, k, curbufsize, limit; +#endif + int i; + + if(lastsuffix != 0) { ++first; } + +#if SS_BLOCKSIZE == 0 + ss_mintrosort(T, PA, first, last, depth); +#else + if((bufsize < SS_BLOCKSIZE) && + (bufsize < (last - first)) && + (bufsize < (limit = ss_isqrt(last - first)))) { + if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } + buf = middle = last - limit, bufsize = limit; + } else { + middle = last, limit = 0; + } + for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); +#endif + curbufsize = last - (a + SS_BLOCKSIZE); + curbuf = a + SS_BLOCKSIZE; + if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } + for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= 
k, k <<= 1, j >>= 1) { + ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); + } + } +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, middle, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, middle, depth); +#endif + for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { + if(i & 1) { + ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); + a -= k; + } + } + if(limit != 0) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, middle, last, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, middle, last, depth); +#endif + ss_inplacemerge(T, PA, first, middle, last, depth); + } +#endif + + if(lastsuffix != 0) { + /* Insert last type B* suffix. */ + int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; + for(a = first, i = *(first - 1); + (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); + ++a) { + *(a - 1) = *a; + } + *(a - 1) = i; + } +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +int +tr_ilg(int n) { + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +} + + +/*---------------------------------------------------------------------------*/ + +/* Simple insertionsort for small size groups. 
*/ +static +void +tr_insertionsort(const int *ISAd, int *first, int *last) { + int *a, *b; + int t, r; + + for(a = first + 1; a < last; ++a) { + for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { + do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); + if(b < first) { break; } + } + if(r == 0) { *b = ~*b; } + *(b + 1) = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_fixdown(const int *ISAd, int *SA, int i, int size) { + int j, k; + int v; + int c, d, e; + + for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = ISAd[SA[k = j++]]; + if(d < (e = ISAd[SA[j]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +tr_heapsort(const int *ISAd, int *SA, int size) { + int i, m; + int t; + + m = size; + if((size % 2) == 0) { + m--; + if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + tr_fixdown(ISAd, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static INLINE +int * +tr_median3(const int *ISAd, int *v1, int *v2, int *v3) { + int *t; + if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } + if(ISAd[*v2] > ISAd[*v3]) { + if(ISAd[*v1] > ISAd[*v3]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. 
*/ +static INLINE +int * +tr_median5(const int *ISAd, + int *v1, int *v2, int *v3, int *v4, int *v5) { + int *t; + if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } + if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } + if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } + if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } + if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } + if(ISAd[*v3] > ISAd[*v4]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +int * +tr_pivot(const int *ISAd, int *first, int *last) { + int *middle; + int t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return tr_median3(ISAd, first, middle, last - 1); + } else { + t >>= 2; + return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = tr_median3(ISAd, first, first + t, first + (t << 1)); + middle = tr_median3(ISAd, middle - t, middle, middle + t); + last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); + return tr_median3(ISAd, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +typedef struct _trbudget_t trbudget_t; +struct _trbudget_t { + int chance; + int remain; + int incval; + int count; +}; + +static INLINE +void +trbudget_init(trbudget_t *budget, int chance, int incval) { + budget->chance = chance; + budget->remain = budget->incval = incval; +} + +static INLINE +int +trbudget_check(trbudget_t *budget, int size) { + if(size <= budget->remain) { budget->remain -= size; return 1; } + if(budget->chance == 0) { budget->count += size; return 0; } + budget->remain += budget->incval - size; + budget->chance -= 1; + return 1; +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_partition(const int *ISAd, + int *first, int *middle, int *last, + int **pa, int **pb, int v) { + int *a, *b, *c, *d, *e, *f; + int t, s; + int x = 0; + + for(b = middle - 1; 
(++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + first += (b - a), last -= (d - c); + } + *pa = first, *pb = last; +} + +static +void +tr_copy(int *ISA, const int *SA, + int *first, int *a, int *b, int *last, + int depth) { + /* sort suffixes of middle partition + by using sorted order of suffixes of left and right partition. 
*/ + int *c, *d, *e; + int s, v; + + v = b - SA - 1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + ISA[s] = d - SA; + } + } + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + ISA[s] = d - SA; + } + } +} + +static +void +tr_partialcopy(int *ISA, const int *SA, + int *first, int *a, int *b, int *last, + int depth) { + int *c, *d, *e; + int s, v; + int rank, lastrank, newrank = -1; + + v = b - SA - 1; + lastrank = -1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } + + lastrank = -1; + for(e = d; first <= e; --e) { + rank = ISA[*e]; + if(lastrank != rank) { lastrank = rank; newrank = e - SA; } + if(newrank != rank) { ISA[*e] = newrank; } + } + + lastrank = -1; + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } +} + +static +void +tr_introsort(int *ISA, const int *ISAd, + int *SA, int *first, int *last, + trbudget_t *budget) { +#define STACK_SIZE TR_STACKSIZE + struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; + int *a, *b, *c; + int t; + int v, x = 0; + int incr = ISAd - ISA; + int limit, next; + int ssize, trlink = -1; + + for(ssize = 0, limit = tr_ilg(last - first);;) { + + if(limit < 0) { + if(limit == -1) { + /* tandem repeat partition */ + tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); + + /* update ranks */ + if(a < last) { + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + } + if(b < last) { + for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } + } + + /* push */ + if(1 < (b - a)) { + STACK_PUSH5(NULL, a, b, 0, 0); + STACK_PUSH5(ISAd - incr, 
first, last, -2, trlink); + trlink = ssize - 2; + } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); + last = a, limit = tr_ilg(a - first); + } else if(1 < (last - b)) { + first = b, limit = tr_ilg(last - b); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); + first = b, limit = tr_ilg(last - b); + } else if(1 < (a - first)) { + last = a, limit = tr_ilg(a - first); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else if(limit == -2) { + /* tandem repeat copy */ + a = stack[--ssize].b, b = stack[ssize].c; + if(stack[ssize].d == 0) { + tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); + } + STACK_POP5(ISAd, first, last, limit, trlink); + } else { + /* sorted partition */ + if(0 <= *first) { + a = first; + do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); + first = a; + } + if(first < last) { + a = first; do { *a = ~*a; } while(*++a < 0); + next = (ISA[*a] != ISAd[*a]) ? 
tr_ilg(a - first + 1) : -1; + if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } + + /* push */ + if(trbudget_check(budget, a - first)) { + if((a - first) <= (last - a)) { + STACK_PUSH5(ISAd, a, last, -3, trlink); + ISAd += incr, last = a, limit = next; + } else { + if(1 < (last - a)) { + STACK_PUSH5(ISAd + incr, first, a, next, trlink); + first = a, limit = -3; + } else { + ISAd += incr, last = a, limit = next; + } + } + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + if(1 < (last - a)) { + first = a, limit = -3; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + continue; + } + + if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { + tr_insertionsort(ISAd, first, last); + limit = -3; + continue; + } + + if(limit-- == 0) { + tr_heapsort(ISAd, first, last - first); + for(a = last - 1; first < a; a = b) { + for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } + } + limit = -3; + continue; + } + + /* choose pivot */ + a = tr_pivot(ISAd, first, last); + SWAP(*first, *a); + v = ISAd[*first]; + + /* partition */ + tr_partition(ISAd, first, first + 1, last, &a, &b, v); + if((last - first) != (b - a)) { + next = (ISA[*a] != v) ? 
tr_ilg(b - a) : -1; + + /* update ranks */ + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } + + /* push */ + if((1 < (b - a)) && (trbudget_check(budget, b - a))) { + if((a - first) <= (last - b)) { + if((last - b) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } else if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((a - first) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + if((a - first) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((last - b) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } + } else { + if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } 
else if(1 < (last - b)) { + first = b; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + last = a; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } + } else { + if(trbudget_check(budget, last - first)) { + limit = tr_ilg(last - first), ISAd += incr; + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } +#undef STACK_SIZE +} + + + +/*---------------------------------------------------------------------------*/ + +/* Tandem repeat sort */ +static +void +trsort(int *ISA, int *SA, int n, int depth) { + int *ISAd; + int *first, *last; + trbudget_t budget; + int t, skip, unsorted; + + trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); +/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ + for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { + first = SA; + skip = 0; + unsorted = 0; + do { + if((t = *first) < 0) { first -= t; skip += t; } + else { + if(skip != 0) { *(first + skip) = skip; skip = 0; } + last = SA + ISA[t] + 1; + if(1 < (last - first)) { + budget.count = 0; + tr_introsort(ISA, ISAd, SA, first, last, &budget); + if(budget.count != 0) { unsorted += budget.count; } + else { skip = first - last; } + } else if((last - first) == 1) { + skip = -1; + } + first = last; + } + } while(first < (SA + n)); + if(skip != 0) { *(first + skip) = skip; } + if(unsorted == 0) { break; } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Sorts suffixes of type B*. */ +static +int +sort_typeBstar(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n) { + int *PAb, *ISAb, *buf; +#ifdef _OPENMP + int *curbuf; + int l; +#endif + int i, j, k, t, m, bufsize; + int c0, c1; +#ifdef _OPENMP + int d0, d1; + int tmp; +#endif + + /* Initialize bucket arrays. 
*/ + for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } + for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } + + /* Count the number of occurrences of the first one or two characters of each + type A, B and B* suffix. Moreover, store the beginning position of all + type B* suffixes into the array SA. */ + for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { + /* type A suffix. */ + do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); + if(0 <= i) { + /* type B* suffix. */ + ++BUCKET_BSTAR(c0, c1); + SA[--m] = i; + /* type B suffix. */ + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { + ++BUCKET_B(c0, c1); + } + } + } + m = n - m; +/* +note: + A type B* suffix is lexicographically smaller than a type B suffix that + begins with the same first two characters. +*/ + + /* Calculate the index of start/end point of each bucket. */ + for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { + t = i + BUCKET_A(c0); + BUCKET_A(c0) = i + j; /* start point */ + i = t + BUCKET_B(c0, c0); + for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { + j += BUCKET_BSTAR(c0, c1); + BUCKET_BSTAR(c0, c1) = j; /* end point */ + i += BUCKET_B(c0, c1); + } + } + + if(0 < m) { + /* Sort the type B* suffixes by their first two characters. */ + PAb = SA + n - m; ISAb = SA + m; + for(i = m - 2; 0 <= i; --i) { + t = PAb[i], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = i; + } + t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = m - 1; + + /* Sort the type B* substrings using sssort. 
*/ +#ifdef _OPENMP + tmp = omp_get_max_threads(); + buf = SA + m, bufsize = (n - (2 * m)) / tmp; + c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; +#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp) + { + tmp = omp_get_thread_num(); + curbuf = buf + tmp * bufsize; + k = 0; + for(;;) { + #pragma omp critical(sssort_lock) + { + if(0 < (l = j)) { + d0 = c0, d1 = c1; + do { + k = BUCKET_BSTAR(d0, d1); + if(--d1 <= d0) { + d1 = ALPHABET_SIZE - 1; + if(--d0 < 0) { break; } + } + } while(((l - k) <= 1) && (0 < (l = k))); + c0 = d0, c1 = d1, j = k; + } + } + if(l == 0) { break; } + sssort(T, PAb, SA + k, SA + l, + curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); + } + } +#else + buf = SA + m, bufsize = n - (2 * m); + for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { + i = BUCKET_BSTAR(c0, c1); + if(1 < (j - i)) { + sssort(T, PAb, SA + i, SA + j, + buf, bufsize, 2, n, *(SA + i) == (m - 1)); + } + } + } +#endif + + /* Compute ranks of type B* substrings. */ + for(i = m - 1; 0 <= i; --i) { + if(0 <= SA[i]) { + j = i; + do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); + SA[i + 1] = i - j; + if(i <= 0) { break; } + } + j = i; + do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); + ISAb[SA[i]] = j; + } + + /* Construct the inverse suffix array of type B* suffixes using trsort. */ + trsort(ISAb, SA, m, 1); + + /* Set the sorted order of type B* suffixes. */ + for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } + if(0 <= i) { + t = i; + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } + SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; + } + } + + /* Calculate the index of start/end point of each bucket. 
*/ + BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ + for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { + i = BUCKET_A(c0 + 1) - 1; + for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { + t = i - BUCKET_B(c0, c1); + BUCKET_B(c0, c1) = i; /* end point */ + + /* Move all type B* suffixes to the correct position. */ + for(i = t, j = BUCKET_BSTAR(c0, c1); + j <= k; + --i, --k) { SA[i] = SA[k]; } + } + BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ + BUCKET_B(c0, c0) = i; /* end point */ + } + } + + return m; +} + +/* Constructs the suffix array by using the sorted order of type B* suffixes. */ +static +void +construct_SA(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int m) { + int *i, *j, *k; + int s; + int c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + *j = ~s; + c0 = T[--s]; + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); + *k-- = s; + } else { + assert(((s == 0) && (T[s] == c1)) || (s < 0)); + *j = ~s; + } + } + } + } + + /* Construct the suffix array by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); + /* Scan the suffix array from left to right. 
*/ + for(i = SA, j = SA + n; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + if((s == 0) || (T[s - 1] < c0)) { s = ~s; } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else { + assert(s < 0); + *i = ~s; + } + } +} + +/* Constructs the burrows-wheeler transformed string directly + by using the sorted order of type B* suffixes. */ +static +int +construct_BWT(const unsigned char *T, int *SA, + int *bucket_A, int *bucket_B, + int n, int m) { + int *i, *j, *k, *orig; + int s; + int c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + c0 = T[--s]; + *j = ~((int)c0); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); + *k-- = s; + } else if(s != 0) { + *j = ~s; +#ifndef NDEBUG + } else { + assert(T[s] == c1); +#endif + } + } + } + } + + /* Construct the BWTed string by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1); + /* Scan the suffix array from left to right. 
*/ + for(i = SA, j = SA + n, orig = SA; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + *i = c0; + if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else if(s != 0) { + *i = ~s; + } else { + orig = i; + } + } + + return orig - SA; +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +int +divsufsort(const unsigned char *T, int *SA, int n) { + int *bucket_A, *bucket_B; + int m; + int err = 0; + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } + else if(n == 0) { return 0; } + else if(n == 1) { SA[0] = 0; return 0; } + else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } + + bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); + bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); + + /* Suffixsort. */ + if((bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); + construct_SA(T, SA, bucket_A, bucket_B, n, m); + } else { + err = -2; + } + + free(bucket_B); + free(bucket_A); + + return err; +} + +int +divbwt(const unsigned char *T, unsigned char *U, int *A, int n) { + int *B; + int *bucket_A, *bucket_B; + int m, pidx, i; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } + else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } + + if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); } + bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); + bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); + + /* Burrows-Wheeler Transform. */ + if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, B, bucket_A, bucket_B, n); + pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); + + /* Copy to output string. 
*/ + U[0] = T[n - 1]; + for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; } + for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; } + pidx += 1; + } else { + pidx = -2; + } + + free(bucket_B); + free(bucket_A); + if(A == NULL) { free(B); } + + return pidx; +} diff --git a/divsufsort.h b/divsufsort.h new file mode 100644 index 0000000..8d8952e --- /dev/null +++ b/divsufsort.h @@ -0,0 +1,63 @@ +/* + * divsufsort.h for libdivsufsort-lite + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_H +#define _DIVSUFSORT_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + + +/*- Prototypes -*/ + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The output array of suffixes. + * @param n The length of the given string. 
+ * @return 0 if no error occurred, -1 or -2 otherwise. + */ +int +divsufsort(const unsigned char *T, int *SA, int n); + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ +int +divbwt(const unsigned char *T, unsigned char *U, int *A, int n); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_H */ From b13dba2b81f61c91e7aec52dc46818b7e87368f8 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sat, 9 Apr 2016 13:49:09 +0300 Subject: [PATCH 03/34] Updated to v1.01 Rename src/bcm.cpp to src/1/bcm.cpp --- bcm.cpp => src/bcm.cpp | 203 ++++++++++++++++++------------- divsufsort.c => src/divsufsort.c | 0 divsufsort.h => src/divsufsort.h | 0 3 files changed, 118 insertions(+), 85 deletions(-) rename bcm.cpp => src/bcm.cpp (61%) rename divsufsort.c => src/divsufsort.c (100%) rename divsufsort.h => src/divsufsort.h (100%) diff --git a/bcm.cpp b/src/bcm.cpp similarity index 61% rename from bcm.cpp rename to src/bcm.cpp index 1174bb7..1c4fb2a 100644 --- a/bcm.cpp +++ b/src/bcm.cpp @@ -28,25 +28,34 @@ const char magic[]="BCM1"; FILE* in; FILE* out; -class Encoder +struct Encoder { -public: uint code; uint low; uint high; Encoder() - : code(0), low(0), high(-1) - {} + { + code=0; + low=0; + high=uint(-1); + } - void Encode(int bit, uint p) + void EncodeBit0(uint p) { - const uint mid=low+((ulonglong(high-low)*(p<<14))>>32); + low+=((ulonglong(high-low)*(p<<14))>>32)+1; - if (bit) - high=mid; - else - low=mid+1; + while ((low^high)<(1<<24)) + { + putc(low>>24, out); + low<<=8; + high=(high<<8)|255; + } + } + + void EncodeBit1(uint p) + { + high=low+((ulonglong(high-low)*(p<<14))>>32); while ((low^high)<(1<<24)) { @@ -93,27 +102,28 @@ class Encoder 
}; template -class Counter +struct Counter { -public: int p; Counter() - : p(1<<15) - {} + { + p=1<<15; + } - void Update(int bit) + void UpdateBit0() { - if (bit) - p+=(p^65535)>>RATE; - else - p-=p>>RATE; + p-=p>>RATE; + } + + void UpdateBit1() + { + p+=(p^65535)>>RATE; } }; -class CM: public Encoder +struct CM: Encoder { -public: Counter<2> counter0[256]; Counter<4> counter1[256][256]; Counter<6> counter2[2][256][17]; @@ -122,8 +132,11 @@ class CM: public Encoder int run; CM() - : c1(0), c2(0), run(0) { + c1=0; + c2=0; + run=0; + for (int i=0; i<2; ++i) { for (int j=0; j<256; ++j) @@ -134,7 +147,7 @@ class CM: public Encoder } } - void Put(int c) + void Encode(int c) { if (c1==c2) ++run; @@ -155,23 +168,34 @@ class CM: public Encoder const int x2=counter2[f][ctx][idx+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - const int bit=((c&128)!=0); + const int bit=c&128; c+=c; - Encoder::Encode(bit, p+ssep+ssep+ssep); - counter0[ctx].Update(bit); - counter1[c1][ctx].Update(bit); - counter2[f][ctx][idx].Update(bit); - counter2[f][ctx][idx+1].Update(bit); - - ctx+=ctx+bit; + if (bit) + { + Encoder::EncodeBit1(p+ssep+ssep+ssep); + counter0[ctx].UpdateBit1(); + counter1[c1][ctx].UpdateBit1(); + counter2[f][ctx][idx].UpdateBit1(); + counter2[f][ctx][idx+1].UpdateBit1(); + ctx+=ctx+1; + } + else + { + Encoder::EncodeBit0(p+ssep+ssep+ssep); + counter0[ctx].UpdateBit0(); + counter1[c1][ctx].UpdateBit0(); + counter2[f][ctx][idx].UpdateBit0(); + counter2[f][ctx][idx+1].UpdateBit0(); + ctx+=ctx; + } } c2=c1; - c1=byte(ctx); + c1=ctx&255; } - int Get() + int Decode() { if (c1==c2) ++run; @@ -194,16 +218,26 @@ class CM: public Encoder const int bit=Encoder::Decode(p+ssep+ssep+ssep); - counter0[ctx].Update(bit); - counter1[c1][ctx].Update(bit); - counter2[f][ctx][idx].Update(bit); - counter2[f][ctx][idx+1].Update(bit); - - ctx+=ctx+bit; + if (bit) + { + counter0[ctx].UpdateBit1(); + counter1[c1][ctx].UpdateBit1(); + counter2[f][ctx][idx].UpdateBit1(); + 
counter2[f][ctx][idx+1].UpdateBit1(); + ctx+=ctx+1; + } + else + { + counter0[ctx].UpdateBit0(); + counter1[c1][ctx].UpdateBit0(); + counter2[f][ctx][idx].UpdateBit0(); + counter2[f][ctx][idx+1].UpdateBit0(); + ctx+=ctx; + } } c2=c1; - return c1=byte(ctx); + return c1=ctx&255; } } cm; @@ -211,15 +245,15 @@ byte* buf; void compress(int b) { - if (_fseeki64(in, 0, SEEK_END)!=0) + if (_fseeki64(in, 0, SEEK_END)) { - perror("Fseek failed"); + perror("Fseek() failed"); exit(1); } const long long flen=_ftelli64(in); if (flen<0) { - perror("Ftell failed"); + perror("Ftell() failed"); exit(1); } if (b>flen) @@ -244,37 +278,37 @@ void compress(int b) const int p=divbwt(buf, buf, (int*)&buf[b], n); if (p<1) { - perror("Divbwt failed"); + perror("Divbwt() failed"); exit(1); } - cm.Put(n>>24); - cm.Put(n>>16); - cm.Put(n>>8); - cm.Put(n); - cm.Put(p>>24); - cm.Put(p>>16); - cm.Put(p>>8); - cm.Put(p); + cm.Encode(n>>24); + cm.Encode(n>>16); + cm.Encode(n>>8); + cm.Encode(n); + cm.Encode(p>>24); + cm.Encode(p>>16); + cm.Encode(p>>8); + cm.Encode(p); for (int i=0; ib)||(p<1)||(p>n)) + const int p=(cm.Decode()<<24) + |(cm.Decode()<<16) + |(cm.Decode()<<8) + |cm.Decode(); + if (n<1 || n>b || p<1 || p>n) { fprintf(stderr, "File corrupted\n"); exit(1); @@ -313,13 +347,13 @@ void decompress() // Inverse BWT int t[257]={0}; for (int i=0; i=p); - for (int i=p; i!=0;) + for (int i=p; i;) { i=next[i-1]; putc(buf[i-(i>=p)], out); @@ -327,7 +361,7 @@ void decompress() } } -int main(int argc, char* argv[]) +int main(int argc, char** argv) { const clock_t start=clock(); @@ -335,7 +369,7 @@ int main(int argc, char* argv[]) bool do_decomp=false; bool overwrite=false; - while ((argc>1)&&(argv[1][0]=='-')) + while (argc>1 && *argv[1]=='-') { switch (argv[1][1]) { @@ -365,14 +399,14 @@ int main(int argc, char* argv[]) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.00\n" + "BCM - A BWT-based file compressor, v1.01\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" 
"Options:\n" - " -b[k] Set block size to N MB or KB (default is 20 MB)\n" - " -d Decompress\n" - " -f Force overwrite of output file\n"); + " -b#[k] Set block size to # MB or KB (default is 20 MB)\n" + " -d Decompress\n" + " -f Force overwrite of output file\n"); exit(1); } @@ -390,7 +424,7 @@ int main(int argc, char* argv[]) if (do_decomp) { const int p=strlen(ofname)-4; - if ((p>0)&&(strcmp(&ofname[p], ".bcm")==0)) + if (p>0 && !strcmp(&ofname[p], ".bcm")) ofname[p]='\0'; else strcat(ofname, ".out"); @@ -419,16 +453,15 @@ int main(int argc, char* argv[]) exit(1); } - fprintf(stdout, "%s: ", argv[1]); - fflush(stdout); + fprintf(stderr, "%s: ", argv[1]); + fflush(stderr); if (do_decomp) decompress(); else compress(block_size); - fprintf(stdout, "%lld -> %lld in %.3fs\n", - _ftelli64(in), _ftelli64(out), + fprintf(stderr, "%lld->%lld in %.3fs\n", _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); fclose(in); diff --git a/divsufsort.c b/src/divsufsort.c similarity index 100% rename from divsufsort.c rename to src/divsufsort.c diff --git a/divsufsort.h b/src/divsufsort.h similarity index 100% rename from divsufsort.h rename to src/divsufsort.h From d0a842646a2daa9f3f2992498901f90f56f9774a Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 12 Apr 2016 18:01:54 +0400 Subject: [PATCH 04/34] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index af1a435..4de73a2 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -BCM
+## BCM v1.01 -DESCRIPTION
+#### DESCRIPTION BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. -AUTHORS
-Ilya Muravyov
-The libdivsufsort-lite library is developed by Yuta Mori +#### AUTHORS +- Ilya Muravyov +- The libdivsufsort-lite library is developed by Yuta Mori -THANKS
+#### THANKS Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Przemysław Skibiński and LovePimple. From 8f752653f5e14464d2bf6ecc033b212bcc366b9c Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 12 Apr 2016 23:24:05 +0400 Subject: [PATCH 05/34] Updated to v1.02 --- src/bcm.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 1c4fb2a..bace5fb 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -43,8 +43,11 @@ struct Encoder void EncodeBit0(uint p) { +#ifdef _WIN64 + low+=((ulonglong(high-low)*p)>>18)+1; +#else low+=((ulonglong(high-low)*(p<<14))>>32)+1; - +#endif while ((low^high)<(1<<24)) { putc(low>>24, out); @@ -55,8 +58,11 @@ struct Encoder void EncodeBit1(uint p) { +#ifdef _WIN64 + high=low+((ulonglong(high-low)*p)>>18); +#else high=low+((ulonglong(high-low)*(p<<14))>>32); - +#endif while ((low^high)<(1<<24)) { putc(low>>24, out); @@ -80,10 +86,13 @@ struct Encoder code=(code<<8)|getc(in); } - int Decode(uint p) + int DecodeBit(uint p) { +#ifdef _WIN64 + const uint mid=low+((ulonglong(high-low)*p)>>18); +#else const uint mid=low+((ulonglong(high-low)*(p<<14))>>32); - +#endif const int bit=(code<=mid); if (bit) high=mid; @@ -216,7 +225,7 @@ struct CM: Encoder const int x2=counter2[f][ctx][idx+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - const int bit=Encoder::Decode(p+ssep+ssep+ssep); + const int bit=Encoder::DecodeBit(p+ssep+ssep+ssep); if (bit) { @@ -378,7 +387,7 @@ int main(int argc, char** argv) <<(argv[1][strlen(argv[1])-1]=='k'?10:20); if (block_size<1) { - fprintf(stderr, "Invalid block size\n"); + fprintf(stderr, "Block size is out of range\n"); exit(1); } break; @@ -399,7 +408,7 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.01\n" + "BCM - A BWT-based file compressor, v1.02\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" @@ -435,6 +444,13 @@ int main(int argc, char** argv) else 
strcpy(ofname, argv[2]); + if (!strcmp(ofname, argv[1])) + { + fprintf(stderr, "%s: Cannot %scompress onto itself\n", argv[1], + do_decomp?"de":""); + exit(1); + } + if (!overwrite) { FILE* f=fopen(ofname, "rb"); From b98b5b3cb15f980f9860c5198bde277f32dc5446 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 12 Apr 2016 22:24:51 +0300 Subject: [PATCH 06/34] Update README.md --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4de73a2..0a2ce0f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,11 @@ -## BCM v1.01 +## BCM v1.02 #### DESCRIPTION -BCM is a high-performance file compressor that utilizes advanced context modeling -techniques to achieve a very high compression ratio. All in all, it's like a big -brother of the BZIP2. +BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. #### AUTHORS - Ilya Muravyov - The libdivsufsort-lite library is developed by Yuta Mori #### THANKS -Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Przemysław Skibiński -and LovePimple. +Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Ratushnyak, Przemyslaw Skibinski and LovePimple. 
From f4d7226d75d50d407920cf22801d70b448a6dabc Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 19 Apr 2016 11:17:12 +0400 Subject: [PATCH 07/34] GCC fix getc_unlocked()/putc_unlocked() --- src/bcm.cpp | 98 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index bace5fb..188e6fa 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -6,13 +6,26 @@ Written and placed in the public domain by Ilya Muravyov */ #ifdef __GNUC__ + #define _FILE_OFFSET_BITS 64 #define _fseeki64 fseeko64 #define _ftelli64 ftello64 + +#ifdef HAVE_GETC_UNLOCKED +#undef getc +#define getc getc_unlocked +#endif + +#ifdef HAVE_PUTC_UNLOCKED +#undef putc +#define putc putc_unlocked #endif +#endif // __GNUC__ + #define _CRT_SECURE_NO_WARNINGS #define _CRT_DISABLE_PERFCRIT_LOCKS + #include #include #include @@ -110,7 +123,7 @@ struct Encoder } }; -template +template struct Counter { int p; @@ -122,12 +135,12 @@ struct Counter void UpdateBit0() { - p-=p>>RATE; + p-=p>>rate; } void UpdateBit1() { - p+=(p^65535)>>RATE; + p+=(p^65535)>>rate; } }; @@ -172,9 +185,9 @@ struct CM: Encoder const int p2=counter1[c2][ctx].p; const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; - const int idx=p>>12; - const int x1=counter2[f][ctx][idx].p; - const int x2=counter2[f][ctx][idx+1].p; + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); const int bit=c&128; @@ -185,8 +198,8 @@ struct CM: Encoder Encoder::EncodeBit1(p+ssep+ssep+ssep); counter0[ctx].UpdateBit1(); counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][idx].UpdateBit1(); - counter2[f][ctx][idx+1].UpdateBit1(); + counter2[f][ctx][j].UpdateBit1(); + counter2[f][ctx][j+1].UpdateBit1(); ctx+=ctx+1; } else @@ -194,8 +207,8 @@ struct CM: Encoder Encoder::EncodeBit0(p+ssep+ssep+ssep); counter0[ctx].UpdateBit0(); counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][idx].UpdateBit0(); - 
counter2[f][ctx][idx+1].UpdateBit0(); + counter2[f][ctx][j].UpdateBit0(); + counter2[f][ctx][j+1].UpdateBit0(); ctx+=ctx; } } @@ -220,9 +233,9 @@ struct CM: Encoder const int p2=counter1[c2][ctx].p; const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; - const int idx=p>>12; - const int x1=counter2[f][ctx][idx].p; - const int x2=counter2[f][ctx][idx+1].p; + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); const int bit=Encoder::DecodeBit(p+ssep+ssep+ssep); @@ -231,16 +244,16 @@ struct CM: Encoder { counter0[ctx].UpdateBit1(); counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][idx].UpdateBit1(); - counter2[f][ctx][idx+1].UpdateBit1(); + counter2[f][ctx][j].UpdateBit1(); + counter2[f][ctx][j+1].UpdateBit1(); ctx+=ctx+1; } else { counter0[ctx].UpdateBit0(); counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][idx].UpdateBit0(); - counter2[f][ctx][idx+1].UpdateBit0(); + counter2[f][ctx][j].UpdateBit0(); + counter2[f][ctx][j+1].UpdateBit0(); ctx+=ctx; } } @@ -252,7 +265,7 @@ struct CM: Encoder byte* buf; -void compress(int b) +void compress(int bsize) { if (_fseeki64(in, 0, SEEK_END)) { @@ -265,11 +278,11 @@ void compress(int b) perror("Ftell() failed"); exit(1); } - if (b>flen) - b=int(flen); + if (bsize>flen) + bsize=int(flen); rewind(in); - buf=(byte*)calloc(b, 5); + buf=(byte*)calloc(bsize, 5); if (!buf) { fprintf(stderr, "Out of memory\n"); @@ -282,10 +295,10 @@ void compress(int b) putc(magic[3], out); int n; - while ((n=fread(buf, 1, b, in))>0) + while ((n=fread(buf, 1, bsize, in))>0) { - const int p=divbwt(buf, buf, (int*)&buf[b], n); - if (p<1) + const int idx=divbwt(buf, buf, (int*)&buf[bsize], n); + if (idx<1) { perror("Divbwt() failed"); exit(1); @@ -295,10 +308,10 @@ void compress(int b) cm.Encode(n>>16); cm.Encode(n>>8); cm.Encode(n); - cm.Encode(p>>24); - cm.Encode(p>>16); - cm.Encode(p>>8); - cm.Encode(p); + cm.Encode(idx>>24); + cm.Encode(idx>>16); + cm.Encode(idx>>8); + 
cm.Encode(idx); for (int i=0; ib || p<1 || p>n) + if (n<1 || n>bsize || idx<1 || idx>n) { fprintf(stderr, "File corrupted\n"); exit(1); @@ -359,13 +372,13 @@ void decompress() ++t[(buf[i]=cm.Decode())+1]; for (int i=1; i<256; ++i) t[i]+=t[i-1]; - int* next=(int*)&buf[b]; + int* next=(int*)&buf[bsize]; for (int i=0; i=p); - for (int i=p; i;) + next[t[buf[i]]++]=i+(i>=idx); + for (int p=idx; p;) { - i=next[i-1]; - putc(buf[i-(i>=p)], out); + p=next[p-1]; + putc(buf[p-(p>=idx)], out); } } } @@ -374,7 +387,7 @@ int main(int argc, char** argv) { const clock_t start=clock(); - int block_size=20<<20; // 20 MB + int bsize=20<<20; // 20 MB bool do_decomp=false; bool overwrite=false; @@ -383,9 +396,9 @@ int main(int argc, char** argv) switch (argv[1][1]) { case 'b': - block_size=atoi(&argv[1][2]) + bsize=atoi(&argv[1][2]) <<(argv[1][strlen(argv[1])-1]=='k'?10:20); - if (block_size<1) + if (bsize<1) { fprintf(stderr, "Block size is out of range\n"); exit(1); @@ -401,6 +414,7 @@ int main(int argc, char** argv) fprintf(stderr, "Unknown option: %s\n", argv[1]); exit(1); } + --argc; ++argv; } @@ -475,7 +489,7 @@ int main(int argc, char** argv) if (do_decomp) decompress(); else - compress(block_size); + compress(bsize); fprintf(stderr, "%lld->%lld in %.3fs\n", _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); From 2b9cccb656c1d018a6805f53953d4f67e99764d4 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 20 Apr 2016 09:32:43 +0300 Subject: [PATCH 08/34] Updated to v1.03 --- src/bcm.cpp | 61 +++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 188e6fa..4345039 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -32,9 +32,10 @@ Written and placed in the public domain by Ilya Muravyov #include #include "divsufsort.h" // libdivsufsort-lite -typedef unsigned char byte; -typedef unsigned int uint; -typedef unsigned long long ulonglong; +typedef unsigned char BYTE; +typedef 
unsigned short WORD; +typedef unsigned int DWORD; +typedef unsigned long long QWORD; const char magic[]="BCM1"; @@ -43,44 +44,44 @@ FILE* out; struct Encoder { - uint code; - uint low; - uint high; + DWORD low; + DWORD high; + DWORD code; Encoder() { - code=0; low=0; - high=uint(-1); + high=DWORD(-1); + code=0; } - void EncodeBit0(uint p) + void EncodeBit0(DWORD p) { #ifdef _WIN64 - low+=((ulonglong(high-low)*p)>>18)+1; + low+=((QWORD(high-low)*p)>>18)+1; #else - low+=((ulonglong(high-low)*(p<<14))>>32)+1; + low+=((QWORD(high-low)*(p<<(32-18)))>>32)+1; #endif while ((low^high)<(1<<24)) { putc(low>>24, out); low<<=8; - high=(high<<8)|255; + high=(high<<8)+255; } } - void EncodeBit1(uint p) + void EncodeBit1(DWORD p) { #ifdef _WIN64 - high=low+((ulonglong(high-low)*p)>>18); + high=low+((QWORD(high-low)*p)>>18); #else - high=low+((ulonglong(high-low)*(p<<14))>>32); + high=low+((QWORD(high-low)*(p<<(32-18)))>>32); #endif while ((low^high)<(1<<24)) { putc(low>>24, out); low<<=8; - high=(high<<8)|255; + high=(high<<8)+255; } } @@ -96,15 +97,15 @@ struct Encoder void Init() { for (int i=0; i<4; ++i) - code=(code<<8)|getc(in); + code=(code<<8)+getc(in); } - int DecodeBit(uint p) + int DecodeBit(DWORD p) { #ifdef _WIN64 - const uint mid=low+((ulonglong(high-low)*p)>>18); + const DWORD mid=low+((QWORD(high-low)*p)>>18); #else - const uint mid=low+((ulonglong(high-low)*(p<<14))>>32); + const DWORD mid=low+((QWORD(high-low)*(p<<(32-18)))>>32); #endif const int bit=(code<=mid); if (bit) @@ -114,19 +115,19 @@ struct Encoder while ((low^high)<(1<<24)) { - code=(code<<8)|getc(in); low<<=8; - high=(high<<8)|255; + high=(high<<8)+255; + code=(code<<8)+getc(in); } return bit; } }; -template +template struct Counter { - int p; + WORD p; Counter() { @@ -135,12 +136,12 @@ struct Counter void UpdateBit0() { - p-=p>>rate; + p-=p>>RATE; } void UpdateBit1() { - p+=(p^65535)>>rate; + p+=(p^65535)>>RATE; } }; @@ -263,7 +264,7 @@ struct CM: Encoder } } cm; -byte* buf; +BYTE* buf; void 
compress(int bsize) { @@ -282,7 +283,7 @@ void compress(int bsize) bsize=int(flen); rewind(in); - buf=(byte*)calloc(bsize, 5); + buf=(BYTE*)calloc(bsize, 5); if (!buf) { fprintf(stderr, "Out of memory\n"); @@ -350,7 +351,7 @@ void decompress() break; if (!bsize) { - buf=(byte*)calloc(bsize=n, 5); + buf=(BYTE*)calloc(bsize=n, 5); if (!buf) { fprintf(stderr, "Out of memory\n"); @@ -422,7 +423,7 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.02\n" + "BCM - A BWT-based file compressor, v1.03\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" From ffc59d6dad11db20426964c13998e2a5cd84772e Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 20 Apr 2016 09:33:10 +0300 Subject: [PATCH 09/34] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0a2ce0f..8f454cb 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -## BCM v1.02 +## BCM v1.03 #### DESCRIPTION BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. #### AUTHORS - Ilya Muravyov -- The libdivsufsort-lite library is developed by Yuta Mori +- The libdivsufsort-lite library used is developed by Yuta Mori #### THANKS Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Ratushnyak, Przemyslaw Skibinski and LovePimple. 
From ebf65b69200fd4c4bcf4c708fc8f684e9ab511a9 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sun, 8 May 2016 20:38:07 +0300 Subject: [PATCH 10/34] Updated to v1.04 --- src/bcm.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 4345039..276991e 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -286,7 +286,7 @@ void compress(int bsize) buf=(BYTE*)calloc(bsize, 5); if (!buf) { - fprintf(stderr, "Out of memory\n"); + fprintf(stderr, "Out of memory!\n"); exit(1); } @@ -333,7 +333,7 @@ void decompress() || getc(in)!=magic[2] || getc(in)!=magic[3]) { - fprintf(stderr, "Not in BCM format\n"); + fprintf(stderr, "Not in BCM format!\n"); exit(1); } @@ -354,7 +354,7 @@ void decompress() buf=(BYTE*)calloc(bsize=n, 5); if (!buf) { - fprintf(stderr, "Out of memory\n"); + fprintf(stderr, "Out of memory!\n"); exit(1); } } @@ -364,7 +364,7 @@ void decompress() |cm.Decode(); if (n<1 || n>bsize || idx<1 || idx>n) { - fprintf(stderr, "File corrupted\n"); + fprintf(stderr, "File corrupted!\n"); exit(1); } // Inverse BWT @@ -388,7 +388,7 @@ int main(int argc, char** argv) { const clock_t start=clock(); - int bsize=20<<20; // 20 MB + int bsize=32<<20; // 32 MB bool do_decomp=false; bool overwrite=false; @@ -401,7 +401,7 @@ int main(int argc, char** argv) <<(argv[1][strlen(argv[1])-1]=='k'?10:20); if (bsize<1) { - fprintf(stderr, "Block size is out of range\n"); + fprintf(stderr, "Block size is out of range!\n"); exit(1); } break; @@ -423,12 +423,12 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.03\n" + "BCM - A BWT-based file compressor, v1.04\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" "Options:\n" - " -b#[k] Set block size to # MB or KB (default is 20 MB)\n" + " -b#[k] Set block size to # MB or KB (default is 32 MB)\n" " -d Decompress\n" " -f Force overwrite of output file\n"); exit(1); @@ -472,8 +472,15 @@ int main(int argc, char** 
argv) if (f) { fclose(f); - fprintf(stderr, "%s already exists\n", ofname); - exit(1); + + fprintf(stderr, "%s already exists; overwrite (y or n)? ", ofname); + fflush(stderr); + + if (getchar()!='y') + { + fprintf(stderr, "\tnot overwritten\n"); + exit(1); + } } } @@ -492,7 +499,7 @@ int main(int argc, char** argv) else compress(bsize); - fprintf(stderr, "%lld->%lld in %.3fs\n", _ftelli64(in), _ftelli64(out), + fprintf(stderr, "%lld -> %lld in %.3fs\n", _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); fclose(in); From 209c08b651e73241d3387930c77a74731f8f0d4e Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sun, 8 May 2016 20:41:30 +0300 Subject: [PATCH 11/34] Update README.md --- LICENSE | 37 +++++++++++++++++-------------------- README.md | 6 ++++-- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/LICENSE b/LICENSE index cf1ab25..6d2bffe 100644 --- a/LICENSE +++ b/LICENSE @@ -1,24 +1,21 @@ -This is free and unencumbered software released into the public domain. +The MIT License (MIT) -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. +Copyright (C) 2008-2016 Ilya Muravyov -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. -For more information, please refer to +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 8f454cb..3d98ebe 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -## BCM v1.03 +#### CAUTION: Upcoming versions of the BCM file compressor will not be compatible with current releases! Please remove all of your forks and stay tuned! + +## BCM v1.04 #### DESCRIPTION BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. 
All in all, it's like a big brother of the BZIP2. @@ -8,4 +10,4 @@ BCM is a high-performance file compressor that utilizes advanced context modelin - The libdivsufsort-lite library used is developed by Yuta Mori #### THANKS -Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Ratushnyak, Przemyslaw Skibinski and LovePimple. +Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski and LovePimple. From ead12bcd558a53bca9a86eb95a38e5e2eac45e25 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Fri, 20 May 2016 19:47:55 +0300 Subject: [PATCH 12/34] Update bcm.cpp --- src/bcm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 276991e..2f31c57 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -1,7 +1,7 @@ /* BCM - A BWT-based file compressor -Written and placed in the public domain by Ilya Muravyov +Copyright (C) 2008-2016 Ilya Muravyov */ From d78d9b5c3d28162f15bd99d9a4e1310c74ce2537 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Fri, 20 May 2016 19:52:28 +0300 Subject: [PATCH 13/34] Update README.md --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3d98ebe..2483880 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ -#### CAUTION: Upcoming versions of the BCM file compressor will not be compatible with current releases! Please remove all of your forks and stay tuned! +**CAUTION: Upcoming versions of the BCM file compressor will not be compatible with current releases! Please remove all of your forks and stay tuned!** -## BCM v1.04 +# BCM -#### DESCRIPTION +### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. 
-#### AUTHORS -- Ilya Muravyov -- The libdivsufsort-lite library used is developed by Yuta Mori +### Author +Ilya Muravyov -#### THANKS +### Thanks Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski and LovePimple. From 151351425009d81acd9f54b32dd2b950f84a4d7b Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 25 May 2016 12:03:30 +0400 Subject: [PATCH 14/34] Updated to v1.10 beta --- src/bcm.cpp | 192 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 143 insertions(+), 49 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 2f31c57..4ccdf5d 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -1,8 +1,27 @@ /* BCM - A BWT-based file compressor + Copyright (C) 2008-2016 Ilya Muravyov +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ */ #ifdef __GNUC__ @@ -23,6 +42,7 @@ Copyright (C) 2008-2016 Ilya Muravyov #endif // __GNUC__ +#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 #define _CRT_SECURE_NO_WARNINGS #define _CRT_DISABLE_PERFCRIT_LOCKS @@ -37,7 +57,7 @@ typedef unsigned short WORD; typedef unsigned int DWORD; typedef unsigned long long QWORD; -const char magic[]="BCM1"; +const char magic[]="BCM!"; FILE* in; FILE* out; @@ -165,11 +185,32 @@ struct CM: Encoder for (int j=0; j<256; ++j) { for (int k=0; k<17; ++k) - counter2[i][j][k].p=(k-(k==16))<<12; + counter2[i][j][k].p=(k<<12)-(k==16); } } } + void Encode32(DWORD n) + { + for (int i=0; i<32; ++i) + { + if (n&(1<<31)) + Encoder::EncodeBit1(1<<17); + else + Encoder::EncodeBit0(1<<17); + n+=n; + } + } + + DWORD Decode32() + { + DWORD n=0; + for (int i=0; i<32; ++i) + n+=n+Encoder::DecodeBit(1<<17); + + return n; + } + void Encode(int c) { if (c1==c2) @@ -184,7 +225,7 @@ struct CM: Encoder const int p0=counter0[ctx].p; const int p1=counter1[c1][ctx].p; const int p2=counter1[c2][ctx].p; - const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; + const int p=((p0+p1)*7+p2+p2)>>4; const int j=p>>12; const int x1=counter2[f][ctx][j].p; @@ -196,7 +237,7 @@ struct CM: Encoder if (bit) { - Encoder::EncodeBit1(p+ssep+ssep+ssep); + Encoder::EncodeBit1(ssep*3+p); counter0[ctx].UpdateBit1(); counter1[c1][ctx].UpdateBit1(); counter2[f][ctx][j].UpdateBit1(); @@ -205,7 +246,7 @@ struct CM: Encoder } else { - Encoder::EncodeBit0(p+ssep+ssep+ssep); + Encoder::EncodeBit0(ssep*3+p); counter0[ctx].UpdateBit0(); counter1[c1][ctx].UpdateBit0(); counter2[f][ctx][j].UpdateBit0(); @@ -232,14 +273,14 @@ struct CM: Encoder const int p0=counter0[ctx].p; const int p1=counter1[c1][ctx].p; const int p2=counter1[c2][ctx].p; - const int p=(p0+p0+p0+p0+p1+p1+p1+p2)>>3; + const int p=((p0+p1)*7+p2+p2)>>4; const int j=p>>12; const int x1=counter2[f][ctx][j].p; const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - const int 
bit=Encoder::DecodeBit(p+ssep+ssep+ssep); + const int bit=Encoder::DecodeBit(ssep*3+p); if (bit) { @@ -264,6 +305,64 @@ struct CM: Encoder } } cm; +struct CRC +{ + DWORD t[4][256]; + DWORD crc; + + CRC() + { + for (int i=0; i<256; ++i) + { + DWORD r=i; + for (int j=0; j<8; ++j) + r=(r>>1)^(-int(r&1)&0xedb88320); + t[0][i]=r; + } + + for (int i=0; i<256; ++i) + { + t[1][i]=t[0][t[0][i]&255]^(t[0][i]>>8); + t[2][i]=t[0][t[1][i]&255]^(t[1][i]>>8); + t[3][i]=t[0][t[2][i]&255]^(t[2][i]>>8); + } + + crc=DWORD(-1); + } + + DWORD operator()() const + { + return ~crc; + } + + void Clear() + { + crc=DWORD(-1); + } + + void Update(int c) + { + crc=t[0][(crc^c)&255]^(crc>>8); + } + + void Update(BYTE* p, int n) + { +#ifdef _WIN32 + for (; n>=4; n-=4) + { + crc^=*(const DWORD*)p; + p+=4; + crc=t[0][crc>>24] + ^t[1][(crc>>16)&255] + ^t[2][(crc>>8)&255] + ^t[3][crc&255]; + } +#endif + for (; n>0; --n) + crc=t[0][(crc^*p++)&255]^(crc>>8); + } +} crc; + BYTE* buf; void compress(int bsize) @@ -298,6 +397,8 @@ void compress(int bsize) int n; while ((n=fread(buf, 1, bsize, in))>0) { + crc.Update(buf, n); + const int idx=divbwt(buf, buf, (int*)&buf[bsize], n); if (idx<1) { @@ -305,23 +406,15 @@ void compress(int bsize) exit(1); } - cm.Encode(n>>24); - cm.Encode(n>>16); - cm.Encode(n>>8); - cm.Encode(n); - cm.Encode(idx>>24); - cm.Encode(idx>>16); - cm.Encode(idx>>8); - cm.Encode(idx); + cm.Encode32(n); + cm.Encode32(idx); for (int i=0; i0) { - const int n=(cm.Decode()<<24) - |(cm.Decode()<<16) - |(cm.Decode()<<8) - |cm.Decode(); - if (!n) // EOF - break; if (!bsize) { buf=(BYTE*)calloc(bsize=n, 5); @@ -358,16 +446,14 @@ void decompress() exit(1); } } - const int idx=(cm.Decode()<<24) - |(cm.Decode()<<16) - |(cm.Decode()<<8) - |cm.Decode(); - if (n<1 || n>bsize || idx<1 || idx>n) + + const int idx=cm.Decode32(); + if (n>bsize || idx<1 || idx>n) { fprintf(stderr, "File corrupted!\n"); exit(1); } - // Inverse BWT + // Inverse BW-transform int t[257]={0}; for (int i=0; i=idx)], out); + 
const int c=buf[p-(p>=idx)]; + putc(c, out); + crc.Update(c); } } + + if (cm.Decode32()!=crc()) + { + fprintf(stderr, "CRC error!\n"); + exit(1); + } } int main(int argc, char** argv) @@ -423,7 +517,8 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.04\n" + "BCM - A BWT-based file compressor, v1.10 beta\n" + "Copyright (C) 2008-2016 Ilya Muravyov\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" @@ -459,13 +554,6 @@ int main(int argc, char** argv) else strcpy(ofname, argv[2]); - if (!strcmp(ofname, argv[1])) - { - fprintf(stderr, "%s: Cannot %scompress onto itself\n", argv[1], - do_decomp?"de":""); - exit(1); - } - if (!overwrite) { FILE* f=fopen(ofname, "rb"); @@ -473,14 +561,11 @@ int main(int argc, char** argv) { fclose(f); - fprintf(stderr, "%s already exists; overwrite (y or n)? ", ofname); + fprintf(stderr, "%s already exists. Overwrite (y/n)? ", ofname); fflush(stderr); if (getchar()!='y') - { - fprintf(stderr, "\tnot overwritten\n"); exit(1); - } } } @@ -491,17 +576,26 @@ int main(int argc, char** argv) exit(1); } - fprintf(stderr, "%s: ", argv[1]); - fflush(stderr); - if (do_decomp) + { + fprintf(stderr, "Decompressing %s: ", argv[1]); + fflush(stderr); + decompress(); + } else + { + fprintf(stderr, "Compressing %s: ", argv[1]); + fflush(stderr); + compress(bsize); + } - fprintf(stderr, "%lld -> %lld in %.3fs\n", _ftelli64(in), _ftelli64(out), + fprintf(stderr, "%lld->%lld in %.1fs\n", _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); + fprintf(stderr, "CRC = %08X\n", crc()); // DEBUG + fclose(in); fclose(out); From ee400729973b52a285d8cf3ca6c539286d809af1 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 25 May 2016 11:04:43 +0300 Subject: [PATCH 15/34] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 2483880..09d0b3f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -**CAUTION: 
Upcoming versions of the BCM file compressor will not be compatible with current releases! Please remove all of your forks and stay tuned!** - -# BCM +# BCM v1.10 beta ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. From 78672c86f89047f3daf7b2a079dc07c01a49bf15 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 8 Jun 2016 20:47:03 +0400 Subject: [PATCH 16/34] Updated to v1.20 beta --- src/bcm.cpp | 1041 ++++++++++++++++++++++++++------------------------- 1 file changed, 533 insertions(+), 508 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 4ccdf5d..9bae682 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -50,556 +50,581 @@ SOFTWARE. #include #include #include + +#ifndef NO_UTIME +#include +#include +#include +#endif + #include "divsufsort.h" // libdivsufsort-lite -typedef unsigned char BYTE; -typedef unsigned short WORD; -typedef unsigned int DWORD; -typedef unsigned long long QWORD; +typedef unsigned char byte; +typedef unsigned short word; +typedef unsigned int uint; +typedef unsigned long long ulonglong; const char magic[]="BCM!"; -FILE* in; -FILE* out; +FILE* fin; +FILE* fout; struct Encoder { - DWORD low; - DWORD high; - DWORD code; - - Encoder() - { - low=0; - high=DWORD(-1); - code=0; - } - - void EncodeBit0(DWORD p) - { + uint low; + uint high; + uint code; + + Encoder() + { + low=0; + high=uint(-1); + code=0; + } + + void EncodeBit0(uint p) + { #ifdef _WIN64 - low+=((QWORD(high-low)*p)>>18)+1; + low+=((ulonglong(high-low)*p)>>18)+1; #else - low+=((QWORD(high-low)*(p<<(32-18)))>>32)+1; + low+=((ulonglong(high-low)*(p<<(32-18)))>>32)+1; #endif - while ((low^high)<(1<<24)) - { - putc(low>>24, out); - low<<=8; - high=(high<<8)+255; - } - } - - void EncodeBit1(DWORD p) - { + while ((low^high)<(1<<24)) + { + putc(low>>24, fout); + low<<=8; + high=(high<<8)+255; + } + } + + void EncodeBit1(uint p) + { 
#ifdef _WIN64 - high=low+((QWORD(high-low)*p)>>18); + high=low+((ulonglong(high-low)*p)>>18); #else - high=low+((QWORD(high-low)*(p<<(32-18)))>>32); + high=low+((ulonglong(high-low)*(p<<(32-18)))>>32); #endif - while ((low^high)<(1<<24)) - { - putc(low>>24, out); - low<<=8; - high=(high<<8)+255; - } - } - - void Flush() - { - for (int i=0; i<4; ++i) - { - putc(low>>24, out); - low<<=8; - } - } - - void Init() - { - for (int i=0; i<4; ++i) - code=(code<<8)+getc(in); - } - - int DecodeBit(DWORD p) - { + while ((low^high)<(1<<24)) + { + putc(low>>24, fout); + low<<=8; + high=(high<<8)+255; + } + } + + void Flush() + { + for (int i=0; i<4; ++i) + { + putc(low>>24, fout); + low<<=8; + } + } + + void Init() + { + for (int i=0; i<4; ++i) + code=(code<<8)+getc(fin); + } + + int DecodeBit(uint p) + { #ifdef _WIN64 - const DWORD mid=low+((QWORD(high-low)*p)>>18); + const uint mid=low+((ulonglong(high-low)*p)>>18); #else - const DWORD mid=low+((QWORD(high-low)*(p<<(32-18)))>>32); + const uint mid=low+((ulonglong(high-low)*(p<<(32-18)))>>32); #endif - const int bit=(code<=mid); - if (bit) - high=mid; - else - low=mid+1; - - while ((low^high)<(1<<24)) - { - low<<=8; - high=(high<<8)+255; - code=(code<<8)+getc(in); - } - - return bit; - } + const int bit=(code<=mid); + if (bit) + high=mid; + else + low=mid+1; + + while ((low^high)<(1<<24)) + { + low<<=8; + high=(high<<8)+255; + code=(code<<8)+getc(fin); + } + + return bit; + } }; template struct Counter { - WORD p; - - Counter() - { - p=1<<15; - } - - void UpdateBit0() - { - p-=p>>RATE; - } - - void UpdateBit1() - { - p+=(p^65535)>>RATE; - } + word p; + + Counter() + { + p=1<<15; + } + + void UpdateBit0() + { + p-=p>>RATE; + } + + void UpdateBit1() + { + p+=(p^65535)>>RATE; + } }; struct CM: Encoder { - Counter<2> counter0[256]; - Counter<4> counter1[256][256]; - Counter<6> counter2[2][256][17]; - int c1; - int c2; - int run; - - CM() - { - c1=0; - c2=0; - run=0; - - for (int i=0; i<2; ++i) - { - for (int j=0; j<256; ++j) - { - 
for (int k=0; k<17; ++k) - counter2[i][j][k].p=(k<<12)-(k==16); - } - } - } - - void Encode32(DWORD n) - { - for (int i=0; i<32; ++i) - { - if (n&(1<<31)) - Encoder::EncodeBit1(1<<17); - else - Encoder::EncodeBit0(1<<17); - n+=n; - } - } - - DWORD Decode32() - { - DWORD n=0; - for (int i=0; i<32; ++i) - n+=n+Encoder::DecodeBit(1<<17); - - return n; - } - - void Encode(int c) - { - if (c1==c2) - ++run; - else - run=0; - const int f=(run>2); - - int ctx=1; - while (ctx<256) - { - const int p0=counter0[ctx].p; - const int p1=counter1[c1][ctx].p; - const int p2=counter1[c2][ctx].p; - const int p=((p0+p1)*7+p2+p2)>>4; - - const int j=p>>12; - const int x1=counter2[f][ctx][j].p; - const int x2=counter2[f][ctx][j+1].p; - const int ssep=x1+(((x2-x1)*(p&4095))>>12); - - const int bit=c&128; - c+=c; - - if (bit) - { - Encoder::EncodeBit1(ssep*3+p); - counter0[ctx].UpdateBit1(); - counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][j].UpdateBit1(); - counter2[f][ctx][j+1].UpdateBit1(); - ctx+=ctx+1; - } - else - { - Encoder::EncodeBit0(ssep*3+p); - counter0[ctx].UpdateBit0(); - counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][j].UpdateBit0(); - counter2[f][ctx][j+1].UpdateBit0(); - ctx+=ctx; - } - } - - c2=c1; - c1=ctx&255; - } - - int Decode() - { - if (c1==c2) - ++run; - else - run=0; - const int f=(run>2); - - int ctx=1; - while (ctx<256) - { - const int p0=counter0[ctx].p; - const int p1=counter1[c1][ctx].p; - const int p2=counter1[c2][ctx].p; - const int p=((p0+p1)*7+p2+p2)>>4; - - const int j=p>>12; - const int x1=counter2[f][ctx][j].p; - const int x2=counter2[f][ctx][j+1].p; - const int ssep=x1+(((x2-x1)*(p&4095))>>12); - - const int bit=Encoder::DecodeBit(ssep*3+p); - - if (bit) - { - counter0[ctx].UpdateBit1(); - counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][j].UpdateBit1(); - counter2[f][ctx][j+1].UpdateBit1(); - ctx+=ctx+1; - } - else - { - counter0[ctx].UpdateBit0(); - counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][j].UpdateBit0(); - 
counter2[f][ctx][j+1].UpdateBit0(); - ctx+=ctx; - } - } - - c2=c1; - return c1=ctx&255; - } + Counter<2> counter0[256]; + Counter<4> counter1[256][256]; + Counter<6> counter2[2][256][17]; + int c1; + int c2; + int run; + + CM() + { + c1=0; + c2=0; + run=0; + + for (int i=0; i<2; ++i) + { + for (int j=0; j<256; ++j) + { + for (int k=0; k<17; ++k) + counter2[i][j][k].p=(k<<12)-(k==16); + } + } + } + + void Encode32(uint n) + { + for (int i=0; i<32; ++i) + { + if (n&(1<<31)) + Encoder::EncodeBit1(1<<17); + else + Encoder::EncodeBit0(1<<17); + n+=n; + } + } + + uint Decode32() + { + uint n=0; + for (int i=0; i<32; ++i) + n+=n+Encoder::DecodeBit(1<<17); + + return n; + } + + void Encode(int c) + { + if (c1==c2) + ++run; + else + run=0; + const int f=(run>2); + + int ctx=1; + while (ctx<256) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=((p0+p1)*7+p2+p2)>>4; + + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + const int bit=c&128; + c+=c; + + if (bit) + { + Encoder::EncodeBit1(ssep*3+p); + counter0[ctx].UpdateBit1(); + counter1[c1][ctx].UpdateBit1(); + counter2[f][ctx][j].UpdateBit1(); + counter2[f][ctx][j+1].UpdateBit1(); + ctx+=ctx+1; + } + else + { + Encoder::EncodeBit0(ssep*3+p); + counter0[ctx].UpdateBit0(); + counter1[c1][ctx].UpdateBit0(); + counter2[f][ctx][j].UpdateBit0(); + counter2[f][ctx][j+1].UpdateBit0(); + ctx+=ctx; + } + } + + c2=c1; + c1=ctx&255; + } + + int Decode() + { + if (c1==c2) + ++run; + else + run=0; + const int f=(run>2); + + int ctx=1; + while (ctx<256) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=((p0+p1)*7+p2+p2)>>4; + + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + const int 
bit=Encoder::DecodeBit(ssep*3+p); + + if (bit) + { + counter0[ctx].UpdateBit1(); + counter1[c1][ctx].UpdateBit1(); + counter2[f][ctx][j].UpdateBit1(); + counter2[f][ctx][j+1].UpdateBit1(); + ctx+=ctx+1; + } + else + { + counter0[ctx].UpdateBit0(); + counter1[c1][ctx].UpdateBit0(); + counter2[f][ctx][j].UpdateBit0(); + counter2[f][ctx][j+1].UpdateBit0(); + ctx+=ctx; + } + } + + c2=c1; + return c1=ctx&255; + } } cm; struct CRC { - DWORD t[4][256]; - DWORD crc; - - CRC() - { - for (int i=0; i<256; ++i) - { - DWORD r=i; - for (int j=0; j<8; ++j) - r=(r>>1)^(-int(r&1)&0xedb88320); - t[0][i]=r; - } - - for (int i=0; i<256; ++i) - { - t[1][i]=t[0][t[0][i]&255]^(t[0][i]>>8); - t[2][i]=t[0][t[1][i]&255]^(t[1][i]>>8); - t[3][i]=t[0][t[2][i]&255]^(t[2][i]>>8); - } - - crc=DWORD(-1); - } - - DWORD operator()() const - { - return ~crc; - } - - void Clear() - { - crc=DWORD(-1); - } - - void Update(int c) - { - crc=t[0][(crc^c)&255]^(crc>>8); - } - - void Update(BYTE* p, int n) - { + uint t[4][256]; + uint crc; + + CRC() + { + for (int i=0; i<256; ++i) + { + uint r=i; + for (int j=0; j<8; ++j) + r=(r>>1)^(-int(r&1)&0xedb88320); + t[0][i]=r; + } + + for (int i=0; i<256; ++i) + { + t[1][i]=t[0][t[0][i]&255]^(t[0][i]>>8); + t[2][i]=t[0][t[1][i]&255]^(t[1][i]>>8); + t[3][i]=t[0][t[2][i]&255]^(t[2][i]>>8); + } + + crc=uint(-1); + } + + uint operator()() const + { + return ~crc; + } + + void Clear() + { + crc=uint(-1); + } + + void Update(int c) + { + crc=t[0][(crc^c)&255]^(crc>>8); + } + + void Update(byte* p, int n) + { #ifdef _WIN32 - for (; n>=4; n-=4) - { - crc^=*(const DWORD*)p; - p+=4; - crc=t[0][crc>>24] - ^t[1][(crc>>16)&255] - ^t[2][(crc>>8)&255] - ^t[3][crc&255]; - } + for (; n>=4; n-=4) + { + crc^=*(const uint*)p; + p+=4; + crc=t[0][crc>>24] + ^t[1][(crc>>16)&255] + ^t[2][(crc>>8)&255] + ^t[3][crc&255]; + } #endif - for (; n>0; --n) - crc=t[0][(crc^*p++)&255]^(crc>>8); - } + for (; n>0; --n) + crc=t[0][(crc^*p++)&255]^(crc>>8); + } } crc; -BYTE* buf; - void compress(int 
bsize) { - if (_fseeki64(in, 0, SEEK_END)) - { - perror("Fseek() failed"); - exit(1); - } - const long long flen=_ftelli64(in); - if (flen<0) - { - perror("Ftell() failed"); - exit(1); - } - if (bsize>flen) - bsize=int(flen); - rewind(in); - - buf=(BYTE*)calloc(bsize, 5); - if (!buf) - { - fprintf(stderr, "Out of memory!\n"); - exit(1); - } - - putc(magic[0], out); - putc(magic[1], out); - putc(magic[2], out); - putc(magic[3], out); - - int n; - while ((n=fread(buf, 1, bsize, in))>0) - { - crc.Update(buf, n); - - const int idx=divbwt(buf, buf, (int*)&buf[bsize], n); - if (idx<1) - { - perror("Divbwt() failed"); - exit(1); - } - - cm.Encode32(n); - cm.Encode32(idx); - - for (int i=0; iflen) + bsize=int(flen); + rewind(fin); + + byte* buf=(byte*)calloc(bsize, 5); + if (!buf) + { + fprintf(stderr, "Out of memory!\n"); + exit(1); + } + + putc(magic[0], fout); + putc(magic[1], fout); + putc(magic[2], fout); + putc(magic[3], fout); + + int n; + while ((n=fread(buf, 1, bsize, fin))>0) + { + crc.Update(buf, n); + + const int idx=divbwt(buf, buf, (int*)&buf[bsize], n); + if (idx<1) + { + perror("Divbwt failed"); + exit(1); + } + + cm.Encode32(n); + cm.Encode32(idx); + + for (int i=0; i0) - { - if (!bsize) - { - buf=(BYTE*)calloc(bsize=n, 5); - if (!buf) - { - fprintf(stderr, "Out of memory!\n"); - exit(1); - } - } - - const int idx=cm.Decode32(); - if (n>bsize || idx<1 || idx>n) - { - fprintf(stderr, "File corrupted!\n"); - exit(1); - } - // Inverse BW-transform - int t[257]={0}; - for (int i=0; i=idx); - for (int p=idx; p;) - { - p=next[p-1]; - const int c=buf[p-(p>=idx)]; - putc(c, out); - crc.Update(c); - } - } - - if (cm.Decode32()!=crc()) - { - fprintf(stderr, "CRC error!\n"); - exit(1); - } + if (getc(fin)!=magic[0] + ||getc(fin)!=magic[1] + ||getc(fin)!=magic[2] + ||getc(fin)!=magic[3]) + { + fprintf(stderr, "Not in BCM format!\n"); + exit(1); + } + + cm.Init(); + + int bsize=0; + byte* buf; + + int n; + while ((n=cm.Decode32())>0) + { + if (!bsize) + { + 
buf=(byte*)calloc(bsize=n, 5); + if (!buf) + { + fprintf(stderr, "Out of memory!\n"); + exit(1); + } + } + + const int idx=cm.Decode32(); + if (n>bsize || idx<1 || idx>n) + { + fprintf(stderr, "File corrupted!\n"); + exit(1); + } + // Inverse BW-transform + int t[257]={0}; + for (int i=0; i=idx); + for (int p=idx; p;) + { + p=next[p-1]; + const int c=buf[p-(p>=idx)]; + putc(c, fout); + crc.Update(c); + } + } + + free(buf); + + if (cm.Decode32()!=crc()) + { + fprintf(stderr, "CRC error!\n"); + exit(1); + } } int main(int argc, char** argv) { - const clock_t start=clock(); - - int bsize=32<<20; // 32 MB - bool do_decomp=false; - bool overwrite=false; - - while (argc>1 && *argv[1]=='-') - { - switch (argv[1][1]) - { - case 'b': - bsize=atoi(&argv[1][2]) - <<(argv[1][strlen(argv[1])-1]=='k'?10:20); - if (bsize<1) - { - fprintf(stderr, "Block size is out of range!\n"); - exit(1); - } - break; - case 'd': - do_decomp=true; - break; - case 'f': - overwrite=true; - break; - default: - fprintf(stderr, "Unknown option: %s\n", argv[1]); - exit(1); - } - - --argc; - ++argv; - } - - if (argc<2) - { - fprintf(stderr, - "BCM - A BWT-based file compressor, v1.10 beta\n" - "Copyright (C) 2008-2016 Ilya Muravyov\n" - "\n" - "Usage: BCM [options] infile [outfile]\n" - "\n" - "Options:\n" - " -b#[k] Set block size to # MB or KB (default is 32 MB)\n" - " -d Decompress\n" - " -f Force overwrite of output file\n"); - exit(1); - } - - in=fopen(argv[1], "rb"); - if (!in) - { - perror(argv[1]); - exit(1); - } - - char ofname[FILENAME_MAX]; - if (argc<3) - { - strcpy(ofname, argv[1]); - if (do_decomp) - { - const int p=strlen(ofname)-4; - if (p>0 && !strcmp(&ofname[p], ".bcm")) - ofname[p]='\0'; - else - strcat(ofname, ".out"); - } - else - strcat(ofname, ".bcm"); - } - else - strcpy(ofname, argv[2]); - - if (!overwrite) - { - FILE* f=fopen(ofname, "rb"); - if (f) - { - fclose(f); - - fprintf(stderr, "%s already exists. Overwrite (y/n)? 
", ofname); - fflush(stderr); - - if (getchar()!='y') - exit(1); - } - } - - out=fopen(ofname, "wb"); - if (!out) - { - perror(ofname); - exit(1); - } - - if (do_decomp) - { - fprintf(stderr, "Decompressing %s: ", argv[1]); - fflush(stderr); - - decompress(); - } - else - { - fprintf(stderr, "Compressing %s: ", argv[1]); - fflush(stderr); - - compress(bsize); - } - - fprintf(stderr, "%lld->%lld in %.1fs\n", _ftelli64(in), _ftelli64(out), - double(clock()-start)/CLOCKS_PER_SEC); - - fprintf(stderr, "CRC = %08X\n", crc()); // DEBUG - - fclose(in); - fclose(out); - - free(buf); - - return 0; + const clock_t start=clock(); + + int bsize=64<<20; // 64 MB + bool do_decomp=false; + bool overwrite=false; + + while (argc>1 && *argv[1]=='-') + { + switch (argv[1][1]) + { + case 'b': + bsize=atoi(&argv[1][2]) + <<(argv[1][strlen(argv[1])-1]=='k'?10:20); + if (bsize<1) + { + fprintf(stderr, "Block size is out of range!\n"); + exit(1); + } + break; + case 'd': + do_decomp=true; + break; + case 'f': + overwrite=true; + break; + default: + fprintf(stderr, "Unknown option: %s\n", argv[1]); + exit(1); + } + + --argc; + ++argv; + } + + if (argc<2) + { + fprintf(stderr, + "BCM - A BWT-based file compressor, v1.20 beta\n" + "Copyright (C) 2008-2016 Ilya Muravyov\n" + "\n" + "Usage: BCM [options] infile [outfile]\n" + "\n" + "Options:\n" + " -b#[k] Set block size to # MB or KB (default is 64 MB)\n" + " -d Decompress\n" + " -f Force overwrite of output file\n"); + exit(1); + } + + fin=fopen(argv[1], "rb"); + if (!fin) + { + perror(argv[1]); + exit(1); + } + + char ofname[FILENAME_MAX]; + if (argc<3) + { + strcpy(ofname, argv[1]); + if (do_decomp) + { + const int p=strlen(ofname)-4; + if (p>0 && !strcmp(&ofname[p], ".bcm")) + ofname[p]='\0'; + else + strcat(ofname, ".out"); + } + else + strcat(ofname, ".bcm"); + } + else + strcpy(ofname, argv[2]); + + if (!overwrite) + { + FILE* f=fopen(ofname, "rb"); + if (f) + { + fclose(f); + + fprintf(stderr, "%s already exists. Overwrite (y/n)? 
", ofname); + fflush(stderr); + + if (getchar()!='y') + exit(1); + } + } + + fout=fopen(ofname, "wb"); + if (!fout) + { + perror(ofname); + exit(1); + } + + if (do_decomp) + { + fprintf(stderr, "Decompressing %s: ", argv[1]); + fflush(stderr); + + decompress(); + } + else + { + fprintf(stderr, "Compressing %s: ", argv[1]); + fflush(stderr); + + compress(bsize); + } + + fprintf(stderr, "%lld -> %lld in %1.2fs\n", _ftelli64(fin), _ftelli64(fout), + double(clock()-start)/CLOCKS_PER_SEC); + + fclose(fin); + fclose(fout); + +#ifndef NO_UTIME + struct stat sb; + if (stat(argv[1], &sb)) + { + perror("Stat failed"); + exit(1); + } + struct utimbuf ub; + ub.actime=sb.st_atime; + ub.modtime=sb.st_mtime; + if (utime(ofname, &ub)) + { + perror("Utime failed"); + exit(1); + } +#endif + + fprintf(stderr, "CRC = %08X\n", crc()); // DEBUG + + return 0; } From 53a728ca6668820a1285240c53b42fb8c8c10898 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 8 Jun 2016 19:47:56 +0300 Subject: [PATCH 17/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 09d0b3f..dbc2a37 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.10 beta +# BCM v1.20 beta ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. From 71949f635a61ccfd7e418419a402c0a1e3dee5e5 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 14 Jun 2016 23:12:47 +0300 Subject: [PATCH 18/34] Updated to v1.21 beta --- src/bcm.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 9bae682..27485c6 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -29,6 +29,7 @@ SOFTWARE. 
#define _FILE_OFFSET_BITS 64 #define _fseeki64 fseeko64 #define _ftelli64 ftello64 +#define _stati64 stat #ifdef HAVE_GETC_UNLOCKED #undef getc @@ -416,6 +417,8 @@ void compress(int bsize) for (int i=0; i Date: Tue, 14 Jun 2016 23:13:19 +0300 Subject: [PATCH 19/34] Update bcm.cpp --- README.md | 4 ++-- src/bcm.cpp | 65 ++++++++++++++++++++++++---------------------------- src/make.cmd | 1 + 3 files changed, 33 insertions(+), 37 deletions(-) create mode 100644 src/make.cmd diff --git a/README.md b/README.md index dbc2a37..fd564ea 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.20 beta +# BCM v1.21 beta ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. @@ -7,4 +7,4 @@ BCM is a high-performance file compressor that utilizes advanced context modelin Ilya Muravyov ### Thanks -Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski and LovePimple. +Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski, Malcolm Taylor and LovePimple. diff --git a/src/bcm.cpp b/src/bcm.cpp index 27485c6..f0e3b96 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -24,25 +24,23 @@ SOFTWARE. 
*/ -#ifdef __GNUC__ - -#define _FILE_OFFSET_BITS 64 -#define _fseeki64 fseeko64 -#define _ftelli64 ftello64 -#define _stati64 stat - -#ifdef HAVE_GETC_UNLOCKED -#undef getc -#define getc getc_unlocked -#endif - -#ifdef HAVE_PUTC_UNLOCKED -#undef putc -#define putc putc_unlocked +#ifndef _MSC_VER +# define _FILE_OFFSET_BITS 64 + +# define _fseeki64 fseeko +# define _ftelli64 ftello +# define _stati64 stat + +# ifdef HAVE_GETC_UNLOCKED +# undef getc +# define getc getc_unlocked +# endif +# ifdef HAVE_PUTC_UNLOCKED +# undef putc +# define putc putc_unlocked +# endif #endif -#endif // __GNUC__ - #define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 #define _CRT_SECURE_NO_WARNINGS #define _CRT_DISABLE_PERFCRIT_LOCKS @@ -53,9 +51,14 @@ SOFTWARE. #include #ifndef NO_UTIME -#include -#include -#include +# include +# include + +# ifdef _MSC_VER +# include +# else +# include +# endif #endif #include "divsufsort.h" // libdivsufsort-lite @@ -373,20 +376,12 @@ struct CRC void compress(int bsize) { - if (_fseeki64(fin, 0, SEEK_END)) - { - perror("Fseek failed"); - exit(1); - } + _fseeki64(fin, 0, SEEK_END); const long long flen=_ftelli64(fin); - if (flen<0) - { - perror("Ftell failed"); - exit(1); - } - if (bsize>flen) + _fseeki64(fin, 0, SEEK_SET); + + if (flen>0 && bsize>flen) bsize=int(flen); - rewind(fin); byte* buf=(byte*)calloc(bsize, 5); if (!buf) @@ -530,15 +525,15 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.21 beta\n" + "BCM - A BWT-based file compressor, v1.22 beta\n" "Copyright (C) 2008-2016 Ilya Muravyov\n" "\n" - "Usage: BCM [options] infile [outfile]\n" + "Usage: %s [options] infile [outfile]\n" "\n" "Options:\n" " -b#[k] Set block size to # MB or KB (default is 32 MB)\n" " -d Decompress\n" - " -f Force overwrite of output file\n"); + " -f Force overwrite of output file\n", argv[0]); exit(1); } diff --git a/src/make.cmd b/src/make.cmd new file mode 100644 index 0000000..2800358 --- /dev/null +++ 
b/src/make.cmd @@ -0,0 +1 @@ +icl /O3 /Qipo /EHc /GA /GR- /Febcm.exe divsufsort.c bcm.cpp From 69cf0c221b368412e77c142e1dc18e77af360aa9 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 16 Aug 2016 11:51:44 +0400 Subject: [PATCH 20/34] Updated to v1.22 --- src/bcm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index f0e3b96..c578b5b 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -413,7 +413,8 @@ void compress(int bsize) for (int i=0; i0) + fprintf(stderr, "%3d%%\r", int((_ftelli64(fin)*100)/flen)); } free(buf); From f6dd5c9bac1e610134d89a330e16c672c24a3ba0 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 16 Aug 2016 11:52:21 +0400 Subject: [PATCH 21/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fd564ea..bcbe0bf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.21 beta +# BCM v1.22 beta ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. 
From ec4d5b5c88b67753993d441670d921af3a4285a1 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 16 Aug 2016 19:16:45 +0300 Subject: [PATCH 22/34] Updated to v1.25 --- src/bcm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index c578b5b..6c11e75 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -526,7 +526,7 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.22 beta\n" + "BCM - A BWT-based file compressor, v1.25\n" "Copyright (C) 2008-2016 Ilya Muravyov\n" "\n" "Usage: %s [options] infile [outfile]\n" From 4f984a7e068ed62ab7d426896aa627d4caf6f625 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Tue, 16 Aug 2016 19:18:50 +0300 Subject: [PATCH 23/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bcbe0bf..8dd6544 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.22 beta +# BCM v1.25 ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. 
From f9f8268c7749b661c5e3f7123281233a6fd00e21 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Wed, 10 Jan 2018 19:25:50 +0300 Subject: [PATCH 24/34] Delete make.cmd --- src/make.cmd | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/make.cmd diff --git a/src/make.cmd b/src/make.cmd deleted file mode 100644 index 2800358..0000000 --- a/src/make.cmd +++ /dev/null @@ -1 +0,0 @@ -icl /O3 /Qipo /EHc /GA /GR- /Febcm.exe divsufsort.c bcm.cpp From 7f4a4ad8acda0e5ca996ed1eff20a18892c7281e Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Thu, 11 Jan 2018 18:52:39 +0300 Subject: [PATCH 25/34] Updated to v1.30 --- README.md | 2 +- src/bcm.cpp | 150 +++++++++++++++++++++++++++------------------------- 2 files changed, 79 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 8dd6544..8480242 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.25 +# BCM v1.30 ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. 
diff --git a/src/bcm.cpp b/src/bcm.cpp index 6c11e75..ff8fa0a 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -2,7 +2,7 @@ BCM - A BWT-based file compressor -Copyright (C) 2008-2016 Ilya Muravyov +Copyright (C) 2008-2018 Ilya Muravyov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -318,7 +318,7 @@ struct CM: Encoder struct CRC { - uint t[4][256]; + uint t[256]; uint crc; CRC() @@ -328,51 +328,41 @@ struct CRC uint r=i; for (int j=0; j<8; ++j) r=(r>>1)^(-int(r&1)&0xedb88320); - t[0][i]=r; - } - - for (int i=0; i<256; ++i) - { - t[1][i]=t[0][t[0][i]&255]^(t[0][i]>>8); - t[2][i]=t[0][t[1][i]&255]^(t[1][i]>>8); - t[3][i]=t[0][t[2][i]&255]^(t[2][i]>>8); + t[i]=r; } crc=uint(-1); } - uint operator()() const + void Clear() { - return ~crc; + crc=uint(-1); } - void Clear() + uint operator()() const { - crc=uint(-1); + return ~crc; } void Update(int c) { - crc=t[0][(crc^c)&255]^(crc>>8); + crc=t[(crc^c)&255]^(crc>>8); } +} crc; - void Update(byte* p, int n) +template +inline T* mem_alloc(size_t n) +{ + T* p=(T*)malloc(n*sizeof(T)); + if (!p) { -#ifdef _WIN32 - for (; n>=4; n-=4) - { - crc^=*(const uint*)p; - p+=4; - crc=t[0][crc>>24] - ^t[1][(crc>>16)&255] - ^t[2][(crc>>8)&255] - ^t[3][crc&255]; - } -#endif - for (; n>0; --n) - crc=t[0][(crc^*p++)&255]^(crc>>8); + fprintf(stderr, "Out of memory!\n"); + exit(1); } -} crc; + return p; +} + +#define mem_free(p) free(p) void compress(int bsize) { @@ -383,12 +373,8 @@ void compress(int bsize) if (flen>0 && bsize>flen) bsize=int(flen); - byte* buf=(byte*)calloc(bsize, 5); - if (!buf) - { - fprintf(stderr, "Out of memory!\n"); - exit(1); - } + byte* buf=mem_alloc(bsize); + int* ptr=mem_alloc(bsize); putc(magic[0], fout); putc(magic[1], fout); @@ -398,9 +384,10 @@ void compress(int bsize) int n; while ((n=fread(buf, 1, bsize, fin))>0) { - crc.Update(buf, n); + for (int i=0; i0) - fprintf(stderr, "%3d%%\r", 
int((_ftelli64(fin)*100)/flen)); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(fin), _ftelli64(fout)); } - free(buf); - cm.Encode32(0); // EOF cm.Encode32(crc()); cm.Flush(); + + mem_free(buf); + mem_free(ptr); } void decompress() @@ -439,59 +426,81 @@ void decompress() cm.Init(); int bsize=0; - byte* buf; + byte* buf=NULL; + uint* ptr=NULL; int n; while ((n=cm.Decode32())>0) { if (!bsize) { - buf=(byte*)calloc(bsize=n, 5); - if (!buf) - { - fprintf(stderr, "Out of memory!\n"); - exit(1); - } + if ((bsize=n)>=(1<<24)) // 5n + buf=mem_alloc(bsize); + + ptr=mem_alloc(bsize); } const int idx=cm.Decode32(); if (n>bsize || idx<1 || idx>n) { - fprintf(stderr, "File corrupted!\n"); + fprintf(stderr, "Corrupt input!\n"); exit(1); } + // Inverse BW-transform - int t[257]={0}; - for (int i=0; i=idx); - for (int p=idx; p;) + if (n>=(1<<24)) // 5n { - p=next[p-1]; - const int c=buf[p-(p>=idx)]; - putc(c, fout); - crc.Update(c); + int t[257]={0}; + for (int i=0; i=idx); + for (int p=idx; p;) + { + p=ptr[p-1]; + const int c=buf[p-(p>=idx)]; + putc(c, fout); + crc.Update(c); + } + } + else // 4n + { + int t[257]={0}; + for (int i=0; i=idx))<<8; + for (int p=idx; p;) + { + p=ptr[p-1]>>8; + const int c=ptr[p-(p>=idx)]&255; + putc(c, fout); + crc.Update(c); + } } - } - free(buf); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(fin), _ftelli64(fout)); + } if (cm.Decode32()!=crc()) { fprintf(stderr, "CRC error!\n"); exit(1); } + + mem_free(buf); + mem_free(ptr); } int main(int argc, char** argv) { const clock_t start=clock(); - int bsize=32<<20; // 32 MB + int bsize=(1<<24)-1; // 16 MB bool do_decomp=false; bool overwrite=false; @@ -526,13 +535,13 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.25\n" - "Copyright (C) 2008-2016 Ilya Muravyov\n" + "BCM - A BWT-based file compressor, v1.30\n" + "Copyright (C) 2008-2018 Ilya Muravyov\n" "\n" "Usage: %s [options] infile [outfile]\n" "\n" "Options:\n" - " -b#[k] Set block size to # MB 
or KB (default is 32 MB)\n" + " -b#[k] Set block size to # MB or KB (default is 16 MB)\n" " -d Decompress\n" " -f Force overwrite of output file\n", argv[0]); exit(1); @@ -571,7 +580,6 @@ int main(int argc, char** argv) fclose(f); fprintf(stderr, "%s already exists. Overwrite (y/n)? ", ofname); - fflush(stderr); if (getchar()!='y') exit(1); @@ -621,7 +629,5 @@ int main(int argc, char** argv) } #endif - fprintf(stderr, "CRC = %08X\n", crc()); // DEBUG - return 0; } From b8bc4927051d30abf07e793dc10746cd6430bdff Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sat, 20 Jan 2018 16:48:21 +0300 Subject: [PATCH 26/34] Update README.md --- LICENSE | 37 ++++++++++++++++++++----------------- README.md | 13 ++++++++++++- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/LICENSE b/LICENSE index 6d2bffe..fdddb29 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,24 @@ -The MIT License (MIT) +This is free and unencumbered software released into the public domain. -Copyright (C) 2008-2016 Ilya Muravyov +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. 
We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +For more information, please refer to <http://unlicense.org> diff --git a/README.md b/README.md index 8480242..f1f8d42 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,19 @@ -# BCM v1.30 +# BCM v1.40 ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2.
+### Quick Benchmark +[ENWIK8][1] | Compressed Size | +------------|-------------------| +Original | 100,000,000 bytes | +GZIP -9 | 36,445,248 bytes | +BZIP2 -9 | 29,008,758 bytes | +7-Zip Ultra | 24,864,804 bytes | +BCM -9 | 20,789,667 bytes | + +[1]:http://mattmahoney.net/dc/text.html + ### Author Ilya Muravyov From 703765371917167d313fb8b4b4331c963324df81 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Thu, 16 Jan 2020 17:56:17 +0300 Subject: [PATCH 27/34] Updated to v1.40 --- src/bcm.cpp | 436 +++++++++++++++++++++++++++++----------------------- 1 file changed, 242 insertions(+), 194 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index ff8fa0a..3d88442 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -2,25 +2,7 @@ BCM - A BWT-based file compressor -Copyright (C) 2008-2018 Ilya Muravyov - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Written and placed in the public domain by Ilya Muravyov */ @@ -63,80 +45,132 @@ SOFTWARE. 
#include "divsufsort.h" // libdivsufsort-lite -typedef unsigned char byte; -typedef unsigned short word; -typedef unsigned int uint; -typedef unsigned long long ulonglong; +typedef unsigned char U8; +typedef unsigned short U16; +typedef unsigned int U32; +typedef unsigned long long U64; +typedef signed long long S64; -const char magic[]="BCM!"; +// Globals -FILE* fin; -FILE* fout; +FILE* g_in; +FILE* g_out; + +const char g_magic[]="BCM!"; struct Encoder { - uint low; - uint high; - uint code; + U32 low; + U32 high; + U32 code; Encoder() { low=0; - high=uint(-1); + high=0xFFFFFFFF; code=0; } - void EncodeBit0(uint p) + void Flush() + { + for (int i=0; i<4; ++i) + { + putc(low>>24, g_out); + low<<=8; + } + } + + void Init() + { + for (int i=0; i<4; ++i) + code=(code<<8)+getc(g_in); + } + + template + void EncodeDirectBits(U32 x) + { + for (U32 i=1<<(N-1); i!=0; i>>=1) + { + if (x&i) + high=low+((high-low)>>1); + else + low+=((high-low)>>1)+1; + + if ((low^high)<(1<<24)) + { + putc(low>>24, g_out); + low<<=8; + high=(high<<8)+255; + } + } + } + + void EncodeBit1(U32 p) { #ifdef _WIN64 - low+=((ulonglong(high-low)*p)>>18)+1; + high=low+((U64(high-low)*p)>>18); #else - low+=((ulonglong(high-low)*(p<<(32-18)))>>32)+1; + high=low+((U64(high-low)*(p<<(32-18)))>>32); #endif while ((low^high)<(1<<24)) { - putc(low>>24, fout); + putc(low>>24, g_out); low<<=8; high=(high<<8)+255; } } - void EncodeBit1(uint p) + void EncodeBit0(U32 p) { #ifdef _WIN64 - high=low+((ulonglong(high-low)*p)>>18); + low+=((U64(high-low)*p)>>18)+1; #else - high=low+((ulonglong(high-low)*(p<<(32-18)))>>32); + low+=((U64(high-low)*(p<<(32-18)))>>32)+1; #endif while ((low^high)<(1<<24)) { - putc(low>>24, fout); + putc(low>>24, g_out); low<<=8; high=(high<<8)+255; } } - void Flush() + template + U32 DecodeDirectBits() { - for (int i=0; i<4; ++i) + U32 x=0; + + for (int i=0; i>24, fout); - low<<=8; + const U32 mid=low+((high-low)>>1); + if (code<=mid) + { + high=mid; + x+=x+1; + } + else + { + low=mid+1; + 
x+=x; + } + + if ((low^high)<(1<<24)) + { + low<<=8; + high=(high<<8)+255; + code=(code<<8)+getc(g_in); + } } - } - void Init() - { - for (int i=0; i<4; ++i) - code=(code<<8)+getc(fin); + return x; } - int DecodeBit(uint p) + int DecodeBit(U32 p) { #ifdef _WIN64 - const uint mid=low+((ulonglong(high-low)*p)>>18); + const U32 mid=low+((U64(high-low)*p)>>18); #else - const uint mid=low+((ulonglong(high-low)*(p<<(32-18)))>>32); + const U32 mid=low+((U64(high-low)*(p<<(32-18)))>>32); #endif const int bit=(code<=mid); if (bit) @@ -148,7 +182,7 @@ struct Encoder { low<<=8; high=(high<<8)+255; - code=(code<<8)+getc(fin); + code=(code<<8)+getc(g_in); } return bit; @@ -158,11 +192,11 @@ struct Encoder template struct Counter { - word p; + U16 p; Counter() { - p=1<<15; + p=1<<15; // 0.5 } void UpdateBit0() @@ -172,7 +206,7 @@ struct Counter void UpdateBit1() { - p+=(p^65535)>>RATE; + p+=(p^0xFFFF)>>RATE; } }; @@ -201,27 +235,6 @@ struct CM: Encoder } } - void Encode32(uint n) - { - for (int i=0; i<32; ++i) - { - if (n&(1<<31)) - Encoder::EncodeBit1(1<<17); - else - Encoder::EncodeBit0(1<<17); - n+=n; - } - } - - uint Decode32() - { - uint n=0; - for (int i=0; i<32; ++i) - n+=n+Encoder::DecodeBit(1<<17); - - return n; - } - void Encode(int c) { if (c1==c2) @@ -236,19 +249,16 @@ struct CM: Encoder const int p0=counter0[ctx].p; const int p1=counter1[c1][ctx].p; const int p2=counter1[c2][ctx].p; - const int p=((p0+p1)*7+p2+p2)>>4; + const int p=(((p0+p1)*7)+p2+p2)>>4; const int j=p>>12; const int x1=counter2[f][ctx][j].p; const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - const int bit=c&128; - c+=c; - - if (bit) + if (c&128) { - Encoder::EncodeBit1(ssep*3+p); + Encoder::EncodeBit1((ssep*3)+p); counter0[ctx].UpdateBit1(); counter1[c1][ctx].UpdateBit1(); counter2[f][ctx][j].UpdateBit1(); @@ -257,17 +267,19 @@ struct CM: Encoder } else { - Encoder::EncodeBit0(ssep*3+p); + Encoder::EncodeBit0((ssep*3)+p); counter0[ctx].UpdateBit0(); 
counter1[c1][ctx].UpdateBit0(); counter2[f][ctx][j].UpdateBit0(); counter2[f][ctx][j+1].UpdateBit0(); ctx+=ctx; } + + c+=c; } c2=c1; - c1=ctx&255; + c1=ctx-256; } int Decode() @@ -284,16 +296,14 @@ struct CM: Encoder const int p0=counter0[ctx].p; const int p1=counter1[c1][ctx].p; const int p2=counter1[c2][ctx].p; - const int p=((p0+p1)*7+p2+p2)>>4; + const int p=(((p0+p1)*7)+p2+p2)>>4; const int j=p>>12; const int x1=counter2[f][ctx][j].p; const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - const int bit=Encoder::DecodeBit(ssep*3+p); - - if (bit) + if (Encoder::DecodeBit((ssep*3)+p)) { counter0[ctx].UpdateBit1(); counter1[c1][ctx].UpdateBit1(); @@ -312,41 +322,41 @@ struct CM: Encoder } c2=c1; - return c1=ctx&255; + return c1=ctx-256; } } cm; struct CRC { - uint t[256]; - uint crc; + U32 tab[256]; + U32 crc; CRC() { for (int i=0; i<256; ++i) { - uint r=i; + U32 r=i; for (int j=0; j<8; ++j) - r=(r>>1)^(-int(r&1)&0xedb88320); - t[i]=r; + r=(r>>1)^(0xEDB88320&-int(r&1)); + tab[i]=r; } - crc=uint(-1); + crc=0xFFFFFFFF; } void Clear() { - crc=uint(-1); + crc=0xFFFFFFFF; } - uint operator()() const + U32 operator()() const { - return ~crc; + return crc^0xFFFFFFFF; } void Update(int c) { - crc=t[(crc^c)&255]^(crc>>8); + crc=(crc>>8)^tab[(crc^c)&255]; } } crc; @@ -356,33 +366,45 @@ inline T* mem_alloc(size_t n) T* p=(T*)malloc(n*sizeof(T)); if (!p) { - fprintf(stderr, "Out of memory!\n"); + perror("Malloc() failed"); exit(1); } + return p; } #define mem_free(p) free(p) -void compress(int bsize) +void compress(int level) { - _fseeki64(fin, 0, SEEK_END); - const long long flen=_ftelli64(fin); - _fseeki64(fin, 0, SEEK_SET); + const int config_tab[10]= + { + 0, + 1<<20, // -1 - 1 MB + 1<<22, // -2 - 4 MB + 1<<23, // -3 - 8 MB + 0x00FFFFFF, // -4 - ~16 MB (Default) + 1<<25, // -5 - 32 MB + 1<<26, // -6 - 64 MB + 1<<27, // -7 - 128 MB + 1<<28, // -8 - 256 MB + 0x7FFFFFFF, // -9 - ~2 GB + }; + + int block_size=config_tab[level]; - if (flen>0 && 
bsize>flen) - bsize=int(flen); + _fseeki64(g_in, 0, SEEK_END); + const S64 file_size=_ftelli64(g_in); + _fseeki64(g_in, 0, SEEK_SET); - byte* buf=mem_alloc(bsize); - int* ptr=mem_alloc(bsize); + if (file_size>0 && block_size>file_size) + block_size=int(file_size); - putc(magic[0], fout); - putc(magic[1], fout); - putc(magic[2], fout); - putc(magic[3], fout); + U8* buf=mem_alloc(block_size); + int* ptr=mem_alloc(block_size); int n; - while ((n=fread(buf, 1, bsize, fin))>0) + while ((n=fread(buf, 1, block_size, g_in))>0) { for (int i=0; i(n); + cm.EncodeDirectBits<32>(idx); for (int i=0; i %lld\r", _ftelli64(fin), _ftelli64(fout)); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(g_in), _ftelli64(g_out)); } - cm.Encode32(0); // EOF - cm.Encode32(crc()); + cm.EncodeDirectBits<32>(0); // EOF + cm.EncodeDirectBits<32>(crc()); cm.Flush(); @@ -414,41 +436,32 @@ void compress(int bsize) void decompress() { - if (getc(fin)!=magic[0] - ||getc(fin)!=magic[1] - ||getc(fin)!=magic[2] - ||getc(fin)!=magic[3]) - { - fprintf(stderr, "Not in BCM format!\n"); - exit(1); - } - cm.Init(); - int bsize=0; - byte* buf=NULL; - uint* ptr=NULL; + int block_size=0; + U8* buf=NULL; + U32* ptr=NULL; int n; - while ((n=cm.Decode32())>0) + while ((n=cm.DecodeDirectBits<32>())>0) { - if (!bsize) + if (block_size==0) { - if ((bsize=n)>=(1<<24)) // 5n - buf=mem_alloc(bsize); + if ((block_size=n)>=(1<<24)) // 5*N + buf=mem_alloc(block_size); - ptr=mem_alloc(bsize); + ptr=mem_alloc(block_size); } - const int idx=cm.Decode32(); - if (n>bsize || idx<1 || idx>n) + const int idx=cm.DecodeDirectBits<32>(); + if (n>block_size || idx<1 || idx>n) { fprintf(stderr, "Corrupt input!\n"); exit(1); } // Inverse BW-transform - if (n>=(1<<24)) // 5n + if (n>=(1<<24)) // 5*N { int t[257]={0}; for (int i=0; i=idx)]; - putc(c, fout); crc.Update(c); + putc(c, g_out); } } - else // 4n + else // 4*N { int t[257]={0}; for (int i=0; i>8; const int c=ptr[p-(p>=idx)]&255; - putc(c, fout); crc.Update(c); + putc(c, g_out); } } - 
fprintf(stderr, "%lld -> %lld\r", _ftelli64(fin), _ftelli64(fout)); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(g_in), _ftelli64(g_out)); } - if (cm.Decode32()!=crc()) + if (cm.DecodeDirectBits<32>()!=crc()) { fprintf(stderr, "CRC error!\n"); exit(1); @@ -500,32 +513,39 @@ int main(int argc, char** argv) { const clock_t start=clock(); - int bsize=(1<<24)-1; // 16 MB + int level=4; bool do_decomp=false; bool overwrite=false; while (argc>1 && *argv[1]=='-') { - switch (argv[1][1]) + for (int i=1; argv[1][i]!='\0'; ++i) { - case 'b': - bsize=atoi(&argv[1][2]) - <<(argv[1][strlen(argv[1])-1]=='k'?10:20); - if (bsize<1) + switch (argv[1][i]) { - fprintf(stderr, "Block size is out of range!\n"); + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': +#ifdef _WIN64 + case '9': +#endif + level=argv[1][i]-'0'; + break; + case 'd': + do_decomp=true; + break; + case 'f': + overwrite=true; + break; + default: + fprintf(stderr, "Unknown option: -%c\n", argv[1][i]); exit(1); } - break; - case 'd': - do_decomp=true; - break; - case 'f': - overwrite=true; - break; - default: - fprintf(stderr, "Unknown option: %s\n", argv[1]); - exit(1); } --argc; @@ -535,96 +555,124 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.30\n" - "Copyright (C) 2008-2018 Ilya Muravyov\n" + "BCM - A BWT-based file compressor, v1.40\n" "\n" - "Usage: %s [options] infile [outfile]\n" + "Usage: BCM [options] infile [outfile]\n" "\n" "Options:\n" - " -b#[k] Set block size to # MB or KB (default is 16 MB)\n" - " -d Decompress\n" - " -f Force overwrite of output file\n", argv[0]); +#ifdef _WIN64 + " -1 .. -9 Set block size to 1 MB .. 2 GB\n" +#else + " -1 .. -8 Set block size to 1 MB .. 
256 MB\n" +#endif + " -d Decompress\n" + " -f Force overwrite of output file\n"); exit(1); } - fin=fopen(argv[1], "rb"); - if (!fin) + g_in=fopen(argv[1], "rb"); + if (!g_in) { perror(argv[1]); exit(1); } - char ofname[FILENAME_MAX]; + char out_name[FILENAME_MAX]; if (argc<3) { - strcpy(ofname, argv[1]); + strcpy(out_name, argv[1]); if (do_decomp) { - const int p=strlen(ofname)-4; - if (p>0 && !strcmp(&ofname[p], ".bcm")) - ofname[p]='\0'; + const int p=strlen(out_name)-4; + if (p>0 && strcmp(&out_name[p], ".bcm")==0) + out_name[p]='\0'; else - strcat(ofname, ".out"); + strcat(out_name, ".out"); } else - strcat(ofname, ".bcm"); + strcat(out_name, ".bcm"); } else - strcpy(ofname, argv[2]); + strcpy(out_name, argv[2]); if (!overwrite) { - FILE* f=fopen(ofname, "rb"); + FILE* f=fopen(out_name, "rb"); if (f) { fclose(f); - fprintf(stderr, "%s already exists. Overwrite (y/n)? ", ofname); + fprintf(stderr, "%s already exists. Overwrite (y/n)? ", out_name); + fflush(stderr); if (getchar()!='y') + { + fprintf(stderr, "Not overwritten\n"); exit(1); + } } } - fout=fopen(ofname, "wb"); - if (!fout) - { - perror(ofname); - exit(1); - } - if (do_decomp) { + if (getc(g_in)!=g_magic[0] + ||getc(g_in)!=g_magic[1] + ||getc(g_in)!=g_magic[2] + ||getc(g_in)!=g_magic[3]) + { + fprintf(stderr, "%s: Not in BCM format\n", argv[1]); + exit(1); + } + + g_out=fopen(out_name, "wb"); + if (!g_out) + { + perror(out_name); + exit(1); + } + fprintf(stderr, "Decompressing %s:\n", argv[1]); decompress(); } else { + g_out=fopen(out_name, "wb"); + if (!g_out) + { + perror(out_name); + exit(1); + } + + putc(g_magic[0], g_out); + putc(g_magic[1], g_out); + putc(g_magic[2], g_out); + putc(g_magic[3], g_out); + fprintf(stderr, "Compressing %s:\n", argv[1]); - compress(bsize); + compress(level); } - fprintf(stderr, "%lld -> %lld in %1.2fs\n", _ftelli64(fin), _ftelli64(fout), - double(clock()-start)/CLOCKS_PER_SEC); + fprintf(stderr, "%lld -> %lld in %1.1f sec\n", _ftelli64(g_in), + _ftelli64(g_out), 
double(clock()-start)/CLOCKS_PER_SEC); - fclose(fin); - fclose(fout); + fclose(g_in); + fclose(g_out); #ifndef NO_UTIME struct _stati64 sb; - if (_stati64(argv[1], &sb)) + if (_stati64(argv[1], &sb)!=0) { - perror("Stat failed"); + perror("Stat() failed"); exit(1); } struct utimbuf ub; ub.actime=sb.st_atime; ub.modtime=sb.st_mtime; - if (utime(ofname, &ub)) + if (utime(out_name, &ub)!=0) { - perror("Utime failed"); + perror("Utime() failed"); exit(1); } #endif From 940dca40a587cbf63fe3c073920bc5886fb30431 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Thu, 6 Feb 2020 18:26:11 +0300 Subject: [PATCH 28/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1f8d42..2080a82 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.40 +# BCM v1.50 ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. 
From 0dc6e19242f59849092fe98b1f60d736ee287555 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Thu, 6 Feb 2020 18:26:51 +0300 Subject: [PATCH 29/34] Updated to v1.50 --- src/bcm.cpp | 427 ++++++++++++++++++++++++---------------------------- 1 file changed, 199 insertions(+), 228 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 3d88442..2c23a23 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -53,10 +53,10 @@ typedef signed long long S64; // Globals -FILE* g_in; -FILE* g_out; +const char magic[]="BCM!"; -const char g_magic[]="BCM!"; +FILE* in; +FILE* out; struct Encoder { @@ -67,7 +67,7 @@ struct Encoder Encoder() { low=0; - high=0xFFFFFFFF; + high=U32(-1); code=0; } @@ -75,7 +75,7 @@ struct Encoder { for (int i=0; i<4; ++i) { - putc(low>>24, g_out); + putc(low>>24, out); low<<=8; } } @@ -83,106 +83,45 @@ struct Encoder void Init() { for (int i=0; i<4; ++i) - code=(code<<8)+getc(g_in); + code=(code<<8)+getc(in); } - template - void EncodeDirectBits(U32 x) + template + void EncodeBit(int bit, U32 p) { - for (U32 i=1<<(N-1); i!=0; i>>=1) - { - if (x&i) - high=low+((high-low)>>1); - else - low+=((high-low)>>1)+1; - - if ((low^high)<(1<<24)) - { - putc(low>>24, g_out); - low<<=8; - high=(high<<8)+255; - } - } - } + const U32 mid=low+((U64(high-low)*p)>>P_LOG); - void EncodeBit1(U32 p) - { -#ifdef _WIN64 - high=low+((U64(high-low)*p)>>18); -#else - high=low+((U64(high-low)*(p<<(32-18)))>>32); -#endif - while ((low^high)<(1<<24)) - { - putc(low>>24, g_out); - low<<=8; - high=(high<<8)+255; - } - } + if (bit) + high=mid; + else + low=mid+1; - void EncodeBit0(U32 p) - { -#ifdef _WIN64 - low+=((U64(high-low)*p)>>18)+1; -#else - low+=((U64(high-low)*(p<<(32-18)))>>32)+1; -#endif + // Renormalize while ((low^high)<(1<<24)) { - putc(low>>24, g_out); + putc(low>>24, out); low<<=8; high=(high<<8)+255; } } - template - U32 DecodeDirectBits() - { - U32 x=0; - - for (int i=0; i>1); - if (code<=mid) - { - high=mid; - x+=x+1; - } - else - { - low=mid+1; - x+=x; - } - - if 
((low^high)<(1<<24)) - { - low<<=8; - high=(high<<8)+255; - code=(code<<8)+getc(g_in); - } - } - - return x; - } - + template int DecodeBit(U32 p) { -#ifdef _WIN64 - const U32 mid=low+((U64(high-low)*p)>>18); -#else - const U32 mid=low+((U64(high-low)*(p<<(32-18)))>>32); -#endif + const U32 mid=low+((U64(high-low)*p)>>P_LOG); + const int bit=(code<=mid); if (bit) high=mid; else low=mid+1; + // Renormalize while ((low^high)<(1<<24)) { low<<=8; high=(high<<8)+255; - code=(code<<8)+getc(g_in); + code=(code<<8)+getc(in); } return bit; @@ -199,14 +138,14 @@ struct Counter p=1<<15; // 0.5 } - void UpdateBit0() + void Update1() { - p-=p>>RATE; + p+=(p^0xFFFF)>>RATE; } - void UpdateBit1() + void Update0() { - p+=(p^0xFFFF)>>RATE; + p-=p>>RATE; } }; @@ -215,79 +154,94 @@ struct CM: Encoder Counter<2> counter0[256]; Counter<4> counter1[256][256]; Counter<6> counter2[2][256][17]; + int run; int c1; int c2; - int run; CM() { + run=0; c1=0; c2=0; - run=0; for (int i=0; i<2; ++i) { for (int j=0; j<256; ++j) { - for (int k=0; k<17; ++k) + for (int k=0; k<=16; ++k) counter2[i][j][k].p=(k<<12)-(k==16); } } } - void Encode(int c) + void Put32(U32 x) + { + for (U32 i=1<<31; i>0; i>>=1) + EncodeBit<1>(x&i, 1); // p=0.5 + } + + U32 Get32() + { + U32 x=0; + for (int i=0; i<32; ++i) + x+=x+DecodeBit<1>(1); // p=0.5 + + return x; + } + + void Put(int c) { - if (c1==c2) - ++run; - else - run=0; const int f=(run>2); int ctx=1; - while (ctx<256) + for (int i=128; i>0; i>>=1) { const int p0=counter0[ctx].p; const int p1=counter1[c1][ctx].p; const int p2=counter1[c2][ctx].p; const int p=(((p0+p1)*7)+p2+p2)>>4; + // SSE with linear interpolation const int j=p>>12; const int x1=counter2[f][ctx][j].p; const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - if (c&128) + if (c&i) { - Encoder::EncodeBit1((ssep*3)+p); - counter0[ctx].UpdateBit1(); - counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][j].UpdateBit1(); - counter2[f][ctx][j+1].UpdateBit1(); + EncodeBit<18>(1, 
p+ssep+ssep+ssep); + + counter0[ctx].Update1(); + counter1[c1][ctx].Update1(); + counter2[f][ctx][j].Update1(); + counter2[f][ctx][j+1].Update1(); + ctx+=ctx+1; } else { - Encoder::EncodeBit0((ssep*3)+p); - counter0[ctx].UpdateBit0(); - counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][j].UpdateBit0(); - counter2[f][ctx][j+1].UpdateBit0(); + EncodeBit<18>(0, p+ssep+ssep+ssep); + + counter0[ctx].Update0(); + counter1[c1][ctx].Update0(); + counter2[f][ctx][j].Update0(); + counter2[f][ctx][j+1].Update0(); + ctx+=ctx; } - - c+=c; } c2=c1; c1=ctx-256; - } - int Decode() - { if (c1==c2) ++run; else run=0; + } + + int Get() + { const int f=(run>2); int ctx=1; @@ -298,31 +252,41 @@ struct CM: Encoder const int p2=counter1[c2][ctx].p; const int p=(((p0+p1)*7)+p2+p2)>>4; + // SSE with linear interpolation const int j=p>>12; const int x1=counter2[f][ctx][j].p; const int x2=counter2[f][ctx][j+1].p; const int ssep=x1+(((x2-x1)*(p&4095))>>12); - if (Encoder::DecodeBit((ssep*3)+p)) + if (DecodeBit<18>(p+ssep+ssep+ssep)) { - counter0[ctx].UpdateBit1(); - counter1[c1][ctx].UpdateBit1(); - counter2[f][ctx][j].UpdateBit1(); - counter2[f][ctx][j+1].UpdateBit1(); + counter0[ctx].Update1(); + counter1[c1][ctx].Update1(); + counter2[f][ctx][j].Update1(); + counter2[f][ctx][j+1].Update1(); + ctx+=ctx+1; } else { - counter0[ctx].UpdateBit0(); - counter1[c1][ctx].UpdateBit0(); - counter2[f][ctx][j].UpdateBit0(); - counter2[f][ctx][j+1].UpdateBit0(); + counter0[ctx].Update0(); + counter1[c1][ctx].Update0(); + counter2[f][ctx][j].Update0(); + counter2[f][ctx][j+1].Update0(); + ctx+=ctx; } } c2=c1; - return c1=ctx-256; + c1=ctx-256; + + if (c1==c2) + ++run; + else + run=0; + + return c1; } } cm; @@ -340,28 +304,29 @@ struct CRC r=(r>>1)^(0xEDB88320&-int(r&1)); tab[i]=r; } - - crc=0xFFFFFFFF; + crc=U32(-1); } - void Clear() + U32 operator()() const { - crc=0xFFFFFFFF; + return crc^U32(-1); } - U32 operator()() const + void Update(U8* buf, int n) { - return crc^0xFFFFFFFF; + for (int i=0; 
i>8)^tab[(crc^buf[i])&255]; } - void Update(int c) + void Put(int c) { crc=(crc>>8)^tab[(crc^c)&255]; + putc(c, out); } } crc; template -inline T* mem_alloc(size_t n) +inline T* MemAlloc(size_t n) { T* p=(T*)malloc(n*sizeof(T)); if (!p) @@ -373,11 +338,9 @@ inline T* mem_alloc(size_t n) return p; } -#define mem_free(p) free(p) - -void compress(int level) +void Compress(int level) { - const int config_tab[10]= + const int tab[10]= { 0, 1<<20, // -1 - 1 MB @@ -390,24 +353,31 @@ void compress(int level) 1<<28, // -8 - 256 MB 0x7FFFFFFF, // -9 - ~2 GB }; + int bsize=tab[level]; // Block size - int block_size=config_tab[level]; - - _fseeki64(g_in, 0, SEEK_END); - const S64 file_size=_ftelli64(g_in); - _fseeki64(g_in, 0, SEEK_SET); + if (_fseeki64(in, 0, SEEK_END)) + { + perror("Fseek() failed"); + exit(1); + } + const S64 flen=_ftelli64(in); + if (flen<0) + { + perror("Ftell() failed"); + exit(1); + } + rewind(in); - if (file_size>0 && block_size>file_size) - block_size=int(file_size); + if (bsize>flen) + bsize=int(flen); - U8* buf=mem_alloc(block_size); - int* ptr=mem_alloc(block_size); + U8* buf=MemAlloc(bsize); + int* ptr=MemAlloc(bsize); int n; - while ((n=fread(buf, 1, block_size, g_in))>0) + while ((n=fread(buf, 1, bsize, in))>0) { - for (int i=0; i(n); - cm.EncodeDirectBits<32>(idx); + cm.Put32(n); // Block size + cm.Put32(idx); // BWT index for (int i=0; i %lld\r", _ftelli64(g_in), _ftelli64(g_out)); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(in), _ftelli64(out)); } - cm.EncodeDirectBits<32>(0); // EOF - cm.EncodeDirectBits<32>(crc()); + cm.Put32(0); // EOF + cm.Put32(crc()); // CRC32 cm.Flush(); - mem_free(buf); - mem_free(ptr); + free(buf); + free(ptr); } -void decompress() +void Decompress() { cm.Init(); - int block_size=0; + int bsize=0; U8* buf=NULL; U32* ptr=NULL; int n; - while ((n=cm.DecodeDirectBits<32>())>0) + while ((n=cm.Get32())>0) { - if (block_size==0) + if (!bsize) { - if ((block_size=n)>=(1<<24)) // 5*N - buf=mem_alloc(block_size); + bsize=n; 
+ + if (bsize>=(1<<24)) // 5*N + buf=MemAlloc(bsize); - ptr=mem_alloc(block_size); + ptr=MemAlloc(bsize); } - const int idx=cm.DecodeDirectBits<32>(); - if (n>block_size || idx<1 || idx>n) + const int idx=cm.Get32(); + if (n>bsize || idx<1 || idx>n) { fprintf(stderr, "Corrupt input!\n"); exit(1); } // Inverse BW-transform + if (n>=(1<<24)) // 5*N { - int t[257]={0}; + int cnt[257]={0}; for (int i=0; i=idx); + cnt[i]+=cnt[i-1]; + + for (int i=0; i=idx)]; - crc.Update(c); - putc(c, g_out); + crc.Put(buf[p-(p>=idx)]); } } else // 4*N { - int t[257]={0}; + int cnt[257]={0}; for (int i=0; i=idx))<<8; + cnt[i]+=cnt[i-1]; + + for (int i=0; i>8; - const int c=ptr[p-(p>=idx)]&255; - crc.Update(c); - putc(c, g_out); + crc.Put(ptr[p-(p>=idx)]); } } - fprintf(stderr, "%lld -> %lld\r", _ftelli64(g_in), _ftelli64(g_out)); + fprintf(stderr, "%lld -> %lld\r", _ftelli64(in), _ftelli64(out)); } - if (cm.DecodeDirectBits<32>()!=crc()) + if (cm.Get32()!=crc()) { fprintf(stderr, "CRC error!\n"); exit(1); } - mem_free(buf); - mem_free(ptr); + free(buf); + free(ptr); } int main(int argc, char** argv) @@ -514,8 +491,8 @@ int main(int argc, char** argv) const clock_t start=clock(); int level=4; - bool do_decomp=false; - bool overwrite=false; + int decompress=0; + int overwrite=0; while (argc>1 && *argv[1]=='-') { @@ -531,16 +508,14 @@ int main(int argc, char** argv) case '6': case '7': case '8': -#ifdef _WIN64 case '9': -#endif level=argv[1][i]-'0'; break; case 'd': - do_decomp=true; + decompress=1; break; case 'f': - overwrite=true; + overwrite=1; break; default: fprintf(stderr, "Unknown option: -%c\n", argv[1][i]); @@ -555,54 +530,50 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.40\n" + "BCM - A BWT-based file compressor, v1.50\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" "Options:\n" -#ifdef _WIN64 " -1 .. -9 Set block size to 1 MB .. 2 GB\n" -#else - " -1 .. -8 Set block size to 1 MB .. 
256 MB\n" -#endif " -d Decompress\n" " -f Force overwrite of output file\n"); exit(1); } - g_in=fopen(argv[1], "rb"); - if (!g_in) + in=fopen(argv[1], "rb"); + if (!in) { perror(argv[1]); exit(1); } - char out_name[FILENAME_MAX]; + char ofname[FILENAME_MAX]; if (argc<3) { - strcpy(out_name, argv[1]); - if (do_decomp) + strcpy(ofname, argv[1]); + if (decompress) { - const int p=strlen(out_name)-4; - if (p>0 && strcmp(&out_name[p], ".bcm")==0) - out_name[p]='\0'; + const int p=strlen(ofname)-4; + if (p>0 && !strcmp(&ofname[p], ".bcm")) + ofname[p]='\0'; else - strcat(out_name, ".out"); + strcat(ofname, ".out"); } else - strcat(out_name, ".bcm"); + strcat(ofname, ".bcm"); } else - strcpy(out_name, argv[2]); + strcpy(ofname, argv[2]); if (!overwrite) { - FILE* f=fopen(out_name, "rb"); + FILE* f=fopen(ofname, "rb"); if (f) { fclose(f); - fprintf(stderr, "%s already exists. Overwrite (y/n)? ", out_name); + fprintf(stderr, "%s already exists. Overwrite (y/n)? ", ofname); fflush(stderr); if (getchar()!='y') @@ -613,56 +584,56 @@ int main(int argc, char** argv) } } - if (do_decomp) + if (decompress) { - if (getc(g_in)!=g_magic[0] - ||getc(g_in)!=g_magic[1] - ||getc(g_in)!=g_magic[2] - ||getc(g_in)!=g_magic[3]) + if (getc(in)!=magic[0] + ||getc(in)!=magic[1] + ||getc(in)!=magic[2] + ||getc(in)!=magic[3]) { fprintf(stderr, "%s: Not in BCM format\n", argv[1]); exit(1); } - g_out=fopen(out_name, "wb"); - if (!g_out) + out=fopen(ofname, "wb"); + if (!out) { - perror(out_name); + perror(ofname); exit(1); } fprintf(stderr, "Decompressing %s:\n", argv[1]); - decompress(); + Decompress(); } else { - g_out=fopen(out_name, "wb"); - if (!g_out) + out=fopen(ofname, "wb"); + if (!out) { - perror(out_name); + perror(ofname); exit(1); } - putc(g_magic[0], g_out); - putc(g_magic[1], g_out); - putc(g_magic[2], g_out); - putc(g_magic[3], g_out); + putc(magic[0], out); + putc(magic[1], out); + putc(magic[2], out); + putc(magic[3], out); fprintf(stderr, "Compressing %s:\n", argv[1]); - 
compress(level); + Compress(level); } - fprintf(stderr, "%lld -> %lld in %1.1f sec\n", _ftelli64(g_in), - _ftelli64(g_out), double(clock()-start)/CLOCKS_PER_SEC); + fprintf(stderr, "%lld -> %lld in %1.1f sec\n", + _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); - fclose(g_in); - fclose(g_out); + fclose(in); + fclose(out); #ifndef NO_UTIME struct _stati64 sb; - if (_stati64(argv[1], &sb)!=0) + if (_stati64(argv[1], &sb)) { perror("Stat() failed"); exit(1); @@ -670,7 +641,7 @@ int main(int argc, char** argv) struct utimbuf ub; ub.actime=sb.st_atime; ub.modtime=sb.st_mtime; - if (utime(out_name, &ub)!=0) + if (utime(ofname, &ub)) { perror("Utime() failed"); exit(1); From 940ca919ee1fd6ccf9f57f99a3566e3715c7a1b1 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sun, 15 Mar 2020 12:50:37 +0300 Subject: [PATCH 30/34] Updated to v1.51 --- src/bcm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index 2c23a23..baba4e4 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -530,7 +530,8 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.50\n" + "BCM - A BWT-based file compressor, v1.51\n" + "Written and placed in the public domain by Ilya Muravyov\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" From 6b8ddf3a116b8eecac8e85277e7039ac3bacbf10 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Sun, 15 Mar 2020 12:53:41 +0300 Subject: [PATCH 31/34] Update README.md Delete divsufsort Fixed FORCEINLINE macro for _MSC_VER --- LICENSE | 225 ++++- README.md | 4 +- src/divsufsort.c | 1782 ------------------------------------ src/divsufsort.h | 63 -- src/libsais.c | 2282 ++++++++++++++++++++++++++++++++++++++++++++++ src/libsais.h | 54 ++ 6 files changed, 2539 insertions(+), 1871 deletions(-) delete mode 100644 src/divsufsort.c delete mode 100644 src/divsufsort.h create mode 100644 src/libsais.c create mode 100644 src/libsais.h diff --git a/LICENSE b/LICENSE 
index fdddb29..261eeb9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,24 +1,201 @@ -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. - -For more information, please refer to + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 2080a82..1f36a18 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.50 +# BCM v1.60 ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. All in all, it's like a big brother of the BZIP2. @@ -18,4 +18,4 @@ BCM -9 | 20,789,667 bytes | Ilya Muravyov ### Thanks -Special thanks to Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski, Malcolm Taylor and LovePimple. +Special thanks to Ilya Grebnov, Yuta Mori, Matt Mahoney, Eugene Shelwien, Alexander Rhatushnyak, Przemyslaw Skibinski, Malcolm Taylor and LovePimple. diff --git a/src/divsufsort.c b/src/divsufsort.c deleted file mode 100644 index 9bbac45..0000000 --- a/src/divsufsort.c +++ /dev/null @@ -1,1782 +0,0 @@ -/* - * divsufsort.c for libdivsufsort-lite - * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include -#include -#include -#ifdef _OPENMP -# include -#endif -#include "divsufsort.h" - - -/*- Constants -*/ -#define INLINE __inline -#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) -# undef ALPHABET_SIZE -#endif -#if !defined(ALPHABET_SIZE) -# define ALPHABET_SIZE (256) -#endif -#define BUCKET_A_SIZE (ALPHABET_SIZE) -#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) -#if defined(SS_INSERTIONSORT_THRESHOLD) -# if SS_INSERTIONSORT_THRESHOLD < 1 -# undef SS_INSERTIONSORT_THRESHOLD -# define SS_INSERTIONSORT_THRESHOLD (1) -# endif -#else -# define SS_INSERTIONSORT_THRESHOLD (8) -#endif -#if defined(SS_BLOCKSIZE) -# if SS_BLOCKSIZE < 0 -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (0) -# elif 32768 <= SS_BLOCKSIZE -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (32767) -# endif -#else -# define SS_BLOCKSIZE (1024) -#endif -/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ -#if SS_BLOCKSIZE == 0 -# define SS_MISORT_STACKSIZE (96) -#elif SS_BLOCKSIZE <= 4096 -# define SS_MISORT_STACKSIZE (16) -#else -# define SS_MISORT_STACKSIZE (24) -#endif -#define SS_SMERGE_STACKSIZE (32) -#define TR_INSERTIONSORT_THRESHOLD (8) -#define TR_STACKSIZE (64) - - -/*- Macros -*/ -#ifndef SWAP -# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) -#endif /* SWAP */ -#ifndef MIN -# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) -#endif /* MIN */ -#ifndef MAX -# define MAX(_a, _b) (((_a) > (_b)) ? 
(_a) : (_b)) -#endif /* MAX */ -#define STACK_PUSH(_a, _b, _c, _d)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize++].d = (_d);\ - } while(0) -#define STACK_PUSH5(_a, _b, _c, _d, _e)\ - do {\ - assert(ssize < STACK_SIZE);\ - stack[ssize].a = (_a), stack[ssize].b = (_b),\ - stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ - } while(0) -#define STACK_POP(_a, _b, _c, _d)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ - } while(0) -#define STACK_POP5(_a, _b, _c, _d, _e)\ - do {\ - assert(0 <= ssize);\ - if(ssize == 0) { return; }\ - (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ - (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ - } while(0) -#define BUCKET_A(_c0) bucket_A[(_c0)] -#if ALPHABET_SIZE == 256 -#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) -#else -#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) -#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) -#endif - - -/*- Private Functions -*/ - -static const int lg_table[256]= { - -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -}; - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static INLINE -int -ss_ilg(int n) { -#if SS_BLOCKSIZE == 0 - return (n & 
0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -#elif SS_BLOCKSIZE < 256 - return lg_table[n]; -#else - return (n & 0xff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]; -#endif -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - -#if SS_BLOCKSIZE != 0 - -static const int sqq_table[256] = { - 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, - 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, - 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, -110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, -128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, -143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, -156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, -169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, -181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, -192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, -202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, -212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, -221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, -230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, -239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, -247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 -}; - -static INLINE -int -ss_isqrt(int x) { - int y, e; - - if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } - e = (x & 0xffff0000) ? - ((x & 0xff000000) ? 
- 24 + lg_table[(x >> 24) & 0xff] : - 16 + lg_table[(x >> 16) & 0xff]) : - ((x & 0x0000ff00) ? - 8 + lg_table[(x >> 8) & 0xff] : - 0 + lg_table[(x >> 0) & 0xff]); - - if(e >= 16) { - y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); - if(e >= 24) { y = (y + 1 + x / y) >> 1; } - y = (y + 1 + x / y) >> 1; - } else if(e >= 8) { - y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; - } else { - return sqq_table[x] >> 4; - } - - return (x < (y * y)) ? y - 1 : y; -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Compares two suffixes. */ -static INLINE -int -ss_compare(const unsigned char *T, - const int *p1, const int *p2, - int depth) { - const unsigned char *U1, *U2, *U1n, *U2n; - - for(U1 = T + depth + *p1, - U2 = T + depth + *p2, - U1n = T + *(p1 + 1) + 2, - U2n = T + *(p2 + 1) + 2; - (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); - ++U1, ++U2) { - } - - return U1 < U1n ? - (U2 < U2n ? *U1 - *U2 : 1) : - (U2 < U2n ? 
-1 : 0); -} - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) - -/* Insertionsort for small size groups */ -static -void -ss_insertionsort(const unsigned char *T, const int *PA, - int *first, int *last, int depth) { - int *i, *j; - int t; - int r; - - for(i = last - 2; first <= i; --i) { - for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { - do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); - if(last <= j) { break; } - } - if(r == 0) { *j = ~*j; } - *(j - 1) = t; - } -} - -#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ - - -/*---------------------------------------------------------------------------*/ - -#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) - -static INLINE -void -ss_fixdown(const unsigned char *Td, const int *PA, - int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = Td[PA[SA[k = j++]]]; - if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - ss_fixdown(Td, PA, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. 
*/ -static INLINE -int * -ss_median3(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3) { - int *t; - if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } - if(Td[PA[*v2]] > Td[PA[*v3]]) { - if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. */ -static INLINE -int * -ss_median5(const unsigned char *Td, const int *PA, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } - if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } - if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } - if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } - if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } - if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } - return v3; -} - -/* Returns the pivot element. */ -static INLINE -int * -ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return ss_median3(Td, PA, first, middle, last - 1); - } else { - t >>= 2; - return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = ss_median3(Td, PA, first, first + t, first + (t << 1)); - middle = ss_median3(Td, PA, middle - t, middle, middle + t); - last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); - return ss_median3(Td, PA, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -/* Binary partition for substrings. 
*/ -static INLINE -int * -ss_partition(const int *PA, - int *first, int *last, int depth) { - int *a, *b; - int t; - for(a = first - 1, b = last;;) { - for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } - for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } - if(b <= a) { break; } - t = ~*b; - *b = *a; - *a = t; - } - if(first < a) { *first = ~*first; } - return a; -} - -/* Multikey introsort for medium size groups. */ -static -void -ss_mintrosort(const unsigned char *T, const int *PA, - int *first, int *last, - int depth) { -#define STACK_SIZE SS_MISORT_STACKSIZE - struct { int *a, *b, c; int d; } stack[STACK_SIZE]; - const unsigned char *Td; - int *a, *b, *c, *d, *e, *f; - int s, t; - int ssize; - int limit; - int v, x = 0; - - for(ssize = 0, limit = ss_ilg(last - first);;) { - - if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { -#if 1 < SS_INSERTIONSORT_THRESHOLD - if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } -#endif - STACK_POP(first, last, depth, limit); - continue; - } - - Td = T + depth; - if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } - if(limit < 0) { - for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { - if((x = Td[PA[*a]]) != v) { - if(1 < (a - first)) { break; } - v = x; - first = a; - } - } - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, a, depth); - } - if((a - first) <= (last - a)) { - if(1 < (a - first)) { - STACK_PUSH(a, last, depth, -1); - last = a, depth += 1, limit = ss_ilg(a - first); - } else { - first = a, limit = -1; - } - } else { - if(1 < (last - a)) { - STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); - first = a, limit = -1; - } else { - last = a, depth += 1, limit = ss_ilg(a - first); - } - } - continue; - } - - /* choose pivot */ - a = ss_pivot(Td, PA, first, last); - v = Td[PA[*a]]; - SWAP(*first, *a); - - /* partition */ - for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < 
last) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - - a = first + (b - a), c = last - (d - c); - b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth); - - if((a - first) <= (last - c)) { - if((last - c) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(c, last, depth, limit); - last = a; - } else if((a - first) <= (c - b)) { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - last = a; - } else { - STACK_PUSH(c, last, depth, limit); - STACK_PUSH(first, a, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } else { - if((a - first) <= (c - b)) { - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - STACK_PUSH(first, a, depth, limit); - first = c; - } else if((last - c) <= (c - b)) { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); - first = c; - } else { - STACK_PUSH(first, a, depth, limit); - STACK_PUSH(c, last, depth, limit); - first = b, last = c, depth += 1, limit = ss_ilg(c - b); - } - } - } else { - limit += 1; - if(Td[PA[*first] - 1] < v) { - first = ss_partition(PA, first, last, depth); - limit = ss_ilg(last - first); - } - depth += 1; - } - } -#undef STACK_SIZE -} - -#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ - - 
-/*---------------------------------------------------------------------------*/ - -#if SS_BLOCKSIZE != 0 - -static INLINE -void -ss_blockswap(int *a, int *b, int n) { - int t; - for(; 0 < n; --n, ++a, ++b) { - t = *a, *a = *b, *b = t; - } -} - -static INLINE -void -ss_rotate(int *first, int *middle, int *last) { - int *a, *b, t; - int l, r; - l = middle - first, r = last - middle; - for(; (0 < l) && (0 < r);) { - if(l == r) { ss_blockswap(first, middle, l); break; } - if(l < r) { - a = last - 1, b = middle - 1; - t = *a; - do { - *a-- = *b, *b-- = *a; - if(b < first) { - *a = t; - last = a; - if((r -= l + 1) <= l) { break; } - a -= 1, b = middle - 1; - t = *a; - } - } while(1); - } else { - a = first, b = middle; - t = *a; - do { - *a++ = *b, *b++ = *a; - if(last <= b) { - *a = t; - first = a + 1; - if((l -= r + 1) <= r) { break; } - a += 1, b = middle; - t = *a; - } - } while(1); - } - } -} - - -/*---------------------------------------------------------------------------*/ - -static -void -ss_inplacemerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int depth) { - const int *p; - int *a, *b; - int len, half; - int q, r; - int x; - - for(;;) { - if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } - else { x = 0; p = PA + *(last - 1); } - for(a = first, len = middle - first, half = len >> 1, r = -1; - 0 < len; - len = half, half >>= 1) { - b = a + half; - q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); - if(q < 0) { - a = b + 1; - half -= (len & 1) ^ 1; - } else { - r = q; - } - } - if(a < middle) { - if(r == 0) { *a = ~*a; } - ss_rotate(a, middle, last); - last -= middle - a; - middle = a; - if(first == middle) { break; } - } - --last; - if(x != 0) { while(*--last < 0) { } } - if(middle == last) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Merge-forward with internal buffer. 
*/ -static -void -ss_mergeforward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - int *a, *b, *c, *bufend; - int t; - int r; - - bufend = buf + (middle - first) - 1; - ss_blockswap(buf, first, middle - first); - - for(t = *(a = first), b = buf, c = middle;;) { - r = ss_compare(T, PA + *b, PA + *c, depth); - if(r < 0) { - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - } else if(r > 0) { - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } else { - *c = ~*c; - do { - *a++ = *b; - if(bufend <= b) { *bufend = t; return; } - *b++ = *a; - } while(*b < 0); - - do { - *a++ = *c, *c++ = *a; - if(last <= c) { - while(b < bufend) { *a++ = *b, *b++ = *a; } - *a = *b, *b = t; - return; - } - } while(*c < 0); - } - } -} - -/* Merge-backward with internal buffer. */ -static -void -ss_mergebackward(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int depth) { - const int *p1, *p2; - int *a, *b, *c, *bufend; - int t; - int r; - int x; - - bufend = buf + (last - middle) - 1; - ss_blockswap(buf, middle, last - middle); - - x = 0; - if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } - else { p1 = PA + *bufend; } - if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } - else { p2 = PA + *(middle - 1); } - for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { - r = ss_compare(T, p1, p2, depth); - if(0 < r) { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = *b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - } else if(r < 0) { - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - 
else { p2 = PA + *c; } - } else { - if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } - *a-- = ~*b; - if(b <= buf) { *buf = t; break; } - *b-- = *a; - if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } - *a-- = *c, *c-- = *a; - if(c < first) { - while(buf < b) { *a-- = *b, *b-- = *a; } - *a = *b, *b = t; - break; - } - if(*b < 0) { p1 = PA + ~*b; x |= 1; } - else { p1 = PA + *b; } - if(*c < 0) { p2 = PA + ~*c; x |= 2; } - else { p2 = PA + *c; } - } - } -} - -/* D&C based merge. */ -static -void -ss_swapmerge(const unsigned char *T, const int *PA, - int *first, int *middle, int *last, - int *buf, int bufsize, int depth) { -#define STACK_SIZE SS_SMERGE_STACKSIZE -#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) -#define MERGE_CHECK(a, b, c)\ - do {\ - if(((c) & 1) ||\ - (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ - *(a) = ~*(a);\ - }\ - if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ - *(b) = ~*(b);\ - }\ - } while(0) - struct { int *a, *b, *c; int d; } stack[STACK_SIZE]; - int *l, *r, *lm, *rm; - int m, len, half; - int ssize; - int check, next; - - for(check = 0, ssize = 0;;) { - if((last - middle) <= bufsize) { - if((first < middle) && (middle < last)) { - ss_mergebackward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - if((middle - first) <= bufsize) { - if(first < middle) { - ss_mergeforward(T, PA, first, middle, last, buf, depth); - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - continue; - } - - for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; - 0 < len; - len = half, half >>= 1) { - if(ss_compare(T, PA + GETIDX(*(middle + m + half)), - PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { - m += half + 1; - half -= (len & 1) ^ 1; - } - } - - if(0 < m) { - lm = middle - m, rm = middle + m; - ss_blockswap(lm, 
middle, m); - l = r = middle, next = 0; - if(rm < last) { - if(*rm < 0) { - *rm = ~*rm; - if(first < lm) { for(; *--l < 0;) { } next |= 4; } - next |= 1; - } else if(first < lm) { - for(; *r < 0; ++r) { } - next |= 2; - } - } - - if((l - first) <= (last - r)) { - STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); - middle = lm, last = l, check = (check & 3) | (next & 4); - } else { - if((next & 2) && (r == middle)) { next ^= 6; } - STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); - first = r, middle = rm, check = (next & 3) | (check & 4); - } - } else { - if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { - *middle = ~*middle; - } - MERGE_CHECK(first, last, check); - STACK_POP(first, middle, last, check); - } - } -#undef STACK_SIZE -} - -#endif /* SS_BLOCKSIZE != 0 */ - - -/*---------------------------------------------------------------------------*/ - -/* Substring sort */ -static -void -sssort(const unsigned char *T, const int *PA, - int *first, int *last, - int *buf, int bufsize, - int depth, int n, int lastsuffix) { - int *a; -#if SS_BLOCKSIZE != 0 - int *b, *middle, *curbuf; - int j, k, curbufsize, limit; -#endif - int i; - - if(lastsuffix != 0) { ++first; } - -#if SS_BLOCKSIZE == 0 - ss_mintrosort(T, PA, first, last, depth); -#else - if((bufsize < SS_BLOCKSIZE) && - (bufsize < (last - first)) && - (bufsize < (limit = ss_isqrt(last - first)))) { - if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } - buf = middle = last - limit, bufsize = limit; - } else { - middle = last, limit = 0; - } - for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); -#endif - curbufsize = last - (a + SS_BLOCKSIZE); - curbuf = a + SS_BLOCKSIZE; - if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } - for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= 
k, k <<= 1, j >>= 1) { - ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); - } - } -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, a, middle, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, a, middle, depth); -#endif - for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { - if(i & 1) { - ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); - a -= k; - } - } - if(limit != 0) { -#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE - ss_mintrosort(T, PA, middle, last, depth); -#elif 1 < SS_BLOCKSIZE - ss_insertionsort(T, PA, middle, last, depth); -#endif - ss_inplacemerge(T, PA, first, middle, last, depth); - } -#endif - - if(lastsuffix != 0) { - /* Insert last type B* suffix. */ - int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; - for(a = first, i = *(first - 1); - (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); - ++a) { - *(a - 1) = *a; - } - *(a - 1) = i; - } -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -int -tr_ilg(int n) { - return (n & 0xffff0000) ? - ((n & 0xff000000) ? - 24 + lg_table[(n >> 24) & 0xff] : - 16 + lg_table[(n >> 16) & 0xff]) : - ((n & 0x0000ff00) ? - 8 + lg_table[(n >> 8) & 0xff] : - 0 + lg_table[(n >> 0) & 0xff]); -} - - -/*---------------------------------------------------------------------------*/ - -/* Simple insertionsort for small size groups. 
*/ -static -void -tr_insertionsort(const int *ISAd, int *first, int *last) { - int *a, *b; - int t, r; - - for(a = first + 1; a < last; ++a) { - for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { - do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); - if(b < first) { break; } - } - if(r == 0) { *b = ~*b; } - *(b + 1) = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -void -tr_fixdown(const int *ISAd, int *SA, int i, int size) { - int j, k; - int v; - int c, d, e; - - for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { - d = ISAd[SA[k = j++]]; - if(d < (e = ISAd[SA[j]])) { k = j; d = e; } - if(d <= c) { break; } - } - SA[i] = v; -} - -/* Simple top-down heapsort. */ -static -void -tr_heapsort(const int *ISAd, int *SA, int size) { - int i, m; - int t; - - m = size; - if((size % 2) == 0) { - m--; - if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } - } - - for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } - if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } - for(i = m - 1; 0 < i; --i) { - t = SA[0], SA[0] = SA[i]; - tr_fixdown(ISAd, SA, 0, i); - SA[i] = t; - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Returns the median of three elements. */ -static INLINE -int * -tr_median3(const int *ISAd, int *v1, int *v2, int *v3) { - int *t; - if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } - if(ISAd[*v2] > ISAd[*v3]) { - if(ISAd[*v1] > ISAd[*v3]) { return v1; } - else { return v3; } - } - return v2; -} - -/* Returns the median of five elements. 
*/ -static INLINE -int * -tr_median5(const int *ISAd, - int *v1, int *v2, int *v3, int *v4, int *v5) { - int *t; - if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } - if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } - if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } - if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } - if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } - if(ISAd[*v3] > ISAd[*v4]) { return v4; } - return v3; -} - -/* Returns the pivot element. */ -static INLINE -int * -tr_pivot(const int *ISAd, int *first, int *last) { - int *middle; - int t; - - t = last - first; - middle = first + t / 2; - - if(t <= 512) { - if(t <= 32) { - return tr_median3(ISAd, first, middle, last - 1); - } else { - t >>= 2; - return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); - } - } - t >>= 3; - first = tr_median3(ISAd, first, first + t, first + (t << 1)); - middle = tr_median3(ISAd, middle - t, middle, middle + t); - last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); - return tr_median3(ISAd, first, middle, last); -} - - -/*---------------------------------------------------------------------------*/ - -typedef struct _trbudget_t trbudget_t; -struct _trbudget_t { - int chance; - int remain; - int incval; - int count; -}; - -static INLINE -void -trbudget_init(trbudget_t *budget, int chance, int incval) { - budget->chance = chance; - budget->remain = budget->incval = incval; -} - -static INLINE -int -trbudget_check(trbudget_t *budget, int size) { - if(size <= budget->remain) { budget->remain -= size; return 1; } - if(budget->chance == 0) { budget->count += size; return 0; } - budget->remain += budget->incval - size; - budget->chance -= 1; - return 1; -} - - -/*---------------------------------------------------------------------------*/ - -static INLINE -void -tr_partition(const int *ISAd, - int *first, int *middle, int *last, - int **pa, int **pb, int v) { - int *a, *b, *c, *d, *e, *f; - int t, s; - int x = 0; - - for(b = middle - 1; 
(++b < last) && ((x = ISAd[*b]) == v);) { } - if(((a = b) < last) && (x < v)) { - for(; (++b < last) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - } - for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } - if((b < (d = c)) && (x > v)) { - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - for(; b < c;) { - SWAP(*b, *c); - for(; (++b < c) && ((x = ISAd[*b]) <= v);) { - if(x == v) { SWAP(*b, *a); ++a; } - } - for(; (b < --c) && ((x = ISAd[*c]) >= v);) { - if(x == v) { SWAP(*c, *d); --d; } - } - } - - if(a <= d) { - c = b - 1; - if((s = a - first) > (t = b - a)) { s = t; } - for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - if((s = d - c) > (t = last - d - 1)) { s = t; } - for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } - first += (b - a), last -= (d - c); - } - *pa = first, *pb = last; -} - -static -void -tr_copy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - /* sort suffixes of middle partition - by using sorted order of suffixes of left and right partition. 
*/ - int *c, *d, *e; - int s, v; - - v = b - SA - 1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - ISA[s] = d - SA; - } - } - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - ISA[s] = d - SA; - } - } -} - -static -void -tr_partialcopy(int *ISA, const int *SA, - int *first, int *a, int *b, int *last, - int depth) { - int *c, *d, *e; - int s, v; - int rank, lastrank, newrank = -1; - - v = b - SA - 1; - lastrank = -1; - for(c = first, d = a - 1; c <= d; ++c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *++d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } - - lastrank = -1; - for(e = d; first <= e; --e) { - rank = ISA[*e]; - if(lastrank != rank) { lastrank = rank; newrank = e - SA; } - if(newrank != rank) { ISA[*e] = newrank; } - } - - lastrank = -1; - for(c = last - 1, e = d + 1, d = b; e < d; --c) { - if((0 <= (s = *c - depth)) && (ISA[s] == v)) { - *--d = s; - rank = ISA[s + depth]; - if(lastrank != rank) { lastrank = rank; newrank = d - SA; } - ISA[s] = newrank; - } - } -} - -static -void -tr_introsort(int *ISA, const int *ISAd, - int *SA, int *first, int *last, - trbudget_t *budget) { -#define STACK_SIZE TR_STACKSIZE - struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE]; - int *a, *b, *c; - int t; - int v, x = 0; - int incr = ISAd - ISA; - int limit, next; - int ssize, trlink = -1; - - for(ssize = 0, limit = tr_ilg(last - first);;) { - - if(limit < 0) { - if(limit == -1) { - /* tandem repeat partition */ - tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); - - /* update ranks */ - if(a < last) { - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - } - if(b < last) { - for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } - } - - /* push */ - if(1 < (b - a)) { - STACK_PUSH5(NULL, a, b, 0, 0); - STACK_PUSH5(ISAd - incr, 
first, last, -2, trlink); - trlink = ssize - 2; - } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); - last = a, limit = tr_ilg(a - first); - } else if(1 < (last - b)) { - first = b, limit = tr_ilg(last - b); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); - first = b, limit = tr_ilg(last - b); - } else if(1 < (a - first)) { - last = a, limit = tr_ilg(a - first); - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else if(limit == -2) { - /* tandem repeat copy */ - a = stack[--ssize].b, b = stack[ssize].c; - if(stack[ssize].d == 0) { - tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); - } - STACK_POP5(ISAd, first, last, limit, trlink); - } else { - /* sorted partition */ - if(0 <= *first) { - a = first; - do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); - first = a; - } - if(first < last) { - a = first; do { *a = ~*a; } while(*++a < 0); - next = (ISA[*a] != ISAd[*a]) ? 
tr_ilg(a - first + 1) : -1; - if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } - - /* push */ - if(trbudget_check(budget, a - first)) { - if((a - first) <= (last - a)) { - STACK_PUSH5(ISAd, a, last, -3, trlink); - ISAd += incr, last = a, limit = next; - } else { - if(1 < (last - a)) { - STACK_PUSH5(ISAd + incr, first, a, next, trlink); - first = a, limit = -3; - } else { - ISAd += incr, last = a, limit = next; - } - } - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - if(1 < (last - a)) { - first = a, limit = -3; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - continue; - } - - if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { - tr_insertionsort(ISAd, first, last); - limit = -3; - continue; - } - - if(limit-- == 0) { - tr_heapsort(ISAd, first, last - first); - for(a = last - 1; first < a; a = b) { - for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } - } - limit = -3; - continue; - } - - /* choose pivot */ - a = tr_pivot(ISAd, first, last); - SWAP(*first, *a); - v = ISAd[*first]; - - /* partition */ - tr_partition(ISAd, first, first + 1, last, &a, &b, v); - if((last - first) != (b - a)) { - next = (ISA[*a] != v) ? 
tr_ilg(b - a) : -1; - - /* update ranks */ - for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } - if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } - - /* push */ - if((1 < (b - a)) && (trbudget_check(budget, b - a))) { - if((a - first) <= (last - b)) { - if((last - b) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } else if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((a - first) <= (b - a)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, b, last, limit, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - if((a - first) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - last = a; - } else { - ISAd += incr, first = a, last = b, limit = next; - } - } else if((last - b) <= (b - a)) { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd + incr, a, b, next, trlink); - first = b; - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } else { - STACK_PUSH5(ISAd, first, a, limit, trlink); - STACK_PUSH5(ISAd, b, last, limit, trlink); - ISAd += incr, first = a, last = b, limit = next; - } - } - } else { - if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } - if((a - first) <= (last - b)) { - if(1 < (a - first)) { - STACK_PUSH5(ISAd, b, last, limit, trlink); - last = a; - } 
else if(1 < (last - b)) { - first = b; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } else { - if(1 < (last - b)) { - STACK_PUSH5(ISAd, first, a, limit, trlink); - first = b; - } else if(1 < (a - first)) { - last = a; - } else { - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } - } else { - if(trbudget_check(budget, last - first)) { - limit = tr_ilg(last - first), ISAd += incr; - } else { - if(0 <= trlink) { stack[trlink].d = -1; } - STACK_POP5(ISAd, first, last, limit, trlink); - } - } - } -#undef STACK_SIZE -} - - - -/*---------------------------------------------------------------------------*/ - -/* Tandem repeat sort */ -static -void -trsort(int *ISA, int *SA, int n, int depth) { - int *ISAd; - int *first, *last; - trbudget_t budget; - int t, skip, unsorted; - - trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); -/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ - for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { - first = SA; - skip = 0; - unsorted = 0; - do { - if((t = *first) < 0) { first -= t; skip += t; } - else { - if(skip != 0) { *(first + skip) = skip; skip = 0; } - last = SA + ISA[t] + 1; - if(1 < (last - first)) { - budget.count = 0; - tr_introsort(ISA, ISAd, SA, first, last, &budget); - if(budget.count != 0) { unsorted += budget.count; } - else { skip = first - last; } - } else if((last - first) == 1) { - skip = -1; - } - first = last; - } - } while(first < (SA + n)); - if(skip != 0) { *(first + skip) = skip; } - if(unsorted == 0) { break; } - } -} - - -/*---------------------------------------------------------------------------*/ - -/* Sorts suffixes of type B*. */ -static -int -sort_typeBstar(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n) { - int *PAb, *ISAb, *buf; -#ifdef _OPENMP - int *curbuf; - int l; -#endif - int i, j, k, t, m, bufsize; - int c0, c1; -#ifdef _OPENMP - int d0, d1; - int tmp; -#endif - - /* Initialize bucket arrays. 
*/ - for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } - for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } - - /* Count the number of occurrences of the first one or two characters of each - type A, B and B* suffix. Moreover, store the beginning position of all - type B* suffixes into the array SA. */ - for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { - /* type A suffix. */ - do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); - if(0 <= i) { - /* type B* suffix. */ - ++BUCKET_BSTAR(c0, c1); - SA[--m] = i; - /* type B suffix. */ - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { - ++BUCKET_B(c0, c1); - } - } - } - m = n - m; -/* -note: - A type B* suffix is lexicographically smaller than a type B suffix that - begins with the same first two characters. -*/ - - /* Calculate the index of start/end point of each bucket. */ - for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { - t = i + BUCKET_A(c0); - BUCKET_A(c0) = i + j; /* start point */ - i = t + BUCKET_B(c0, c0); - for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { - j += BUCKET_BSTAR(c0, c1); - BUCKET_BSTAR(c0, c1) = j; /* end point */ - i += BUCKET_B(c0, c1); - } - } - - if(0 < m) { - /* Sort the type B* suffixes by their first two characters. */ - PAb = SA + n - m; ISAb = SA + m; - for(i = m - 2; 0 <= i; --i) { - t = PAb[i], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = i; - } - t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; - SA[--BUCKET_BSTAR(c0, c1)] = m - 1; - - /* Sort the type B* substrings using sssort. 
*/ -#ifdef _OPENMP - tmp = omp_get_max_threads(); - buf = SA + m, bufsize = (n - (2 * m)) / tmp; - c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; -#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp) - { - tmp = omp_get_thread_num(); - curbuf = buf + tmp * bufsize; - k = 0; - for(;;) { - #pragma omp critical(sssort_lock) - { - if(0 < (l = j)) { - d0 = c0, d1 = c1; - do { - k = BUCKET_BSTAR(d0, d1); - if(--d1 <= d0) { - d1 = ALPHABET_SIZE - 1; - if(--d0 < 0) { break; } - } - } while(((l - k) <= 1) && (0 < (l = k))); - c0 = d0, c1 = d1, j = k; - } - } - if(l == 0) { break; } - sssort(T, PAb, SA + k, SA + l, - curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); - } - } -#else - buf = SA + m, bufsize = n - (2 * m); - for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { - for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { - i = BUCKET_BSTAR(c0, c1); - if(1 < (j - i)) { - sssort(T, PAb, SA + i, SA + j, - buf, bufsize, 2, n, *(SA + i) == (m - 1)); - } - } - } -#endif - - /* Compute ranks of type B* substrings. */ - for(i = m - 1; 0 <= i; --i) { - if(0 <= SA[i]) { - j = i; - do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); - SA[i + 1] = i - j; - if(i <= 0) { break; } - } - j = i; - do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); - ISAb[SA[i]] = j; - } - - /* Construct the inverse suffix array of type B* suffixes using trsort. */ - trsort(ISAb, SA, m, 1); - - /* Set the sorted order of tyoe B* suffixes. */ - for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } - if(0 <= i) { - t = i; - for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } - SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; - } - } - - /* Calculate the index of start/end point of each bucket. 
*/ - BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ - for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { - i = BUCKET_A(c0 + 1) - 1; - for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { - t = i - BUCKET_B(c0, c1); - BUCKET_B(c0, c1) = i; /* end point */ - - /* Move all type B* suffixes to the correct position. */ - for(i = t, j = BUCKET_BSTAR(c0, c1); - j <= k; - --i, --k) { SA[i] = SA[k]; } - } - BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ - BUCKET_B(c0, c0) = i; /* end point */ - } - } - - return m; -} - -/* Constructs the suffix array by using the sorted order of type B* suffixes. */ -static -void -construct_SA(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n, int m) { - int *i, *j, *k; - int s; - int c0, c1, c2; - - if(0 < m) { - /* Construct the sorted order of type B suffixes by using - the sorted order of type B* suffixes. */ - for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { - /* Scan the suffix array from right to left. */ - for(i = SA + BUCKET_BSTAR(c1, c1 + 1), - j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; - i <= j; - --j) { - if(0 < (s = *j)) { - assert(T[s] == c1); - assert(((s + 1) < n) && (T[s] <= T[s + 1])); - assert(T[s - 1] <= T[s]); - *j = ~s; - c0 = T[--s]; - if((0 < s) && (T[s - 1] > c0)) { s = ~s; } - if(c0 != c2) { - if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } - k = SA + BUCKET_B(c2 = c0, c1); - } - assert(k < j); - *k-- = s; - } else { - assert(((s == 0) && (T[s] == c1)) || (s < 0)); - *j = ~s; - } - } - } - } - - /* Construct the suffix array by using - the sorted order of type B suffixes. */ - k = SA + BUCKET_A(c2 = T[n - 1]); - *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); - /* Scan the suffix array from left to right. 
*/ - for(i = SA, j = SA + n; i < j; ++i) { - if(0 < (s = *i)) { - assert(T[s - 1] >= T[s]); - c0 = T[--s]; - if((s == 0) || (T[s - 1] < c0)) { s = ~s; } - if(c0 != c2) { - BUCKET_A(c2) = k - SA; - k = SA + BUCKET_A(c2 = c0); - } - assert(i < k); - *k++ = s; - } else { - assert(s < 0); - *i = ~s; - } - } -} - -/* Constructs the burrows-wheeler transformed string directly - by using the sorted order of type B* suffixes. */ -static -int -construct_BWT(const unsigned char *T, int *SA, - int *bucket_A, int *bucket_B, - int n, int m) { - int *i, *j, *k, *orig; - int s; - int c0, c1, c2; - - if(0 < m) { - /* Construct the sorted order of type B suffixes by using - the sorted order of type B* suffixes. */ - for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { - /* Scan the suffix array from right to left. */ - for(i = SA + BUCKET_BSTAR(c1, c1 + 1), - j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; - i <= j; - --j) { - if(0 < (s = *j)) { - assert(T[s] == c1); - assert(((s + 1) < n) && (T[s] <= T[s + 1])); - assert(T[s - 1] <= T[s]); - c0 = T[--s]; - *j = ~((int)c0); - if((0 < s) && (T[s - 1] > c0)) { s = ~s; } - if(c0 != c2) { - if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } - k = SA + BUCKET_B(c2 = c0, c1); - } - assert(k < j); - *k-- = s; - } else if(s != 0) { - *j = ~s; -#ifndef NDEBUG - } else { - assert(T[s] == c1); -#endif - } - } - } - } - - /* Construct the BWTed string by using - the sorted order of type B suffixes. */ - k = SA + BUCKET_A(c2 = T[n - 1]); - *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1); - /* Scan the suffix array from left to right. 
*/ - for(i = SA, j = SA + n, orig = SA; i < j; ++i) { - if(0 < (s = *i)) { - assert(T[s - 1] >= T[s]); - c0 = T[--s]; - *i = c0; - if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); } - if(c0 != c2) { - BUCKET_A(c2) = k - SA; - k = SA + BUCKET_A(c2 = c0); - } - assert(i < k); - *k++ = s; - } else if(s != 0) { - *i = ~s; - } else { - orig = i; - } - } - - return orig - SA; -} - - -/*---------------------------------------------------------------------------*/ - -/*- Function -*/ - -int -divsufsort(const unsigned char *T, int *SA, int n) { - int *bucket_A, *bucket_B; - int m; - int err = 0; - - /* Check arguments. */ - if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } - else if(n == 0) { return 0; } - else if(n == 1) { SA[0] = 0; return 0; } - else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } - - bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); - bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); - - /* Suffixsort. */ - if((bucket_A != NULL) && (bucket_B != NULL)) { - m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); - construct_SA(T, SA, bucket_A, bucket_B, n, m); - } else { - err = -2; - } - - free(bucket_B); - free(bucket_A); - - return err; -} - -int -divbwt(const unsigned char *T, unsigned char *U, int *A, int n) { - int *B; - int *bucket_A, *bucket_B; - int m, pidx, i; - - /* Check arguments. */ - if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } - else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } - - if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); } - bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int)); - bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int)); - - /* Burrows-Wheeler Transform. */ - if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { - m = sort_typeBstar(T, B, bucket_A, bucket_B, n); - pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); - - /* Copy to output string. 
*/ - U[0] = T[n - 1]; - for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; } - for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; } - pidx += 1; - } else { - pidx = -2; - } - - free(bucket_B); - free(bucket_A); - if(A == NULL) { free(B); } - - return pidx; -} diff --git a/src/divsufsort.h b/src/divsufsort.h deleted file mode 100644 index 8d8952e..0000000 --- a/src/divsufsort.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * divsufsort.h for libdivsufsort-lite - * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef _DIVSUFSORT_H -#define _DIVSUFSORT_H 1 - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - - -/*- Prototypes -*/ - -/** - * Constructs the suffix array of a given string. - * @param T[0..n-1] The input string. - * @param SA[0..n-1] The output array of suffixes. - * @param n The length of the given string. 
- * @return 0 if no error occurred, -1 or -2 otherwise. - */ -int -divsufsort(const unsigned char *T, int *SA, int n); - -/** - * Constructs the burrows-wheeler transformed string of a given string. - * @param T[0..n-1] The input string. - * @param U[0..n-1] The output string. (can be T) - * @param A[0..n-1] The temporary array. (can be NULL) - * @param n The length of the given string. - * @return The primary index if no error occurred, -1 or -2 otherwise. - */ -int -divbwt(const unsigned char *T, unsigned char *U, int *A, int n); - - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ - -#endif /* _DIVSUFSORT_H */ diff --git a/src/libsais.c b/src/libsais.c new file mode 100644 index 0000000..b917c1f --- /dev/null +++ b/src/libsais.c @@ -0,0 +1,2282 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. 
+ +--*/ + +#include +#include +#include +#include +#include + +#include "libsais.h" + +#define INT_BIT (32) +#define ALPHABET_SIZE (1 << CHAR_BIT) +#define SUFFIX_GROUP_BIT (INT_BIT - 1) +#define SUFFIX_GROUP_MARKER (1 << (SUFFIX_GROUP_BIT - 1)) + +#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) +#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) + +#if defined(__GNUC__) || defined(__clang__) + #define RESTRICT __restrict__ + #define FORCEINLINE inline __attribute__((__always_inline__)) +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define RESTRICT __restrict + #define FORCEINLINE __forceinline +#else + #error Your compiler, configuration or platform is not supported. +#endif + +#if defined(__has_builtin) + #if __has_builtin(__builtin_prefetch) + #define HAS_BUILTIN_PREFECTCH + #endif +#elif defined(__GNUC__) && __GNUC__ > 3 + #define HAS_BUILTIN_PREFECTCH +#endif + +#if defined(HAS_BUILTIN_PREFECTCH) + #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) + #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) +#elif defined (_M_IX86) || defined (_M_AMD64) + #include + #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) + #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) +#elif defined (_M_ARM) + #include + #define libsais_prefetch(address) __prefetch((const void *)(address)) + #define libsais_prefetchw(address) __prefetchw((const void *)(address)) +#elif defined (_M_ARM64) + #include + #define libsais_prefetch(address) __prefetch2((const void *)(address), 1) + #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17) +#else + #error Your compiler, configuration or platform is not supported. 
+#endif + +static FORCEINLINE void * libsais_align_up(const void * address, size_t alignment) +{ + return (void *)((((intptr_t)address) + ((intptr_t)alignment) - 1) & (-((intptr_t)alignment))); +} + +static FORCEINLINE void * libsais_aligned_malloc(size_t size, size_t alignment) +{ + void * address = malloc(size + sizeof(short) + alignment - 1); + if (address != NULL) + { + void * aligned_address = libsais_align_up((void *)((intptr_t)address + (intptr_t)(sizeof(short))), alignment); + ((short *)aligned_address)[-1] = (short)((intptr_t)aligned_address - (intptr_t)address); + + return aligned_address; + } + + return NULL; +} + +static FORCEINLINE void libsais_aligned_free(void * aligned_address) +{ + if (aligned_address != NULL) + { + free((void *)((intptr_t)aligned_address - ((short *)aligned_address)[-1])); + } +} + +static int libsais_gather_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 128; + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static int libsais_gather_lms_suffixes_32s(const int * RESTRICT T, int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + 
ptrdiff_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + } + + return n - 1 - m; +} + +static int libsais_gather_compacted_lms_suffixes_32s(const int * RESTRICT T, int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + } + + return n - 1 - m; +} + +static void libsais_count_lms_suffixes_32s_2k(const int * RESTRICT T, int n, int k, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(int)); + + int i = n - 2; + size_t s 
= 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; +} + +static int libsais_count_and_gather_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 128; + + memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(int)); + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= 3; i -= 4) + { + libsais_prefetch(&T[i - prefetch_distance]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + 
(size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]++; + + return n - 1 - m; +} + +static int libsais_count_and_gather_lms_suffixes_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + memset(buckets, 0, 4 * (size_t)k * sizeof(int)); + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + } + + for (; i >= 0; i -= 
1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; + } + + buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]++; + + return n - 1 - m; +} + +static int libsais_count_and_gather_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(int)); + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); + buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; + + return n - 1 - m; +} + +static int 
libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + memset(buckets, 0, 2 * (size_t)k * sizeof(int)); + + int i = n - 2; + int m = n - 1; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & INT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & INT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & INT_MAX, 0)]); + libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & INT_MAX, 0)]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c0 >=0)); + c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); + c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); + c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; + } + + c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; + + return n - 1 - m; +} + +static void libsais_count_suffixes_32s(const int * RESTRICT T, int 
n, int k, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + memset(buckets, 0, (size_t)k * sizeof(int)); + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&T[i + prefetch_distance]); + + buckets[T[i + 0]]++; + buckets[T[i + 1]]++; + buckets[T[i + 2]]++; + buckets[T[i + 3]]++; + buckets[T[i + 4]]++; + buckets[T[i + 5]]++; + buckets[T[i + 6]]++; + buckets[T[i + 7]]++; + } + + for (j += 7; i < j; i += 1) + { + buckets[T[i]]++; + } +} + +static void libsais_initialize_buckets_start_and_end_8u(int * RESTRICT buckets) +{ + int * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; + int * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + ptrdiff_t i, j; int sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_6k(int k, int * RESTRICT buckets) +{ + int * RESTRICT bucket_start = &buckets[4 * k]; + int * RESTRICT bucket_end = &buckets[5 * k]; + + ptrdiff_t i, j; int sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_start_and_end_32s_4k(int k, int * RESTRICT buckets) +{ + int * RESTRICT bucket_start = &buckets[2 * k]; + int * RESTRICT bucket_end = &buckets[3 * k]; + + ptrdiff_t i, j; int sum = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum; + sum += 
buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + bucket_end[j] = sum; + } +} + +static void libsais_initialize_buckets_end_32s_2k(int k, int * RESTRICT buckets) +{ + ptrdiff_t i; int sum0 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + } +} + +static void libsais_initialize_buckets_start_32s_1k(int k, int * RESTRICT buckets) +{ + ptrdiff_t i; int sum = 0; + for (i = 0; i <= (ptrdiff_t)k - 1; i += 1) { int tmp = buckets[i]; buckets[i] = sum; sum += tmp; } +} + +static void libsais_initialize_buckets_end_32s_1k(int k, int * RESTRICT buckets) +{ + ptrdiff_t i; int sum = 0; + for (i = 0; i <= (ptrdiff_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } +} + +static void libsais_initialize_buckets_start_and_end_32s_2k(int k, int * RESTRICT buckets) +{ + ptrdiff_t i, j; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + buckets[j] = buckets[i]; + } + + buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(int)); +} + +static int libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const unsigned char * RESTRICT T, int * RESTRICT buckets, int first_lms_suffix) +{ + { + size_t s = 0; + ptrdiff_t c0 = T[first_lms_suffix]; + ptrdiff_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]--; + } + + { + int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + ptrdiff_t i, j; int sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + 
BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) +{ + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + ptrdiff_t i; int sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) + { + sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; + + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static int libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) +{ + { + size_t s = 0; + ptrdiff_t c0 = T[first_lms_suffix]; + ptrdiff_t c1 = 0; + + for (; --first_lms_suffix >= 0; ) + { + c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]--; + } + + buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]--; + } + + { + int * RESTRICT temp_bucket = &buckets[4 * k]; + + ptrdiff_t i, j; int sum = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; + } + + return sum; + } +} + +static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) +{ + int * RESTRICT bucket_start = &buckets[2 * k]; + int * RESTRICT bucket_end = &buckets[3 * k]; + + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 
0)]++; + buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; + + ptrdiff_t i, j; int sum0 = 0, sum1 = 0; + for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) + { + bucket_start[j] = sum1; + + sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; + sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; + buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; + + bucket_end[j] = sum1; + } +} + +static void libsais_radix_sort_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int m, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + + ptrdiff_t i, j; + for (i = (ptrdiff_t)n - 1, j = (ptrdiff_t)n - (ptrdiff_t)m + prefetch_distance + 3; i > j; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + int p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + int p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + int p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + int p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= prefetch_distance + 3; i > j; i -= 1) + { + int p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static void libsais_radix_sort_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int m, int * RESTRICT induction_bucket) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i, j; + for (i = (ptrdiff_t)n - 1, j = (ptrdiff_t)n - (ptrdiff_t)m + 2 * prefetch_distance + 3; i > j; i -= 4) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); + 
libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); + + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); + libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); + + int p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; + int p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; + int p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; + int p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; + } + + for (j -= 2 * prefetch_distance + 3; i > j; i -= 1) + { + int p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; + } +} + +static int libsais_radix_sort_lms_suffixes_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int i = n - 2; + int m = 0; + size_t s = 1; + ptrdiff_t c0 = T[n - 1]; + ptrdiff_t c1 = 0; + ptrdiff_t c2 = 0; + + for (; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&T[i - 2 * prefetch_distance]); + + libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]); + libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]); + + c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } + + c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } + + c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); + if ((s & 3) 
== 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } + + c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } + } + + for (; i >= 0; i -= 1) + { + c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); + if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } + } + + if (m > 1) + { + SA[buckets[c2]] = 0; + } + + return m; +} + +static void libsais_radix_sort_set_markers_32s(int * RESTRICT SA, int k, int * RESTRICT induction_bucket, int marker) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)k - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); + + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); + libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); + + SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= marker; + SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= marker; + SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= marker; + SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= marker; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= marker; + } +} + +static void libsais_initialize_buckets_for_partial_sorting_8u(const unsigned char * RESTRICT T, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) +{ + int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + buckets[BUCKETS_INDEX4((size_t)T[first_lms_suffix], 1)]++; + + ptrdiff_t i, j; int sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += 
BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) +{ + int * RESTRICT temp_bucket = &buckets[4 * k]; + + buckets[BUCKETS_INDEX4((size_t)T[first_lms_suffix], 1)]++; + + ptrdiff_t i, j; int sum0 = left_suffixes_count + 1, sum1 = 0; + for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) + { + temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; + + sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; + sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; + + buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; + buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; + } +} + +static int libsais_partial_sorting_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int left_suffixes_count, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; + int * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | INT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)left_suffixes_count - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i + prefetch_distance + 0] & INT_MAX] - 2); + libsais_prefetch(&T[SA[i + prefetch_distance + 1] & INT_MAX] - 1); + 
libsais_prefetch(&T[SA[i + prefetch_distance + 1] & INT_MAX] - 2); + + int p0 = SA[i + 0]; d += (p0 < 0); p0 &= INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); + SA[induction_bucket[v0]++] = (p0 - 1) | ((distinct_names[v0] != d) << (INT_BIT - 1)); distinct_names[v0] = d; + + int p1 = SA[i + 1]; d += (p1 < 0); p1 &= INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); + SA[induction_bucket[v1]++] = (p1 - 1) | ((distinct_names[v1] != d) << (INT_BIT - 1)); distinct_names[v1] = d; + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +static int libsais_partial_sorting_scan_left_to_right_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int left_suffixes_count, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[4 * k]; + int * RESTRICT distinct_names = &buckets[2 * k]; + + SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | INT_MIN; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)left_suffixes_count - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & INT_MAX] - 2); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & INT_MAX] - 2); + + int p0 = SA[i + prefetch_distance + 0] & INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - (p0 > 0)], 0); + libsais_prefetchw(&induction_bucket[v0]); libsais_prefetchw(&distinct_names[v0]); + + int p1 = SA[i + prefetch_distance + 1] & 
INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - (p1 > 0)], 0); + libsais_prefetchw(&induction_bucket[v1]); libsais_prefetchw(&distinct_names[v1]); + + int p2 = SA[i + 0]; d += (p2 < 0); p2 &= INT_MAX; int v2 = BUCKETS_INDEX2(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); + SA[induction_bucket[v2]++] = (p2 - 1) | ((distinct_names[v2] != d) << (INT_BIT - 1)); distinct_names[v2] = d; + + int p3 = SA[i + 1]; d += (p3 < 0); p3 &= INT_MAX; int v3 = BUCKETS_INDEX2(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); + SA[induction_bucket[v3]++] = (p3 - 1) | ((distinct_names[v3] != d) << (INT_BIT - 1)); distinct_names[v3] = d; + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); + SA[induction_bucket[v]++] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +static int libsais_partial_sorting_scan_left_to_right_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[2 * k]; + int * RESTRICT distinct_names = &buckets[0 * k]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)) | SUFFIX_GROUP_MARKER; + distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const ptrdiff_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const ptrdiff_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; + if (p0 > 0) + { + SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); + SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((T[p0 - 2] < T[p0 - 1]) << (INT_BIT - 1)) | ((distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; + if (p1 > 0) + { + SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); + SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((T[p1 - 2] < T[p1 - 1]) << (INT_BIT - 1)) | ((distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; SA[i] = p & INT_MAX; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); + SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((T[p - 2] < T[p - 1]) << (INT_BIT - 1)) | ((distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +static void libsais_partial_sorting_scan_left_to_right_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT induction_bucket) +{ + const ptrdiff_t prefetch_distance = 32; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); + + ptrdiff_t i, j; + for (i = 0, j 
= (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((T[p0 - 2] < T[p0 - 1]) << (INT_BIT - 1)); } + int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((T[p1 - 2] < T[p1 - 1]) << (INT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((T[p - 2] < T[p - 1]) << (INT_BIT - 1)); } + } +} + +static void libsais_partial_sorting_shift_markers_8u(int * RESTRICT SA, const int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + const int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; + + ptrdiff_t c; + for (c = BUCKETS_INDEX2(UCHAR_MAX, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + ptrdiff_t i, j; int s = INT_MIN; + for (i = (ptrdiff_t)temp_bucket[c] - 1, j = (ptrdiff_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + int p0 = SA[i - 0], q0 = (p0 & INT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + int p1 = SA[i - 1], q1 = (p1 & INT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + int p2 = SA[i - 2], q2 = (p2 & INT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + int p3 = SA[i - 3], q3 = 
(p3 & INT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + int p = SA[i], q = (p & INT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_6k(int * RESTRICT SA, int k, const int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + const int * RESTRICT temp_bucket = &buckets[4 * k]; + + ptrdiff_t c; + for (c = BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) + { + ptrdiff_t i, j; int s = INT_MIN; + for (i = (ptrdiff_t)temp_bucket[c] - 1, j = (ptrdiff_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + int p0 = SA[i - 0], q0 = (p0 & INT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; + int p1 = SA[i - 1], q1 = (p1 & INT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; + int p2 = SA[i - 2], q2 = (p2 & INT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; + int p3 = SA[i - 3], q3 = (p3 & INT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (j -= 3; i >= j; i -= 1) + { + int p = SA[i], q = (p & INT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; + } + } +} + +static void libsais_partial_sorting_shift_markers_32s_4k(int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i; int s = SUFFIX_GROUP_MARKER; + for (i = (ptrdiff_t)n - 1; i >= 3; i -= 4) + { + libsais_prefetchw(&SA[i - prefetch_distance]); + + int p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; + int p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; + int p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; + int p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; + } + + for (; i >= 0; i -= 1) 
+ { + int p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; + } +} + +static int libsais_partial_sorting_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; + int * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; + + ptrdiff_t i, j; + for (i = (ptrdiff_t)n - (ptrdiff_t)first_lms_suffix - 1, j = (ptrdiff_t)left_suffixes_count + 1 + prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 0] & INT_MAX] - 2); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i - prefetch_distance - 1] & INT_MAX] - 2); + + int p0 = SA[i - 0]; d += (p0 < 0); p0 &= INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[v0]] = (p0 - 1) | ((distinct_names[v0] != d) << (INT_BIT - 1)); distinct_names[v0] = d; + + int p1 = SA[i - 1]; d += (p1 < 0); p1 &= INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[v1]] = (p1 - 1) | ((distinct_names[v1] != d) << (INT_BIT - 1)); distinct_names[v1] = d; + } + + for (j -= prefetch_distance + 1; i >= j; i -= 1) + { + int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +static int libsais_partial_sorting_scan_right_to_left_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT 
induction_bucket = &buckets[0 * k]; + int * RESTRICT distinct_names = &buckets[2 * k]; + + ptrdiff_t i, j; + for (i = (ptrdiff_t)n - (ptrdiff_t)first_lms_suffix - 1, j = (ptrdiff_t)left_suffixes_count + 1 + 2 * prefetch_distance + 1; i >= j; i -= 2) + { + libsais_prefetch(&SA[i - 3 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & INT_MAX] - 2); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & INT_MAX] - 1); + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & INT_MAX] - 2); + + int p0 = SA[i - prefetch_distance - 0] & INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - (p0 > 0)], 0); + libsais_prefetchw(&induction_bucket[v0]); libsais_prefetchw(&distinct_names[v0]); + + int p1 = SA[i - prefetch_distance - 1] & INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - (p1 > 0)], 0); + libsais_prefetchw(&induction_bucket[v1]); libsais_prefetchw(&distinct_names[v1]); + + int p2 = SA[i - 0]; d += (p2 < 0); p2 &= INT_MAX; int v2 = BUCKETS_INDEX2(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); + SA[--induction_bucket[v2]] = (p2 - 1) | ((distinct_names[v2] != d) << (INT_BIT - 1)); distinct_names[v2] = d; + + int p3 = SA[i - 1]; d += (p3 < 0); p3 &= INT_MAX; int v3 = BUCKETS_INDEX2(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); + SA[--induction_bucket[v3]] = (p3 - 1) | ((distinct_names[v3] != d) << (INT_BIT - 1)); distinct_names[v3] = d; + } + + for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) + { + int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[v]] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; + } + + return d; +} + +static int libsais_partial_sorting_scan_right_to_left_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int d) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[3 * k]; + int * RESTRICT distinct_names = 
&buckets[0 * k]; + + ptrdiff_t i; + for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const ptrdiff_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } + int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const ptrdiff_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } + + int p0 = SA[i - 0]; + if (p0 > 0) + { + SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); + SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((T[p0 - 2] > T[p0 - 1]) << (INT_BIT - 1)) | ((distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; + } + + int p1 = SA[i - 1]; + if (p1 > 0) + { + SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); + SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((T[p1 - 2] > T[p1 - 1]) << (INT_BIT - 1)) | ((distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; + } + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; + if (p > 0) + { + SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); + SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((T[p - 2] > T[p - 1]) << (INT_BIT - 1)) | 
((distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; + } + } + + return d; +} + +/* Right-to-left pass over SA[n-1..0]. Every entry p > 0 is cleared to 0 in place and (p - 1) is written to SA[--induction_bucket[T[p - 1]]], with the top bit (INT_BIT - 1) set when T[p - 2] > T[p - 1]. The unrolled loop handles two entries per step and prefetches SA, T and induction_bucket for entries 32 and 64 slots further down; the second loop finishes the remaining low indices one entry at a time. The parameter k is not referenced in the body. */ static void libsais_partial_sorting_scan_right_to_left_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT induction_bucket) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i; + for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + int p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((T[p0 - 2] > T[p0 - 1]) << (INT_BIT - 1)); } + int p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((T[p1 - 2] > T[p1 - 1]) << (INT_BIT - 1)); } + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((T[p - 2] > T[p - 1]) << (INT_BIT - 1)); } + } +} + +/* Walks SA[0..n) in increasing order and compacts toward the front: each value s is written to SA[l] as (s - SUFFIX_GROUP_MARKER) & ~SUFFIX_GROUP_MARKER, and l advances only when s < 0, so only negative entries are kept (later writes overwrite the slots of non-negative ones). Unrolled four entries per step with a scalar tail loop. */ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k(int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i, j, l; + for (i = 0, j = (ptrdiff_t)n - 3, l = 0; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + int s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); + int s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); + int s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); + int s3 =
SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + int s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); + } +} + +/* Compacts the negative entries of SA[0..n) to the front of SA in their original order, storing each with the sign bit cleared (s & INT_MAX); the write cursor l advances only when s < 0. Processes four entries per iteration, then a scalar tail loop. */ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k(int * RESTRICT SA, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i, j, l; + for (i = 0, j = (ptrdiff_t)n - 3, l = 0; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + int s0 = SA[i + 0]; SA[l] = s0 & INT_MAX; l += (s0 < 0); + int s1 = SA[i + 1]; SA[l] = s1 & INT_MAX; l += (s1 < 0); + int s2 = SA[i + 2]; SA[l] = s2 & INT_MAX; l += (s2 < 0); + int s3 = SA[i + 3]; SA[l] = s3 & INT_MAX; l += (s3 < 0); + } + + for (j += 3; i < j; i += 1) + { + int s = SA[i]; SA[l] = s & INT_MAX; l += (s < 0); + } +} + +/* Partial-order induction driver for the 8-bit text variant: zeroes 2 * ALPHABET_SIZE ints starting at buckets[2 * ALPHABET_SIZE], runs the left-to-right scan with d starting at 0, shifts the markers, then runs the right-to-left scan seeded with the d returned by the first scan. */ static void libsais_induce_partial_order_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) +{ + memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(int)); + + int d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, n, buckets, left_suffixes_count, 0); + libsais_partial_sorting_shift_markers_8u(SA, buckets); + libsais_partial_sorting_scan_right_to_left_8u(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d); +} + +/* Partial-order induction driver for the _32s_6k variant: zeroes 2 * k ints starting at buckets[2 * k], runs the left-to-right scan with d starting at 0, shifts the markers, then runs the right-to-left scan seeded with the d returned by the first scan. */ static void libsais_induce_partial_order_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) +{ + memset(&buckets[2 * k], 0, 2 * (size_t)k * sizeof(int)); + + int d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, n, k, buckets, left_suffixes_count, 0); + libsais_partial_sorting_shift_markers_32s_6k(SA, k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d); +} + +/* Partial-order induction driver for the _32s_4k variant: zeroes the first 2 * k ints of buckets, then runs the left-to-right scan (d starting at 0), the marker shift, the right-to-left scan (seeded with the returned d) and finally the _32s_4k LMS gather over SA. */ static void libsais_induce_partial_order_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int
k, int * RESTRICT buckets) +{ + memset(buckets, 0, 2 * (size_t)k * sizeof(int)); + + int d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, n, k, buckets, 0); + libsais_partial_sorting_shift_markers_32s_4k(SA, n); + libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, n, k, buckets, d); + libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, n); +} + +/* Partial-order induction using two k-sized parts of buckets: the left-to-right _1k scan is given &buckets[1 * k], the right-to-left _1k scan is given &buckets[0 * k], and the _1k LMS gather then runs over SA. */ static void libsais_induce_partial_order_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, n, k, &buckets[1 * k]); + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, n, k, &buckets[0 * k]); + libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, n); +} + +/* Partial-order induction with a single bucket array that is rebuilt before each scan: count suffixes and initialize bucket starts before the left-to-right scan, count suffixes again and initialize bucket ends before the right-to-left scan, then gather the LMS suffixes. */ static void libsais_induce_partial_order_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, n, k, buckets); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, n, k, buckets); + + libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, n); +} + +/* Assigns names to the first m entries of SA and returns the final name counter. SAm = &SA[m] is zeroed for n >> 1 ints; for each p = SA[i] the slot SAm[(p & INT_MAX) >> 1] receives name | INT_MIN, and name is incremented when p < 0. If name < m, the negative slots of SAm are then packed downward from index n - 1 with the sign bit cleared; otherwise the sign bit is cleared on SA[0..m). */ static int libsais_renumber_and_gather_lms_suffixes_8u(int * RESTRICT SA, int n, int m) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT SAm = &SA[m]; + + memset(SAm, 0, ((size_t)n >> 1) * sizeof(int)); + + ptrdiff_t i, j; int name = 0; + for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & INT_MAX) >> 1]); + + int p0 = SA[i + 0];
SAm[(p0 & INT_MAX) >> 1] = name | INT_MIN; name += p0 < 0; + int p1 = SA[i + 1]; SAm[(p1 & INT_MAX) >> 1] = name | INT_MIN; name += p1 < 0; + int p2 = SA[i + 2]; SAm[(p2 & INT_MAX) >> 1] = name | INT_MIN; name += p2 < 0; + int p3 = SA[i + 3]; SAm[(p3 & INT_MAX) >> 1] = name | INT_MIN; name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + int p = SA[i]; SAm[(p & INT_MAX) >> 1] = name | INT_MIN; name += p < 0; + } + + if (name < m) + { + ptrdiff_t l; + for (i = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 1, j = (ptrdiff_t)m + 3, l = (ptrdiff_t)n - 1; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + int s0 = SA[i - 0]; SA[l] = s0 & INT_MAX; l -= s0 < 0; + int s1 = SA[i - 1]; SA[l] = s1 & INT_MAX; l -= s1 < 0; + int s2 = SA[i - 2]; SA[l] = s2 & INT_MAX; l -= s2 < 0; + int s3 = SA[i - 3]; SA[l] = s3 & INT_MAX; l -= s3 < 0; + } + + for (j -= 3; i >= j; i -= 1) + { + int s = SA[i]; SA[l] = s & INT_MAX; l -= s < 0; + } + } + else + { + for (i = 0; i < m; i += 1) { SA[i] &= INT_MAX; } + } + + return name; +} + +static int libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(int * RESTRICT SA, int n, int m) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT SAm = &SA[m]; + + memset(SAm, 0, ((size_t)n >> 1) * sizeof(int)); + + ptrdiff_t i, j; int p0, p1, p2, p3 = -1, name = 1; + for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & INT_MAX) >> 1]); + libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & INT_MAX) >> 1]); + + p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & INT_MAX) >> 1] = name | (p0 & p3 & INT_MIN); name += p0 < 0; + p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & INT_MAX) >> 1] = name | (p1 & p0 & INT_MIN); name += p1 < 0; + p2 = SA[i + 2]; 
SAm[(SA[i + 2] = p2 & INT_MAX) >> 1] = name | (p2 & p1 & INT_MIN); name += p2 < 0; + p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & INT_MAX) >> 1] = name | (p3 & p2 & INT_MIN); name += p3 < 0; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & INT_MAX) >> 1] = name | (p3 & p2 & INT_MIN); name += p3 < 0; + } + + if (name <= m) + { + p3 = -1; + for (i = m, j = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | INT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | INT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | INT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + } + + return name - 1; +} + +static int libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(int * RESTRICT T, int * RESTRICT SA, int n, int m) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT SAm = &SA[m]; + + { + libsais_gather_lms_suffixes_32s(T, SA, n); + + memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(int)); + + ptrdiff_t i, j; + for (i = (ptrdiff_t)n - (ptrdiff_t)m, j = (ptrdiff_t)n - 1 - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 3]) >> 1]); + + SAm[((unsigned int)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + INT_MIN; + SAm[((unsigned int)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + INT_MIN; + SAm[((unsigned int)SA[i + 2]) >> 1] = SA[i + 3] - 
SA[i + 2] + 1 + INT_MIN; + SAm[((unsigned int)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + INT_MIN; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SAm[((unsigned int)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + INT_MIN; + } + + SAm[((unsigned int)SA[n - 1]) >> 1] = 1 + INT_MIN; + } + + { + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)(n >> 1) - 3; i < j; i += 4) + { + libsais_prefetchw(&SAm[i + prefetch_distance]); + + SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & INT_MAX; + SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & INT_MAX; + SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & INT_MAX; + SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & INT_MAX; + } + + for (j += 3; i < j; i += 1) + { + SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & INT_MAX; + } + } + + int name = 1; + + { + ptrdiff_t i, j, p = SA[0], plen = SAm[p >> 1]; int pdiff = INT_MIN; + for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetch(&SA[i + 2 * prefetch_distance]); + + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((unsigned int)SA[i + prefetch_distance + 0])]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((unsigned int)SA[i + prefetch_distance + 1])]); + + ptrdiff_t q = SA[i + 0], qlen = SAm[q >> 1]; int qdiff = INT_MIN; + if (plen == qlen) { ptrdiff_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (l - qlen) & INT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = INT_MIN; + if (qlen == plen) { ptrdiff_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (l - plen) & INT_MIN; } + SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + ptrdiff_t q = SA[i], qlen = SAm[q >> 1]; int qdiff = INT_MIN; + if (plen == qlen) { ptrdiff_t l = 0; do { if (T[p + l] != T[q + l]) { break; } 
} while (++l < plen); qdiff = (l - plen) & INT_MIN; } + SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); + + p = q; plen = qlen; pdiff = qdiff; + } + + SAm[p >> 1] = name | pdiff; name++; + } + + if (name <= m) + { + ptrdiff_t i, j; int p0, p1, p2, p3 = -1; + for (i = m, j = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + prefetch_distance]); + + p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | INT_MAX); p0 = (p0 == 0) ? p3 : p0; + p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | INT_MAX); p1 = (p1 == 0) ? p0 : p1; + p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | INT_MAX); p2 = (p2 == 0) ? p1 : p2; + p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + + for (j += 3; i < j; i += 1) + { + p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; + } + } + + return name - 1; +} + +/* Replaces each of the first m entries of SA by a lookup through the table SAnm = &SA[n - m], i.e. SA[i] = SAnm[SA[i]]. The main loop handles four entries per iteration and prefetches the table slots needed 32 entries ahead; the tail loop handles the remaining entries one at a time. */ static void libsais_reconstruct_lms_suffixes(int * RESTRICT SA, int n, int m) +{ + const ptrdiff_t prefetch_distance = 32; + + const int * RESTRICT SAnm = &SA[n - m]; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); + libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); + + SA[i + 0] = SAnm[SA[i + 0]]; + SA[i + 1] = SAnm[SA[i + 1]]; + SA[i + 2] = SAnm[SA[i + 2]]; + SA[i + 3] = SAnm[SA[i + 3]]; + } + + for (j += prefetch_distance + 3; i < j; i += 1) + { + SA[i] = SAnm[SA[i]]; + } +} + +/* Moves the first m entries of SA to per-symbol positions, walking c from UCHAR_MAX - 1 down to 0. The group size l is buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] minus buckets[BUCKETS_INDEX2(c, 1)]; each non-empty group (taken from the tail of the not-yet-placed entries) is moved so that it ends at bucket_end[c], with bucket_end = &buckets[7 * ALPHABET_SIZE]. The gap up to the previously placed group is zeroed, and SA[0..j) is zeroed at the end. */ static void libsais_place_lms_suffixes_interval_8u(int * RESTRICT SA, int n, int m, const int * RESTRICT buckets) +{ + const int * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; + + ptrdiff_t c, j = n; + for (c = UCHAR_MAX - 1; c >= 0; --c) + { + ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; +
if (l > 0) + { + ptrdiff_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Moves the first m entries of SA to per-symbol positions for c = k - 2 down to 0. The group size l is buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] minus buckets[BUCKETS_INDEX2(c, 1)]; each non-empty group is moved so that it ends at bucket_end[c] (bucket_end = &buckets[3 * k]), the gap up to the previously placed group is zeroed, and SA[0..j) is zeroed at the end. */ static void libsais_place_lms_suffixes_interval_32s_4k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) +{ + const int * RESTRICT bucket_end = &buckets[3 * k]; + + ptrdiff_t c, j = n; + for (c = (ptrdiff_t)k - 2; c >= 0; --c) + { + ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + ptrdiff_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Same placement scheme with c stepping over BUCKETS_INDEX2 slots from symbol k - 2 down to symbol 0: the group size is buckets[c + BUCKETS_INDEX2(1, 1)] minus buckets[c + BUCKETS_INDEX2(0, 1)] and the group end position is read from buckets[c]. Gaps between placed groups and the leading SA[0..j) are zeroed. */ static void libsais_place_lms_suffixes_interval_32s_2k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) +{ + ptrdiff_t c, j = n; + for (c = BUCKETS_INDEX2((ptrdiff_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + ptrdiff_t l = (ptrdiff_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (ptrdiff_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + ptrdiff_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Places the first m entries of SA back to front while tracking the current symbol c (initially k - 1) and a write cursor l (initially buckets[k - 1]). When the symbol T[p] of an entry differs from c, the slots from buckets[T[p]] up to l are zeroed and l restarts at buckets[T[p]]; the entry is then stored at SA[--l]. After both loops SA[0..l) is zeroed. The parameter n is not referenced in the body. */ static void libsais_place_lms_suffixes_interval_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int c = k - 1; ptrdiff_t i, l = buckets[c]; + for (i = (ptrdiff_t)m - 1; i >= prefetch_distance + 3; i -= 4) + { + libsais_prefetch(&SA[i - 2 * prefetch_distance]); + + libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); +
libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); + libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); + + int p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p0; + int p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p1; + int p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p2; + int p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p3; + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p; + } + + memset(&SA[0], 0, (size_t)l * sizeof(int)); +} + +/* Moves the first m entries of SA to per-symbol positions for c = k - 2 down to 0, reading the group size l directly from buckets[BUCKETS_INDEX4(c, 1)] and the group end position from bucket_end[c] (bucket_end = &buckets[5 * k]). Each non-empty group is moved to end at that position, the gap up to the previously placed group is zeroed, and SA[0..j) is zeroed at the end. */ static void libsais_place_lms_suffixes_histogram_32s_6k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) +{ + const int * RESTRICT bucket_end = &buckets[5 * k]; + + ptrdiff_t c, j = n; + for (c = (ptrdiff_t)k - 2; c >= 0; --c) + { + ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX4(c, 1)]; + if (l > 0) + { + ptrdiff_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Moves the first m entries of SA to per-symbol positions for c = k - 2 down to 0, reading the group size l from buckets[BUCKETS_INDEX2(c, 1)] and the group end position from bucket_end[c] (bucket_end = &buckets[3 * k]). Each non-empty group is moved to end at that position, the gap up to the previously placed group is zeroed, and SA[0..j) is zeroed at the end. */ static void libsais_place_lms_suffixes_histogram_32s_4k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) +{ + const int * RESTRICT bucket_end = &buckets[3 * k]; + + ptrdiff_t c, j = n; + for (c = (ptrdiff_t)k - 2; c >= 0; --c) + { + ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; + if (l > 0) + { + ptrdiff_t i = bucket_end[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -=
(int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Moves the first m entries of SA to per-symbol positions, with c stepping over BUCKETS_INDEX2 slots from symbol k - 2 down to symbol 0. The group size l is buckets[c + BUCKETS_INDEX2(0, 1)] and the group end position is buckets[c]; each non-empty group is moved to end there, the gap up to the previously placed group is zeroed, and SA[0..j) is zeroed at the end. */ static void libsais_place_lms_suffixes_histogram_32s_2k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) +{ + ptrdiff_t c, j = n; + for (c = BUCKETS_INDEX2((ptrdiff_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) + { + ptrdiff_t l = (ptrdiff_t)buckets[c + BUCKETS_INDEX2(0, 1)]; + if (l > 0) + { + ptrdiff_t i = buckets[c]; + if (j - i > 0) + { + memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); + } + + memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); + } + } + + memset(&SA[0], 0, (size_t)j * sizeof(int)); +} + +/* Left-to-right final pass used in BWT mode, with induction_bucket = &buckets[6 * ALPHABET_SIZE]. It first stores n - 1 (top bit set when T[n - 2] < T[n - 1]) at the bucket cursor for T[n - 1], then walks SA[0..n): every entry has its top bit cleared; for an entry p > 0 the slot is overwritten with T[p - 1] | INT_MIN and p - 1 is appended at induction_bucket[T[p - 1]]++, with the top bit set when the symbol before it is smaller than T[p - 1]. */ static void libsais_final_bwt_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[6 * ALPHABET_SIZE]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + int s0 = SA[i + prefetch_distance + 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i + prefetch_distance + 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ?
Ts1 : NULL); + + int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | INT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } + int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | INT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[i] = T[p] | INT_MIN; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[6 * ALPHABET_SIZE]; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 2 * prefetch_distance]); + + int s0 = SA[i + prefetch_distance + 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i + prefetch_distance + 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + int p0 = SA[i + 0]; SA[i + 0] = p0 ^ INT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } + int p1 = SA[i + 1]; SA[i + 1] = p1 ^ INT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } + } + + for (j += prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; SA[i] = p ^ INT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_left_to_right_32s(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT induction_bucket) +{ + const ptrdiff_t prefetch_distance = 32; + + SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) + { + libsais_prefetchw(&SA[i + 3 * prefetch_distance]); + + int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + int p0 = SA[i + 0]; SA[i + 0] = p0 ^ INT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } + int p1 = SA[i + 1]; SA[i + 1] = p1 ^ INT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } + } + + for (j += 2 * prefetch_distance + 1; i < j; i += 1) + { + int p = SA[i]; SA[i] = p ^ INT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } + } +} + +static int libsais_final_bwt_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[7 * ALPHABET_SIZE]; + + ptrdiff_t i; int index = -1; + for (i = (ptrdiff_t)n - 1; i >= prefetch_distance + 1; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + int s0 = SA[i - prefetch_distance - 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i - prefetch_distance - 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); + + int p0 = SA[i - 0]; index = (p0 == 0) ? (int)(i - 0) : index; + SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; unsigned char c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } + + int p1 = SA[i - 1]; index = (p1 == 0) ? 
(int)(i - 1) : index; + SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; unsigned char c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; index = (p == 0) ? (int)i : index; + SA[i] = p & INT_MAX; if (p > 0) { p--; unsigned char c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } + } + + return index; +} + +static void libsais_final_sorting_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) +{ + const ptrdiff_t prefetch_distance = 32; + + int * RESTRICT induction_bucket = &buckets[7 * ALPHABET_SIZE]; + + ptrdiff_t i; + for (i = (ptrdiff_t)n - 1; i >= prefetch_distance + 1; i -= 2) + { + libsais_prefetchw(&SA[i - 2 * prefetch_distance]); + + int s0 = SA[i - prefetch_distance - 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i - prefetch_distance - 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + + int p0 = SA[i - 0]; SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (((T[p0 - (p0 > 0)] > T[p0])) << (INT_BIT - 1)); } + int p1 = SA[i - 1]; SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (((T[p1 - (p1 > 0)] > T[p1])) << (INT_BIT - 1)); } + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (((T[p - (p > 0)] > T[p])) << (INT_BIT - 1)); } + } +} + +static void libsais_final_sorting_scan_right_to_left_32s(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT induction_bucket) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i; + for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) + { + libsais_prefetchw(&SA[i - 3 * prefetch_distance]); + + int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); + int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); + int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } + int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } + + int p0 = SA[i - 0]; SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (((T[p0 - (p0 > 0)] > T[p0])) << (INT_BIT - 1)); } + int p1 = SA[i - 1]; SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (((T[p1 - (p1 > 0)] > T[p1])) << (INT_BIT - 1)); } + } + + for (; i >= 0; i -= 1) + { + int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (((T[p - (p > 0)] > T[p])) << (INT_BIT - 1)); } + } +} + +/* Final induction for the 8-bit text variant. When bwt is non-zero the two BWT scans run and the value returned by the right-to-left BWT scan is passed back to the caller; otherwise the two plain sorting scans run and 0 is returned. */ static int libsais_induce_final_order_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int bwt, int * RESTRICT buckets) +{ + if (bwt) + { + libsais_final_bwt_scan_left_to_right_8u(T, SA, n, buckets); + return libsais_final_bwt_scan_right_to_left_8u(T, SA, n, buckets); + } + else + { + libsais_final_sorting_scan_left_to_right_8u(T, SA, n, buckets); + libsais_final_sorting_scan_right_to_left_8u(T, SA, n, buckets); + return 0; + } +} + +/* Final induction for the _32s_6k variant: the left-to-right scan is given &buckets[4 * k] and the right-to-left scan is given &buckets[5 * k]. */ static void libsais_induce_final_order_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_final_sorting_scan_left_to_right_32s(T, SA, n, &buckets[4 * k]); + libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[5 * k]); +} + +/* Final induction for the _32s_4k variant: the left-to-right scan is given &buckets[2 * k] and the right-to-left scan is given &buckets[3 * k]. */ static void libsais_induce_final_order_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_final_sorting_scan_left_to_right_32s(T, SA, n, &buckets[2 * k]); + libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[3 * k]); +} + +/* Final induction for the _32s_2k variant: the left-to-right scan is given &buckets[1 * k] and the right-to-left scan is given &buckets[0 * k]. */ static void libsais_induce_final_order_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_final_sorting_scan_left_to_right_32s(T,
SA, n, &buckets[1 * k]); + libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[0 * k]); +} + +/* Final induction with a single bucket array that is rebuilt before each scan: suffixes are counted and bucket starts initialized before the left-to-right scan, then counted again and bucket ends initialized before the right-to-left scan. */ static void libsais_induce_final_order_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) +{ + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_start_32s_1k(k, buckets); + libsais_final_sorting_scan_left_to_right_32s(T, SA, n, buckets); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_final_sorting_scan_right_to_left_32s(T, SA, n, buckets); +} + +/* Returns f, the number of entries p among SA[0..m) whose slot SAm[p >> 1] (SAm = &SA[m]) is negative when visited. For such an entry T[p] gets its top bit set, f is incremented and the slot becomes i + INT_MIN; any other slot is reduced by the current f. A second pass walks SA[m + (n >> 1) - 1] down to SA[m]: negative values are packed downward from index m - 1 with the sign bit cleared, positive values minus 1 are packed downward from index n + fs - 1, and finally f ints are copied from SA[l + 1] to SA[n + fs - m]. */ static int libsais_compact_lms_suffixes_32s(int * RESTRICT T, int * RESTRICT SA, int n, int m, int fs) +{ + const ptrdiff_t prefetch_distance = 32; + + int f = 0; + + { + int * RESTRICT SAm = &SA[m]; + + int i, j; + for (i = 0, j = m - 2 * (int)prefetch_distance - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + 3 * prefetch_distance]); + + libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 0]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 1]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 2]) >> 1]); + libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 3]) >> 1]); + + unsigned int q0 = (unsigned int)SA[i + prefetch_distance + 0]; const int * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); + unsigned int q1 = (unsigned int)SA[i + prefetch_distance + 1]; const int * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); + unsigned int q2 = (unsigned int)SA[i + prefetch_distance + 2]; const int * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); + unsigned int q3 = (unsigned int)SA[i + prefetch_distance + 3]; const int * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ?
Tq3 : NULL); + + unsigned int p0 = (unsigned int)SA[i + 0]; int s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= INT_MIN; f++; s0 = i + 0 + INT_MIN + f; } SAm[p0 >> 1] = s0 - f; + unsigned int p1 = (unsigned int)SA[i + 1]; int s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= INT_MIN; f++; s1 = i + 1 + INT_MIN + f; } SAm[p1 >> 1] = s1 - f; + unsigned int p2 = (unsigned int)SA[i + 2]; int s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= INT_MIN; f++; s2 = i + 2 + INT_MIN + f; } SAm[p2 >> 1] = s2 - f; + unsigned int p3 = (unsigned int)SA[i + 3]; int s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= INT_MIN; f++; s3 = i + 3 + INT_MIN + f; } SAm[p3 >> 1] = s3 - f; + } + + for (j += 2 * (int)prefetch_distance + 3; i < j; i += 1) + { + unsigned int p = (unsigned int)SA[i]; int s = SAm[p >> 1]; if (s < 0) { T[p] |= INT_MIN; f++; s = i + INT_MIN + f; } SAm[p >> 1] = s - f; + } + } + + { + int * RESTRICT SAl = &SA[0]; + int * RESTRICT SAr = &SA[0]; + + ptrdiff_t i, j, l = (ptrdiff_t)m - 1, r = (ptrdiff_t)n + (ptrdiff_t)fs - 1; + for (i = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 1, j = (ptrdiff_t)m + 3; i >= j; i -= 4) + { + libsais_prefetch(&SA[i - prefetch_distance]); + + int p0 = SA[i - 0]; SAl[l] = p0 & INT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; + int p1 = SA[i - 1]; SAl[l] = p1 & INT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; + int p2 = SA[i - 2]; SAl[l] = p2 & INT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; + int p3 = SA[i - 3]; SAl[l] = p3 & INT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; + } + + for (j -= 3; i >= j; i -= 1) + { + int p = SA[i]; SAl[l] = p & INT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; + } + + memcpy(&SA[(ptrdiff_t)n + (ptrdiff_t)fs - (ptrdiff_t)m], &SA[(ptrdiff_t)l + 1], (size_t)f * sizeof(int)); + } + + return f; +} + +static void libsais_merge_compacted_lms_suffixes_32s(int * RESTRICT T, int * RESTRICT SA, int n, int m, int f) +{ + const ptrdiff_t prefetch_distance = 32; + + const int * RESTRICT SAnm = &SA[n - m - 1]; + + { + int i, j, l = 0, tmp = 
SAnm[l]; + for (i = 0, j = n - 6; i < j; i += 4) + { + libsais_prefetch(&T[i + prefetch_distance]); + + int c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & INT_MAX; SA[tmp] = i + 0; i++; tmp = SAnm[++l]; } + int c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & INT_MAX; SA[tmp] = i + 1; i++; tmp = SAnm[++l]; } + int c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & INT_MAX; SA[tmp] = i + 2; i++; tmp = SAnm[++l]; } + int c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & INT_MAX; SA[tmp] = i + 3; i++; tmp = SAnm[++l]; } + } + + for (j += 6; i < j; i += 1) + { + int c0 = T[i]; if (c0 < 0) { T[i] = c0 & INT_MAX; SA[tmp] = i; i++; tmp = SAnm[++l]; } + } + } + + { + ptrdiff_t i, j, l = f; int tmp = SAnm[l]; + for (i = 0, j = (ptrdiff_t)m - 3; i < j; i += 4) + { + libsais_prefetch(&SA[i + prefetch_distance]); + + if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = SAnm[++l]; } + if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = SAnm[++l]; } + if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = SAnm[++l]; } + if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = SAnm[++l]; } + } + + for (j += 3; i < j; i += 1) + { + if (SA[i] == 0) { SA[i] = tmp; tmp = SAnm[++l]; } + } + } +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_2k(int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int fs, int f, int * RESTRICT buckets) +{ + if (f > 0) + { + memcpy(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(int)); + + libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets); + libsais_reconstruct_lms_suffixes(SA, n, m - f); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(int)); + memset(&SA[0], 0, (size_t)m * sizeof(int)); + + libsais_merge_compacted_lms_suffixes_32s(T, SA, n, m, f); + } + else + { + libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); + libsais_reconstruct_lms_suffixes(SA, n, m); + } +} + +static void libsais_reconstruct_compacted_lms_suffixes_32s_1k(int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int fs, int f) +{ + if (f > 
0) + { + memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(int)); + + libsais_gather_compacted_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes(SA, n, m - f); + + memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(int)); + memset(&SA[0], 0, (size_t)m * sizeof(int)); + + libsais_merge_compacted_lms_suffixes_32s(T, SA, n, m, f); + } + else + { + libsais_gather_lms_suffixes_32s(T, SA, n); + libsais_reconstruct_lms_suffixes(SA, n, m); + } +} + +static int libsais_main_32s(int * RESTRICT T, int * RESTRICT SA, int n, int k, int fs) +{ + if (k > 0 && fs / k >= 6) + { + int alignment = (fs - 1024) / k >= 6 ? 1024 : 16; + int * RESTRICT buckets = (fs - alignment) / k >= 6 ? (int *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 6 * k]; + + int m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets); + if (m > 1) + { + memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(int)); + + int first_lms_suffix = SA[n - m]; + int left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); + + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[4 * k]); + libsais_radix_sort_set_markers_32s(SA, k, &buckets[4 * k], INT_MIN); + + libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_32s_6k(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count); + + int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(SA, n, m); + if (names < m) + { + int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, 
buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets); + } + else + { + SA[0] = SA[n - 1]; + + libsais_initialize_buckets_start_and_end_32s_6k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_6k(T, SA, n, k, buckets); + } + + return 0; + } + else if (k > 0 && fs / k >= 4) + { + int alignment = (fs - 1024) / k >= 4 ? 1024 : 16; + int * RESTRICT buckets = (fs - alignment) / k >= 4 ? (int *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 4 * k]; + + int m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); + if (m > 1) + { + libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[1]); + libsais_radix_sort_set_markers_32s(SA, k, &buckets[1], SUFFIX_GROUP_MARKER); + + libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); + libsais_induce_partial_order_32s_4k(T, SA, n, k, buckets); + + int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(SA, n, m); + if (names < m) + { + int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); + libsais_induce_final_order_32s_4k(T, SA, n, k, buckets); + + return 0; + } + else if (k > 0 && fs / k >= 2) + { + int alignment = (fs - 1024) / k >= 2 ? 1024 : 16; + int * RESTRICT buckets = (fs - alignment) / k >= 2 ? 
(int *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 2 * k]; + + int m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); + if (m > 1) + { + libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); + + libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[1]); + libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_partial_order_32s_2k(T, SA, n, k, buckets); + + int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(T, SA, n, m); + if (names < m) + { + int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); + } + else + { + libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); + } + } + else + { + SA[0] = SA[n - 1]; + } + + libsais_initialize_buckets_end_32s_2k(k, buckets); + libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); + + libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); + libsais_induce_final_order_32s_2k(T, SA, n, k, buckets); + + return 0; + } + else + { + int * buffer = fs < k ? (int *)libsais_aligned_malloc((size_t)k * sizeof(int), 4096) : (int *)NULL; + + int alignment = fs - 1024 >= k ? 1024 : 16; + int * RESTRICT buckets = fs - alignment >= k ? (int *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(int)) : fs >= k ? 
&SA[n + fs - k] : buffer; + + if (buckets == NULL) { return -2; } + + memset(SA, 0, (size_t)n * sizeof(int)); + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + + int m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); + if (m > 1) + { + libsais_induce_partial_order_32s_1k(T, SA, n, k, buckets); + + int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(T, SA, n, m); + if (names < m) + { + if (buffer != NULL) { libsais_aligned_free(buffer); buckets = NULL; } + + int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); + + if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) + { + return -2; + } + + libsais_reconstruct_compacted_lms_suffixes_32s_1k(T, SA, n, k, m, fs, f); + + if (buckets == NULL) { buckets = buffer = (int *)libsais_aligned_malloc((size_t)k * sizeof(int), 4096); } + if (buckets == NULL) { return -2; } + } + + libsais_count_suffixes_32s(T, n, k, buckets); + libsais_initialize_buckets_end_32s_1k(k, buckets); + libsais_place_lms_suffixes_interval_32s_1k(T, SA, n, k, m, buckets); + } + + libsais_induce_final_order_32s_1k(T, SA, n, k, buckets); + libsais_aligned_free(buffer); + + return 0; + } +} + +static int libsais_main_8u(const unsigned char * T, int * SA, int n, int bwt) +{ + int * RESTRICT buckets = (int *)libsais_aligned_malloc(8 * ALPHABET_SIZE * sizeof(int), 4096); + + if (buckets != NULL) + { + int m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets); + + libsais_initialize_buckets_start_and_end_8u(buckets); + + if (m > 0) + { + int first_lms_suffix = SA[n - m]; + int left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); + + libsais_radix_sort_lms_suffixes_8u(T, SA, n, m, buckets); + libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); + libsais_induce_partial_order_8u(T, SA, n, buckets, first_lms_suffix, 
left_suffixes_count); + + int names = libsais_renumber_and_gather_lms_suffixes_8u(SA, n, m); + if (names < m) + { + if (libsais_main_32s(SA + n - m, SA, m, names, n - 2 * m) != 0) + { + libsais_aligned_free(buckets); + return -2; + } + + libsais_gather_lms_suffixes_8u(T, SA, n); + libsais_reconstruct_lms_suffixes(SA, n, m); + } + + libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets); + } + else + { + memset(SA, 0, (size_t)n * sizeof(int)); + } + + int index = libsais_induce_final_order_8u(T, SA, n, bwt, buckets); + + libsais_aligned_free(buckets); + return index; + } + + return -2; +} + +static void libsais_bwt_copy_8u(unsigned char * RESTRICT U, int * RESTRICT A, int n) +{ + const ptrdiff_t prefetch_distance = 32; + + ptrdiff_t i, j; + for (i = 0, j = (ptrdiff_t)n - 7; i < j; i += 8) + { + libsais_prefetch(&A[i + prefetch_distance]); + + U[i + 0] = (unsigned char)A[i + 0]; + U[i + 1] = (unsigned char)A[i + 1]; + U[i + 2] = (unsigned char)A[i + 2]; + U[i + 3] = (unsigned char)A[i + 3]; + U[i + 4] = (unsigned char)A[i + 4]; + U[i + 5] = (unsigned char)A[i + 5]; + U[i + 6] = (unsigned char)A[i + 6]; + U[i + 7] = (unsigned char)A[i + 7]; + } + + for (j += 7; i < j; i += 1) + { + U[i] = (unsigned char)A[i]; + } +} + +int libsais(const unsigned char * T, int * SA, int n) +{ + if ((T == NULL) || (SA == NULL) || (n < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main_8u(T, SA, n, 0); +} + +int libsais_bwt(const unsigned char * T, unsigned char * U, int * A, int n) +{ + if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0)) + { + return -1; + } + else if (n <= 1) + { + if (n == 1) { U[0] = T[0]; } + return n; + } + + int index = libsais_main_8u(T, A, n, 1); + if (index >= 0) + { + U[0] = T[n - 1]; + libsais_bwt_copy_8u(U + 1, A, index); + libsais_bwt_copy_8u(U + 1 + index, A + 1 + index, n - index - 1); + + index++; + } + + return index; +} diff --git a/src/libsais.h b/src/libsais.h new file mode 
100644 index 0000000..6e815e0 --- /dev/null +++ b/src/libsais.h @@ -0,0 +1,54 @@ +/*-- + +This file is a part of libsais, a library for linear time +suffix array and burrows wheeler transform construction. + + Copyright (c) 2021 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright information. + +--*/ + +#ifndef LIBSAIS_H +#define LIBSAIS_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Constructs the suffix array of a given string. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int libsais(const unsigned char * T, int * SA, int n); + + /** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string. (can be T) + * @param A [0..n-1] The temporary array. + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. 
+ */ + int libsais_bwt(const unsigned char * T, unsigned char * U, int * A, int n); + +#ifdef __cplusplus +} +#endif + +#endif From a7ba61a886812644a017f27d818ac40d39ff56e5 Mon Sep 17 00:00:00 2001 From: Ilya Muravyov Date: Thu, 18 Mar 2021 15:14:25 +0300 Subject: [PATCH 32/34] Updated to v1.60 --- src/bcm.cpp | 74 ++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/bcm.cpp b/src/bcm.cpp index baba4e4..77ad7e6 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -2,7 +2,7 @@ BCM - A BWT-based file compressor -Written and placed in the public domain by Ilya Muravyov +Copyright (C) 2008-2021 Ilya Muravyov */ @@ -43,7 +43,7 @@ Written and placed in the public domain by Ilya Muravyov # endif #endif -#include "divsufsort.h" // libdivsufsort-lite +#include "libsais.h" typedef unsigned char U8; typedef unsigned short U16; @@ -299,7 +299,7 @@ struct CRC { for (int i=0; i<256; ++i) { - U32 r=i; + U32 r=i; for (int j=0; j<8; ++j) r=(r>>1)^(0xEDB88320&-int(r&1)); tab[i]=r; @@ -312,16 +312,15 @@ struct CRC return crc^U32(-1); } - void Update(U8* buf, int n) + void Update(int c) { - for (int i=0; i>8)^tab[(crc^buf[i])&255]; + crc=(crc>>8)^tab[(crc^c)&255]; } - void Put(int c) + void Update(U8* buf, int n) { - crc=(crc>>8)^tab[(crc^c)&255]; - putc(c, out); + for (int i=0; i>8)^tab[(crc^buf[i])&255]; } } crc; @@ -334,7 +333,6 @@ inline T* MemAlloc(size_t n) perror("Malloc() failed"); exit(1); } - return p; } @@ -379,10 +377,10 @@ void Compress(int level) { crc.Update(buf, n); - const int idx=divbwt(buf, buf, ptr, n); + const int idx=libsais_bwt(buf, buf, ptr, n); if (idx<1) { - perror("Divbwt() failed"); + fprintf(stderr, "BWT() failed: idx = %d\n", idx); exit(1); } @@ -406,22 +404,21 @@ void Compress(int level) void Decompress() { - cm.Init(); + int cnt[257]; int bsize=0; - U8* buf=NULL; - U32* ptr=NULL; + U8* buf=nullptr; + U32* ptr=nullptr; + + cm.Init(); int n; while ((n=cm.Get32())>0) { if (!bsize) { - bsize=n; - 
- if (bsize>=(1<<24)) // 5*N + if ((bsize=n)>=(1<<24)) // 5*N buf=MemAlloc(bsize); - ptr=MemAlloc(bsize); } @@ -436,7 +433,7 @@ void Decompress() if (n>=(1<<24)) // 5*N { - int cnt[257]={0}; + memset(cnt, 0, sizeof(cnt)); for (int i=0; i=idx)]); + const int c=buf[p-(p>=idx)]; + crc.Update(c); + putc(c, out); } } else // 4*N { - int cnt[257]={0}; + memset(cnt, 0, sizeof(cnt)); for (int i=0; i>8; - crc.Put(ptr[p-(p>=idx)]); + const int c=ptr[p-(p>=idx)]; + crc.Update(c); + putc(c, out); } } @@ -482,7 +485,8 @@ void Decompress() exit(1); } - free(buf); + if (buf) + free(buf); free(ptr); } @@ -518,7 +522,7 @@ int main(int argc, char** argv) overwrite=1; break; default: - fprintf(stderr, "Unknown option: -%c\n", argv[1][i]); + fprintf(stderr, "Unknown option '-%c'\n", argv[1][i]); exit(1); } } @@ -530,8 +534,8 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.51\n" - "Written and placed in the public domain by Ilya Muravyov\n" + "BCM - A BWT-based file compressor, v1.60\n" + "Copyright (C) 2008-2021 Ilya Muravyov\n" "\n" "Usage: BCM [options] infile [outfile]\n" "\n" @@ -574,7 +578,7 @@ int main(int argc, char** argv) { fclose(f); - fprintf(stderr, "%s already exists. Overwrite (y/n)? ", ofname); + fprintf(stderr, "File '%s' already exists. Overwrite (y/n)? 
", ofname); fflush(stderr); if (getchar()!='y') @@ -588,9 +592,9 @@ int main(int argc, char** argv) if (decompress) { if (getc(in)!=magic[0] - ||getc(in)!=magic[1] - ||getc(in)!=magic[2] - ||getc(in)!=magic[3]) + || getc(in)!=magic[1] + || getc(in)!=magic[2] + || getc(in)!=magic[3]) { fprintf(stderr, "%s: Not in BCM format\n", argv[1]); exit(1); @@ -603,7 +607,7 @@ int main(int argc, char** argv) exit(1); } - fprintf(stderr, "Decompressing %s:\n", argv[1]); + fprintf(stderr, "Decompressing '%s':\n", argv[1]); Decompress(); } @@ -621,7 +625,7 @@ int main(int argc, char** argv) putc(magic[2], out); putc(magic[3], out); - fprintf(stderr, "Compressing %s:\n", argv[1]); + fprintf(stderr, "Compressing '%s':\n", argv[1]); Compress(level); } From b0fc347ac336649bc40306ca54052c81c5cc7911 Mon Sep 17 00:00:00 2001 From: zvezdochiot Date: Fri, 19 Mar 2021 12:29:08 +0300 Subject: [PATCH 33/34] 1.60: depends --- Makefile | 19 + README.md | 8 +- src/bcm.cpp | 1314 ++++++++++++++-------------- src/libsais.c | 2282 ------------------------------------------------- src/libsais.h | 54 -- 5 files changed, 683 insertions(+), 2994 deletions(-) create mode 100644 Makefile delete mode 100644 src/libsais.c delete mode 100644 src/libsais.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..41773b1 --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +PROJECT=bcm +PROG=$(PROJECT) +TARGET=$(PROG) +SRCS= src +CXX=g++ +CXXFLAGS=-Wall --std=c++11 +LDFLAGS=-lsais -s +RM=rm -f + +all: $(TARGET) + +$(SRCS)/$(PROG).o: $(SRCS)/$(PROG).cpp + $(CXX) $(CXXFLAGS) -c -o $@ $^ + +$(PROG): $(SRCS)/$(PROG).o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + +clean: + $(RM) $(SRCS)/*.o $(PROG) diff --git a/README.md b/README.md index 1f36a18..27cde5a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# BCM v1.60 +# BCM ### Description BCM is a high-performance file compressor that utilizes advanced context modeling techniques to achieve a very high compression ratio. 
All in all, it's like a big brother of the BZIP2. @@ -14,6 +14,12 @@ BCM -9 | 20,789,667 bytes | [1]:http://mattmahoney.net/dc/text.html +### Depends + +Libsais: +* https://github.com/IlyaGrebnov/libsais +* https://github.com/FS-make-simple/libsais + ### Author Ilya Muravyov diff --git a/src/bcm.cpp b/src/bcm.cpp index 77ad7e6..aef2706 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -1,657 +1,657 @@ -/* - -BCM - A BWT-based file compressor - -Copyright (C) 2008-2021 Ilya Muravyov - -*/ - -#ifndef _MSC_VER -# define _FILE_OFFSET_BITS 64 - -# define _fseeki64 fseeko -# define _ftelli64 ftello -# define _stati64 stat - -# ifdef HAVE_GETC_UNLOCKED -# undef getc -# define getc getc_unlocked -# endif -# ifdef HAVE_PUTC_UNLOCKED -# undef putc -# define putc putc_unlocked -# endif -#endif - -#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 -#define _CRT_SECURE_NO_WARNINGS -#define _CRT_DISABLE_PERFCRIT_LOCKS - -#include -#include -#include -#include - -#ifndef NO_UTIME -# include -# include - -# ifdef _MSC_VER -# include -# else -# include -# endif -#endif - -#include "libsais.h" - -typedef unsigned char U8; -typedef unsigned short U16; -typedef unsigned int U32; -typedef unsigned long long U64; -typedef signed long long S64; - -// Globals - -const char magic[]="BCM!"; - -FILE* in; -FILE* out; - -struct Encoder -{ - U32 low; - U32 high; - U32 code; - - Encoder() - { - low=0; - high=U32(-1); - code=0; - } - - void Flush() - { - for (int i=0; i<4; ++i) - { - putc(low>>24, out); - low<<=8; - } - } - - void Init() - { - for (int i=0; i<4; ++i) - code=(code<<8)+getc(in); - } - - template - void EncodeBit(int bit, U32 p) - { - const U32 mid=low+((U64(high-low)*p)>>P_LOG); - - if (bit) - high=mid; - else - low=mid+1; - - // Renormalize - while ((low^high)<(1<<24)) - { - putc(low>>24, out); - low<<=8; - high=(high<<8)+255; - } - } - - template - int DecodeBit(U32 p) - { - const U32 mid=low+((U64(high-low)*p)>>P_LOG); - - const int bit=(code<=mid); - if (bit) - high=mid; - else - 
low=mid+1; - - // Renormalize - while ((low^high)<(1<<24)) - { - low<<=8; - high=(high<<8)+255; - code=(code<<8)+getc(in); - } - - return bit; - } -}; - -template -struct Counter -{ - U16 p; - - Counter() - { - p=1<<15; // 0.5 - } - - void Update1() - { - p+=(p^0xFFFF)>>RATE; - } - - void Update0() - { - p-=p>>RATE; - } -}; - -struct CM: Encoder -{ - Counter<2> counter0[256]; - Counter<4> counter1[256][256]; - Counter<6> counter2[2][256][17]; - int run; - int c1; - int c2; - - CM() - { - run=0; - c1=0; - c2=0; - - for (int i=0; i<2; ++i) - { - for (int j=0; j<256; ++j) - { - for (int k=0; k<=16; ++k) - counter2[i][j][k].p=(k<<12)-(k==16); - } - } - } - - void Put32(U32 x) - { - for (U32 i=1<<31; i>0; i>>=1) - EncodeBit<1>(x&i, 1); // p=0.5 - } - - U32 Get32() - { - U32 x=0; - for (int i=0; i<32; ++i) - x+=x+DecodeBit<1>(1); // p=0.5 - - return x; - } - - void Put(int c) - { - const int f=(run>2); - - int ctx=1; - for (int i=128; i>0; i>>=1) - { - const int p0=counter0[ctx].p; - const int p1=counter1[c1][ctx].p; - const int p2=counter1[c2][ctx].p; - const int p=(((p0+p1)*7)+p2+p2)>>4; - - // SSE with linear interpolation - const int j=p>>12; - const int x1=counter2[f][ctx][j].p; - const int x2=counter2[f][ctx][j+1].p; - const int ssep=x1+(((x2-x1)*(p&4095))>>12); - - if (c&i) - { - EncodeBit<18>(1, p+ssep+ssep+ssep); - - counter0[ctx].Update1(); - counter1[c1][ctx].Update1(); - counter2[f][ctx][j].Update1(); - counter2[f][ctx][j+1].Update1(); - - ctx+=ctx+1; - } - else - { - EncodeBit<18>(0, p+ssep+ssep+ssep); - - counter0[ctx].Update0(); - counter1[c1][ctx].Update0(); - counter2[f][ctx][j].Update0(); - counter2[f][ctx][j+1].Update0(); - - ctx+=ctx; - } - } - - c2=c1; - c1=ctx-256; - - if (c1==c2) - ++run; - else - run=0; - } - - int Get() - { - const int f=(run>2); - - int ctx=1; - while (ctx<256) - { - const int p0=counter0[ctx].p; - const int p1=counter1[c1][ctx].p; - const int p2=counter1[c2][ctx].p; - const int p=(((p0+p1)*7)+p2+p2)>>4; - - // SSE with linear 
interpolation - const int j=p>>12; - const int x1=counter2[f][ctx][j].p; - const int x2=counter2[f][ctx][j+1].p; - const int ssep=x1+(((x2-x1)*(p&4095))>>12); - - if (DecodeBit<18>(p+ssep+ssep+ssep)) - { - counter0[ctx].Update1(); - counter1[c1][ctx].Update1(); - counter2[f][ctx][j].Update1(); - counter2[f][ctx][j+1].Update1(); - - ctx+=ctx+1; - } - else - { - counter0[ctx].Update0(); - counter1[c1][ctx].Update0(); - counter2[f][ctx][j].Update0(); - counter2[f][ctx][j+1].Update0(); - - ctx+=ctx; - } - } - - c2=c1; - c1=ctx-256; - - if (c1==c2) - ++run; - else - run=0; - - return c1; - } -} cm; - -struct CRC -{ - U32 tab[256]; - U32 crc; - - CRC() - { - for (int i=0; i<256; ++i) - { - U32 r=i; - for (int j=0; j<8; ++j) - r=(r>>1)^(0xEDB88320&-int(r&1)); - tab[i]=r; - } - crc=U32(-1); - } - - U32 operator()() const - { - return crc^U32(-1); - } - - void Update(int c) - { - crc=(crc>>8)^tab[(crc^c)&255]; - } - - void Update(U8* buf, int n) - { - for (int i=0; i>8)^tab[(crc^buf[i])&255]; - } -} crc; - -template -inline T* MemAlloc(size_t n) -{ - T* p=(T*)malloc(n*sizeof(T)); - if (!p) - { - perror("Malloc() failed"); - exit(1); - } - return p; -} - -void Compress(int level) -{ - const int tab[10]= - { - 0, - 1<<20, // -1 - 1 MB - 1<<22, // -2 - 4 MB - 1<<23, // -3 - 8 MB - 0x00FFFFFF, // -4 - ~16 MB (Default) - 1<<25, // -5 - 32 MB - 1<<26, // -6 - 64 MB - 1<<27, // -7 - 128 MB - 1<<28, // -8 - 256 MB - 0x7FFFFFFF, // -9 - ~2 GB - }; - int bsize=tab[level]; // Block size - - if (_fseeki64(in, 0, SEEK_END)) - { - perror("Fseek() failed"); - exit(1); - } - const S64 flen=_ftelli64(in); - if (flen<0) - { - perror("Ftell() failed"); - exit(1); - } - rewind(in); - - if (bsize>flen) - bsize=int(flen); - - U8* buf=MemAlloc(bsize); - int* ptr=MemAlloc(bsize); - - int n; - while ((n=fread(buf, 1, bsize, in))>0) - { - crc.Update(buf, n); - - const int idx=libsais_bwt(buf, buf, ptr, n); - if (idx<1) - { - fprintf(stderr, "BWT() failed: idx = %d\n", idx); - exit(1); - } - - 
cm.Put32(n); // Block size - cm.Put32(idx); // BWT index - - for (int i=0; i %lld\r", _ftelli64(in), _ftelli64(out)); - } - - cm.Put32(0); // EOF - cm.Put32(crc()); // CRC32 - - cm.Flush(); - - free(buf); - free(ptr); -} - -void Decompress() -{ - int cnt[257]; - - int bsize=0; - U8* buf=nullptr; - U32* ptr=nullptr; - - cm.Init(); - - int n; - while ((n=cm.Get32())>0) - { - if (!bsize) - { - if ((bsize=n)>=(1<<24)) // 5*N - buf=MemAlloc(bsize); - ptr=MemAlloc(bsize); - } - - const int idx=cm.Get32(); - if (n>bsize || idx<1 || idx>n) - { - fprintf(stderr, "Corrupt input!\n"); - exit(1); - } - - // Inverse BW-transform - - if (n>=(1<<24)) // 5*N - { - memset(cnt, 0, sizeof(cnt)); - for (int i=0; i=idx)]; - crc.Update(c); - putc(c, out); - } - } - else // 4*N - { - memset(cnt, 0, sizeof(cnt)); - for (int i=0; i>8; - const int c=ptr[p-(p>=idx)]; - crc.Update(c); - putc(c, out); - } - } - - fprintf(stderr, "%lld -> %lld\r", _ftelli64(in), _ftelli64(out)); - } - - if (cm.Get32()!=crc()) - { - fprintf(stderr, "CRC error!\n"); - exit(1); - } - - if (buf) - free(buf); - free(ptr); -} - -int main(int argc, char** argv) -{ - const clock_t start=clock(); - - int level=4; - int decompress=0; - int overwrite=0; - - while (argc>1 && *argv[1]=='-') - { - for (int i=1; argv[1][i]!='\0'; ++i) - { - switch (argv[1][i]) - { - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - level=argv[1][i]-'0'; - break; - case 'd': - decompress=1; - break; - case 'f': - overwrite=1; - break; - default: - fprintf(stderr, "Unknown option '-%c'\n", argv[1][i]); - exit(1); - } - } - - --argc; - ++argv; - } - - if (argc<2) - { - fprintf(stderr, - "BCM - A BWT-based file compressor, v1.60\n" - "Copyright (C) 2008-2021 Ilya Muravyov\n" - "\n" - "Usage: BCM [options] infile [outfile]\n" - "\n" - "Options:\n" - " -1 .. -9 Set block size to 1 MB .. 
2 GB\n" - " -d Decompress\n" - " -f Force overwrite of output file\n"); - exit(1); - } - - in=fopen(argv[1], "rb"); - if (!in) - { - perror(argv[1]); - exit(1); - } - - char ofname[FILENAME_MAX]; - if (argc<3) - { - strcpy(ofname, argv[1]); - if (decompress) - { - const int p=strlen(ofname)-4; - if (p>0 && !strcmp(&ofname[p], ".bcm")) - ofname[p]='\0'; - else - strcat(ofname, ".out"); - } - else - strcat(ofname, ".bcm"); - } - else - strcpy(ofname, argv[2]); - - if (!overwrite) - { - FILE* f=fopen(ofname, "rb"); - if (f) - { - fclose(f); - - fprintf(stderr, "File '%s' already exists. Overwrite (y/n)? ", ofname); - fflush(stderr); - - if (getchar()!='y') - { - fprintf(stderr, "Not overwritten\n"); - exit(1); - } - } - } - - if (decompress) - { - if (getc(in)!=magic[0] - || getc(in)!=magic[1] - || getc(in)!=magic[2] - || getc(in)!=magic[3]) - { - fprintf(stderr, "%s: Not in BCM format\n", argv[1]); - exit(1); - } - - out=fopen(ofname, "wb"); - if (!out) - { - perror(ofname); - exit(1); - } - - fprintf(stderr, "Decompressing '%s':\n", argv[1]); - - Decompress(); - } - else - { - out=fopen(ofname, "wb"); - if (!out) - { - perror(ofname); - exit(1); - } - - putc(magic[0], out); - putc(magic[1], out); - putc(magic[2], out); - putc(magic[3], out); - - fprintf(stderr, "Compressing '%s':\n", argv[1]); - - Compress(level); - } - - fprintf(stderr, "%lld -> %lld in %1.1f sec\n", - _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); - - fclose(in); - fclose(out); - -#ifndef NO_UTIME - struct _stati64 sb; - if (_stati64(argv[1], &sb)) - { - perror("Stat() failed"); - exit(1); - } - struct utimbuf ub; - ub.actime=sb.st_atime; - ub.modtime=sb.st_mtime; - if (utime(ofname, &ub)) - { - perror("Utime() failed"); - exit(1); - } -#endif - - return 0; -} +/* + +BCM - A BWT-based file compressor + +Copyright (C) 2008-2021 Ilya Muravyov + +*/ + +#ifndef _MSC_VER +# define _FILE_OFFSET_BITS 64 + +# define _fseeki64 fseeko +# define _ftelli64 ftello +# define _stati64 stat 
+ +# ifdef HAVE_GETC_UNLOCKED +# undef getc +# define getc getc_unlocked +# endif +# ifdef HAVE_PUTC_UNLOCKED +# undef putc +# define putc putc_unlocked +# endif +#endif + +#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_DISABLE_PERFCRIT_LOCKS + +#include +#include +#include +#include + +#ifndef NO_UTIME +# include +# include + +# ifdef _MSC_VER +# include +# else +# include +# endif +#endif + +#include + +typedef unsigned char U8; +typedef unsigned short U16; +typedef unsigned int U32; +typedef unsigned long long U64; +typedef signed long long S64; + +// Globals + +const char magic[]="BCM!"; + +FILE* in; +FILE* out; + +struct Encoder +{ + U32 low; + U32 high; + U32 code; + + Encoder() + { + low=0; + high=U32(-1); + code=0; + } + + void Flush() + { + for (int i=0; i<4; ++i) + { + putc(low>>24, out); + low<<=8; + } + } + + void Init() + { + for (int i=0; i<4; ++i) + code=(code<<8)+getc(in); + } + + template + void EncodeBit(int bit, U32 p) + { + const U32 mid=low+((U64(high-low)*p)>>P_LOG); + + if (bit) + high=mid; + else + low=mid+1; + + // Renormalize + while ((low^high)<(1<<24)) + { + putc(low>>24, out); + low<<=8; + high=(high<<8)+255; + } + } + + template + int DecodeBit(U32 p) + { + const U32 mid=low+((U64(high-low)*p)>>P_LOG); + + const int bit=(code<=mid); + if (bit) + high=mid; + else + low=mid+1; + + // Renormalize + while ((low^high)<(1<<24)) + { + low<<=8; + high=(high<<8)+255; + code=(code<<8)+getc(in); + } + + return bit; + } +}; + +template +struct Counter +{ + U16 p; + + Counter() + { + p=1<<15; // 0.5 + } + + void Update1() + { + p+=(p^0xFFFF)>>RATE; + } + + void Update0() + { + p-=p>>RATE; + } +}; + +struct CM: Encoder +{ + Counter<2> counter0[256]; + Counter<4> counter1[256][256]; + Counter<6> counter2[2][256][17]; + int run; + int c1; + int c2; + + CM() + { + run=0; + c1=0; + c2=0; + + for (int i=0; i<2; ++i) + { + for (int j=0; j<256; ++j) + { + for (int k=0; k<=16; ++k) + 
counter2[i][j][k].p=(k<<12)-(k==16); + } + } + } + + void Put32(U32 x) + { + for (U32 i=1<<31; i>0; i>>=1) + EncodeBit<1>(x&i, 1); // p=0.5 + } + + U32 Get32() + { + U32 x=0; + for (int i=0; i<32; ++i) + x+=x+DecodeBit<1>(1); // p=0.5 + + return x; + } + + void Put(int c) + { + const int f=(run>2); + + int ctx=1; + for (int i=128; i>0; i>>=1) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=(((p0+p1)*7)+p2+p2)>>4; + + // SSE with linear interpolation + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + if (c&i) + { + EncodeBit<18>(1, p+ssep+ssep+ssep); + + counter0[ctx].Update1(); + counter1[c1][ctx].Update1(); + counter2[f][ctx][j].Update1(); + counter2[f][ctx][j+1].Update1(); + + ctx+=ctx+1; + } + else + { + EncodeBit<18>(0, p+ssep+ssep+ssep); + + counter0[ctx].Update0(); + counter1[c1][ctx].Update0(); + counter2[f][ctx][j].Update0(); + counter2[f][ctx][j+1].Update0(); + + ctx+=ctx; + } + } + + c2=c1; + c1=ctx-256; + + if (c1==c2) + ++run; + else + run=0; + } + + int Get() + { + const int f=(run>2); + + int ctx=1; + while (ctx<256) + { + const int p0=counter0[ctx].p; + const int p1=counter1[c1][ctx].p; + const int p2=counter1[c2][ctx].p; + const int p=(((p0+p1)*7)+p2+p2)>>4; + + // SSE with linear interpolation + const int j=p>>12; + const int x1=counter2[f][ctx][j].p; + const int x2=counter2[f][ctx][j+1].p; + const int ssep=x1+(((x2-x1)*(p&4095))>>12); + + if (DecodeBit<18>(p+ssep+ssep+ssep)) + { + counter0[ctx].Update1(); + counter1[c1][ctx].Update1(); + counter2[f][ctx][j].Update1(); + counter2[f][ctx][j+1].Update1(); + + ctx+=ctx+1; + } + else + { + counter0[ctx].Update0(); + counter1[c1][ctx].Update0(); + counter2[f][ctx][j].Update0(); + counter2[f][ctx][j+1].Update0(); + + ctx+=ctx; + } + } + + c2=c1; + c1=ctx-256; + + if (c1==c2) + ++run; + else + run=0; + + return c1; + } +} cm; + +struct 
CRC +{ + U32 tab[256]; + U32 crc; + + CRC() + { + for (int i=0; i<256; ++i) + { + U32 r=i; + for (int j=0; j<8; ++j) + r=(r>>1)^(0xEDB88320&-int(r&1)); + tab[i]=r; + } + crc=U32(-1); + } + + U32 operator()() const + { + return crc^U32(-1); + } + + void Update(int c) + { + crc=(crc>>8)^tab[(crc^c)&255]; + } + + void Update(U8* buf, int n) + { + for (int i=0; i>8)^tab[(crc^buf[i])&255]; + } +} crc; + +template +inline T* MemAlloc(size_t n) +{ + T* p=(T*)malloc(n*sizeof(T)); + if (!p) + { + perror("Malloc() failed"); + exit(1); + } + return p; +} + +void Compress(int level) +{ + const int tab[10]= + { + 0, + 1<<20, // -1 - 1 MB + 1<<22, // -2 - 4 MB + 1<<23, // -3 - 8 MB + 0x00FFFFFF, // -4 - ~16 MB (Default) + 1<<25, // -5 - 32 MB + 1<<26, // -6 - 64 MB + 1<<27, // -7 - 128 MB + 1<<28, // -8 - 256 MB + 0x7FFFFFFF, // -9 - ~2 GB + }; + int bsize=tab[level]; // Block size + + if (_fseeki64(in, 0, SEEK_END)) + { + perror("Fseek() failed"); + exit(1); + } + const S64 flen=_ftelli64(in); + if (flen<0) + { + perror("Ftell() failed"); + exit(1); + } + rewind(in); + + if (bsize>flen) + bsize=int(flen); + + U8* buf=MemAlloc(bsize); + int* ptr=MemAlloc(bsize); + + int n; + while ((n=fread(buf, 1, bsize, in))>0) + { + crc.Update(buf, n); + + const int idx=libsais_bwt(buf, buf, ptr, n); + if (idx<1) + { + fprintf(stderr, "BWT() failed: idx = %d\n", idx); + exit(1); + } + + cm.Put32(n); // Block size + cm.Put32(idx); // BWT index + + for (int i=0; i %lld\r", _ftelli64(in), _ftelli64(out)); + } + + cm.Put32(0); // EOF + cm.Put32(crc()); // CRC32 + + cm.Flush(); + + free(buf); + free(ptr); +} + +void Decompress() +{ + int cnt[257]; + + int bsize=0; + U8* buf=nullptr; + U32* ptr=nullptr; + + cm.Init(); + + int n; + while ((n=cm.Get32())>0) + { + if (!bsize) + { + if ((bsize=n)>=(1<<24)) // 5*N + buf=MemAlloc(bsize); + ptr=MemAlloc(bsize); + } + + const int idx=cm.Get32(); + if (n>bsize || idx<1 || idx>n) + { + fprintf(stderr, "Corrupt input!\n"); + exit(1); + } + + // Inverse 
BW-transform + + if (n>=(1<<24)) // 5*N + { + memset(cnt, 0, sizeof(cnt)); + for (int i=0; i=idx)]; + crc.Update(c); + putc(c, out); + } + } + else // 4*N + { + memset(cnt, 0, sizeof(cnt)); + for (int i=0; i>8; + const int c=ptr[p-(p>=idx)]; + crc.Update(c); + putc(c, out); + } + } + + fprintf(stderr, "%lld -> %lld\r", _ftelli64(in), _ftelli64(out)); + } + + if (cm.Get32()!=crc()) + { + fprintf(stderr, "CRC error!\n"); + exit(1); + } + + if (buf) + free(buf); + free(ptr); +} + +int main(int argc, char** argv) +{ + const clock_t start=clock(); + + int level=4; + int decompress=0; + int overwrite=0; + + while (argc>1 && *argv[1]=='-') + { + for (int i=1; argv[1][i]!='\0'; ++i) + { + switch (argv[1][i]) + { + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + level=argv[1][i]-'0'; + break; + case 'd': + decompress=1; + break; + case 'f': + overwrite=1; + break; + default: + fprintf(stderr, "Unknown option '-%c'\n", argv[1][i]); + exit(1); + } + } + + --argc; + ++argv; + } + + if (argc<2) + { + fprintf(stderr, + "BCM - A BWT-based file compressor, v1.60\n" + "Copyright (C) 2008-2021 Ilya Muravyov\n" + "\n" + "Usage: BCM [options] infile [outfile]\n" + "\n" + "Options:\n" + " -1 .. -9 Set block size to 1 MB .. 2 GB\n" + " -d Decompress\n" + " -f Force overwrite of output file\n"); + exit(1); + } + + in=fopen(argv[1], "rb"); + if (!in) + { + perror(argv[1]); + exit(1); + } + + char ofname[FILENAME_MAX]; + if (argc<3) + { + strcpy(ofname, argv[1]); + if (decompress) + { + const int p=strlen(ofname)-4; + if (p>0 && !strcmp(&ofname[p], ".bcm")) + ofname[p]='\0'; + else + strcat(ofname, ".out"); + } + else + strcat(ofname, ".bcm"); + } + else + strcpy(ofname, argv[2]); + + if (!overwrite) + { + FILE* f=fopen(ofname, "rb"); + if (f) + { + fclose(f); + + fprintf(stderr, "File '%s' already exists. Overwrite (y/n)? 
", ofname); + fflush(stderr); + + if (getchar()!='y') + { + fprintf(stderr, "Not overwritten\n"); + exit(1); + } + } + } + + if (decompress) + { + if (getc(in)!=magic[0] + || getc(in)!=magic[1] + || getc(in)!=magic[2] + || getc(in)!=magic[3]) + { + fprintf(stderr, "%s: Not in BCM format\n", argv[1]); + exit(1); + } + + out=fopen(ofname, "wb"); + if (!out) + { + perror(ofname); + exit(1); + } + + fprintf(stderr, "Decompressing '%s':\n", argv[1]); + + Decompress(); + } + else + { + out=fopen(ofname, "wb"); + if (!out) + { + perror(ofname); + exit(1); + } + + putc(magic[0], out); + putc(magic[1], out); + putc(magic[2], out); + putc(magic[3], out); + + fprintf(stderr, "Compressing '%s':\n", argv[1]); + + Compress(level); + } + + fprintf(stderr, "%lld -> %lld in %1.1f sec\n", + _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); + + fclose(in); + fclose(out); + +#ifndef NO_UTIME + struct _stati64 sb; + if (_stati64(argv[1], &sb)) + { + perror("Stat() failed"); + exit(1); + } + struct utimbuf ub; + ub.actime=sb.st_atime; + ub.modtime=sb.st_mtime; + if (utime(ofname, &ub)) + { + perror("Utime() failed"); + exit(1); + } +#endif + + return 0; +} diff --git a/src/libsais.c b/src/libsais.c deleted file mode 100644 index b917c1f..0000000 --- a/src/libsais.c +++ /dev/null @@ -1,2282 +0,0 @@ -/*-- - -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. - - Copyright (c) 2021 Ilya Grebnov - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. - -Please see the file LICENSE for full copyright information. - ---*/ - -#include -#include -#include -#include -#include - -#include "libsais.h" - -#define INT_BIT (32) -#define ALPHABET_SIZE (1 << CHAR_BIT) -#define SUFFIX_GROUP_BIT (INT_BIT - 1) -#define SUFFIX_GROUP_MARKER (1 << (SUFFIX_GROUP_BIT - 1)) - -#define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) -#define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) - -#if defined(__GNUC__) || defined(__clang__) - #define RESTRICT __restrict__ - #define FORCEINLINE inline __attribute__((__always_inline__)) -#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) - #define RESTRICT __restrict - #define FORCEINLINE __forceinline -#else - #error Your compiler, configuration or platform is not supported. -#endif - -#if defined(__has_builtin) - #if __has_builtin(__builtin_prefetch) - #define HAS_BUILTIN_PREFECTCH - #endif -#elif defined(__GNUC__) && __GNUC__ > 3 - #define HAS_BUILTIN_PREFECTCH -#endif - -#if defined(HAS_BUILTIN_PREFECTCH) - #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) - #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 0) -#elif defined (_M_IX86) || defined (_M_AMD64) - #include - #define libsais_prefetch(address) _mm_prefetch((const void *)(address), _MM_HINT_NTA) - #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) -#elif defined (_M_ARM) - #include - #define libsais_prefetch(address) __prefetch((const void *)(address)) - #define libsais_prefetchw(address) __prefetchw((const void *)(address)) -#elif defined (_M_ARM64) - #include - #define libsais_prefetch(address) __prefetch2((const void *)(address), 1) - #define libsais_prefetchw(address) __prefetch2((const void *)(address), 17) -#else - #error Your compiler, configuration or platform is not supported. 
-#endif - -static FORCEINLINE void * libsais_align_up(const void * address, size_t alignment) -{ - return (void *)((((intptr_t)address) + ((intptr_t)alignment) - 1) & (-((intptr_t)alignment))); -} - -static FORCEINLINE void * libsais_aligned_malloc(size_t size, size_t alignment) -{ - void * address = malloc(size + sizeof(short) + alignment - 1); - if (address != NULL) - { - void * aligned_address = libsais_align_up((void *)((intptr_t)address + (intptr_t)(sizeof(short))), alignment); - ((short *)aligned_address)[-1] = (short)((intptr_t)aligned_address - (intptr_t)address); - - return aligned_address; - } - - return NULL; -} - -static FORCEINLINE void libsais_aligned_free(void * aligned_address) -{ - if (aligned_address != NULL) - { - free((void *)((intptr_t)aligned_address - ((short *)aligned_address)[-1])); - } -} - -static int libsais_gather_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 128; - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= 3; i -= 4) - { - libsais_prefetch(&T[i - prefetch_distance]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - } - - return n - 1 - m; -} - -static int libsais_gather_lms_suffixes_32s(const int * RESTRICT T, int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - 
ptrdiff_t c1 = 0; - - for (; i >= 3; i -= 4) - { - libsais_prefetch(&T[i - prefetch_distance]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - } - - return n - 1 - m; -} - -static int libsais_gather_compacted_lms_suffixes_32s(const int * RESTRICT T, int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= 3; i -= 4) - { - libsais_prefetch(&T[i - prefetch_distance]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - } - - return n - 1 - m; -} - -static void libsais_count_lms_suffixes_32s_2k(const int * RESTRICT T, int n, int k, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - memset(buckets, 0, 2 * (size_t)k * sizeof(int)); - - int i = n - 2; - size_t s 
= 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&T[i - 2 * prefetch_distance]); - - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; -} - -static int libsais_count_and_gather_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 128; - - memset(buckets, 0, 4 * ALPHABET_SIZE * sizeof(int)); - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= 3; i -= 4) - { - libsais_prefetch(&T[i - prefetch_distance]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - - c1 = T[i - 2]; s = (s << 1) + 
(size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - } - - buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]++; - - return n - 1 - m; -} - -static int libsais_count_and_gather_lms_suffixes_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - memset(buckets, 0, 4 * (size_t)k * sizeof(int)); - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&T[i - 2 * prefetch_distance]); - - libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c0, s & 3)]++; - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - } - - for (; i >= 0; i -= 
1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]++; - } - - buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]++; - - return n - 1 - m; -} - -static int libsais_count_and_gather_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - memset(buckets, 0, 2 * (size_t)k * sizeof(int)); - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&T[i - 2 * prefetch_distance]); - - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((s & 3) == 1); - buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; - - return n - 1 - m; -} - -static int 
libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - memset(buckets, 0, 2 * (size_t)k * sizeof(int)); - - int i = n - 2; - int m = n - 1; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - - for (; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&T[i - 2 * prefetch_distance]); - - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & INT_MAX, 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & INT_MAX, 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & INT_MAX, 0)]); - libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & INT_MAX, 0)]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c0 >=0)); - c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 0; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); SA[m] = i - 1; m -= ((ptrdiff_t)(s & 3) == (c0 >= 0)); - c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, (s & 3) == 1)]++; - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i - 2; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); SA[m] = i + 1; m -= ((ptrdiff_t)(s & 3) == (c1 >= 0)); - c1 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c1, (s & 3) == 1)]++; - } - - c0 &= INT_MAX; buckets[BUCKETS_INDEX2((size_t)c0, 0)]++; - - return n - 1 - m; -} - -static void libsais_count_suffixes_32s(const int * RESTRICT T, int 
n, int k, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - memset(buckets, 0, (size_t)k * sizeof(int)); - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - 7; i < j; i += 8) - { - libsais_prefetch(&T[i + prefetch_distance]); - - buckets[T[i + 0]]++; - buckets[T[i + 1]]++; - buckets[T[i + 2]]++; - buckets[T[i + 3]]++; - buckets[T[i + 4]]++; - buckets[T[i + 5]]++; - buckets[T[i + 6]]++; - buckets[T[i + 7]]++; - } - - for (j += 7; i < j; i += 1) - { - buckets[T[i]]++; - } -} - -static void libsais_initialize_buckets_start_and_end_8u(int * RESTRICT buckets) -{ - int * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; - int * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; - - ptrdiff_t i, j; int sum = 0; - for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += 1) - { - bucket_start[j] = sum; - sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; - bucket_end[j] = sum; - } -} - -static void libsais_initialize_buckets_start_and_end_32s_6k(int k, int * RESTRICT buckets) -{ - int * RESTRICT bucket_start = &buckets[4 * k]; - int * RESTRICT bucket_end = &buckets[5 * k]; - - ptrdiff_t i, j; int sum = 0; - for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) - { - bucket_start[j] = sum; - sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; - bucket_end[j] = sum; - } -} - -static void libsais_initialize_buckets_start_and_end_32s_4k(int k, int * RESTRICT buckets) -{ - int * RESTRICT bucket_start = &buckets[2 * k]; - int * RESTRICT bucket_end = &buckets[3 * k]; - - ptrdiff_t i, j; int sum = 0; - for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) - { - bucket_start[j] = sum; - sum += 
buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; - bucket_end[j] = sum; - } -} - -static void libsais_initialize_buckets_end_32s_2k(int k, int * RESTRICT buckets) -{ - ptrdiff_t i; int sum0 = 0; - for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) - { - sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; - } -} - -static void libsais_initialize_buckets_start_32s_1k(int k, int * RESTRICT buckets) -{ - ptrdiff_t i; int sum = 0; - for (i = 0; i <= (ptrdiff_t)k - 1; i += 1) { int tmp = buckets[i]; buckets[i] = sum; sum += tmp; } -} - -static void libsais_initialize_buckets_end_32s_1k(int k, int * RESTRICT buckets) -{ - ptrdiff_t i; int sum = 0; - for (i = 0; i <= (ptrdiff_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } -} - -static void libsais_initialize_buckets_start_and_end_32s_2k(int k, int * RESTRICT buckets) -{ - ptrdiff_t i, j; - for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) - { - buckets[j] = buckets[i]; - } - - buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(int)); -} - -static int libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const unsigned char * RESTRICT T, int * RESTRICT buckets, int first_lms_suffix) -{ - { - size_t s = 0; - ptrdiff_t c0 = T[first_lms_suffix]; - ptrdiff_t c1 = 0; - - for (; --first_lms_suffix >= 0; ) - { - c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]--; - } - - buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]--; - } - - { - int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; - - ptrdiff_t i, j; int sum = 0; - for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { - temp_bucket[j + 
BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; - } - - return sum; - } -} - -static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) -{ - buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; - buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; - - ptrdiff_t i; int sum0 = 0, sum1 = 0; - for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) - { - sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; - sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; - - buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; - buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; - } -} - -static int libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) -{ - { - size_t s = 0; - ptrdiff_t c0 = T[first_lms_suffix]; - ptrdiff_t c1 = 0; - - for (; --first_lms_suffix >= 0; ) - { - c1 = c0; c0 = T[first_lms_suffix]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - buckets[BUCKETS_INDEX4((size_t)c1, s & 3)]--; - } - - buckets[BUCKETS_INDEX4((size_t)c0, (s << 1) & 3)]--; - } - - { - int * RESTRICT temp_bucket = &buckets[4 * k]; - - ptrdiff_t i, j; int sum = 0; - for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { - temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; - } - - return sum; - } -} - -static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix) -{ - int * RESTRICT bucket_start = &buckets[2 * k]; - int * RESTRICT bucket_end = &buckets[3 * k]; - - buckets[BUCKETS_INDEX2(T[first_lms_suffix], 
0)]++; - buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; - - ptrdiff_t i, j; int sum0 = 0, sum1 = 0; - for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) - { - bucket_start[j] = sum1; - - sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; - sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; - buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; - - bucket_end[j] = sum1; - } -} - -static void libsais_radix_sort_lms_suffixes_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int m, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; - - ptrdiff_t i, j; - for (i = (ptrdiff_t)n - 1, j = (ptrdiff_t)n - (ptrdiff_t)m + prefetch_distance + 3; i > j; i -= 4) - { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); - - libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); - - int p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; - int p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; - int p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; - int p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; - } - - for (j -= prefetch_distance + 3; i > j; i -= 1) - { - int p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; - } -} - -static void libsais_radix_sort_lms_suffixes_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int m, int * RESTRICT induction_bucket) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i, j; - for (i = (ptrdiff_t)n - 1, j = (ptrdiff_t)n - (ptrdiff_t)m + 2 * prefetch_distance + 3; i > j; i -= 4) - { - libsais_prefetch(&SA[i - 3 * prefetch_distance]); - - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); - 
libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 3]]); - - libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); - libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); - libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); - libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); - - int p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; - int p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; - int p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; - int p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; - } - - for (j -= 2 * prefetch_distance + 3; i > j; i -= 1) - { - int p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; - } -} - -static int libsais_radix_sort_lms_suffixes_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int i = n - 2; - int m = 0; - size_t s = 1; - ptrdiff_t c0 = T[n - 1]; - ptrdiff_t c1 = 0; - ptrdiff_t c2 = 0; - - for (; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&T[i - 2 * prefetch_distance]); - - libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]); - libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]); - libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]); - libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]); - - c1 = T[i - 0]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); - if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } - - c0 = T[i - 1]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } - - c1 = T[i - 2]; s = (s << 1) + (size_t)(c1 > (c0 - (ptrdiff_t)(s & 1))); - if ((s & 3) 
== 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } - - c0 = T[i - 3]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } - } - - for (; i >= 0; i -= 1) - { - c1 = c0; c0 = T[i]; s = (s << 1) + (size_t)(c0 > (c1 - (ptrdiff_t)(s & 1))); - if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } - } - - if (m > 1) - { - SA[buckets[c2]] = 0; - } - - return m; -} - -static void libsais_radix_sort_set_markers_32s(int * RESTRICT SA, int k, int * RESTRICT induction_bucket, int marker) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)k - 1 - prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetch(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); - - libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); - libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); - libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); - libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); - - SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= marker; - SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= marker; - SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= marker; - SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= marker; - } - - for (j += prefetch_distance + 3; i < j; i += 1) - { - SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= marker; - } -} - -static void libsais_initialize_buckets_for_partial_sorting_8u(const unsigned char * RESTRICT T, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) -{ - int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; - - buckets[BUCKETS_INDEX4((size_t)T[first_lms_suffix], 1)]++; - - ptrdiff_t i, j; int sum0 = left_suffixes_count + 1, sum1 = 0; - for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(UCHAR_MAX, 0); i += BUCKETS_INDEX4(1, 0), j += 
BUCKETS_INDEX2(1, 0)) - { - temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; - - sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; - sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; - - buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; - buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; - } -} - -static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const int * RESTRICT T, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) -{ - int * RESTRICT temp_bucket = &buckets[4 * k]; - - buckets[BUCKETS_INDEX4((size_t)T[first_lms_suffix], 1)]++; - - ptrdiff_t i, j; int sum0 = left_suffixes_count + 1, sum1 = 0; - for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4((ptrdiff_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { - temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; - - sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; - sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; - - buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; - buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; - } -} - -static int libsais_partial_sorting_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int left_suffixes_count, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; - int * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - - SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | INT_MIN; - distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)left_suffixes_count - prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); - - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i + prefetch_distance + 0] & INT_MAX] - 2); - libsais_prefetch(&T[SA[i + prefetch_distance + 1] & INT_MAX] - 1); - 
libsais_prefetch(&T[SA[i + prefetch_distance + 1] & INT_MAX] - 2); - - int p0 = SA[i + 0]; d += (p0 < 0); p0 &= INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); - SA[induction_bucket[v0]++] = (p0 - 1) | ((distinct_names[v0] != d) << (INT_BIT - 1)); distinct_names[v0] = d; - - int p1 = SA[i + 1]; d += (p1 < 0); p1 &= INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); - SA[induction_bucket[v1]++] = (p1 - 1) | ((distinct_names[v1] != d) << (INT_BIT - 1)); distinct_names[v1] = d; - } - - for (j += prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); - SA[induction_bucket[v]++] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; - } - - return d; -} - -static int libsais_partial_sorting_scan_left_to_right_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int left_suffixes_count, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[4 * k]; - int * RESTRICT distinct_names = &buckets[2 * k]; - - SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | INT_MIN; - distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)left_suffixes_count - 2 * prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetch(&SA[i + 3 * prefetch_distance]); - - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 0] & INT_MAX] - 2); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i + 2 * prefetch_distance + 1] & INT_MAX] - 2); - - int p0 = SA[i + prefetch_distance + 0] & INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - (p0 > 0)], 0); - libsais_prefetchw(&induction_bucket[v0]); libsais_prefetchw(&distinct_names[v0]); - - int p1 = SA[i + prefetch_distance + 1] & 
INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - (p1 > 0)], 0); - libsais_prefetchw(&induction_bucket[v1]); libsais_prefetchw(&distinct_names[v1]); - - int p2 = SA[i + 0]; d += (p2 < 0); p2 &= INT_MAX; int v2 = BUCKETS_INDEX2(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); - SA[induction_bucket[v2]++] = (p2 - 1) | ((distinct_names[v2] != d) << (INT_BIT - 1)); distinct_names[v2] = d; - - int p3 = SA[i + 1]; d += (p3 < 0); p3 &= INT_MAX; int v3 = BUCKETS_INDEX2(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); - SA[induction_bucket[v3]++] = (p3 - 1) | ((distinct_names[v3] != d) << (INT_BIT - 1)); distinct_names[v3] = d; - } - - for (j += 2 * prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); - SA[induction_bucket[v]++] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; - } - - return d; -} - -static int libsais_partial_sorting_scan_left_to_right_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[2 * k]; - int * RESTRICT distinct_names = &buckets[0 * k]; - - SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)) | SUFFIX_GROUP_MARKER; - distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - - int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const ptrdiff_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } - int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const ptrdiff_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } - - int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; - if (p0 > 0) - { - SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); - SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((T[p0 - 2] < T[p0 - 1]) << (INT_BIT - 1)) | ((distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; - } - - int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; - if (p1 > 0) - { - SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); - SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((T[p1 - 2] < T[p1 - 1]) << (INT_BIT - 1)) | ((distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; - } - } - - for (j += 2 * prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; SA[i] = p & INT_MAX; - if (p > 0) - { - SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); - SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((T[p - 2] < T[p - 1]) << (INT_BIT - 1)) | ((distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; - } - } - - return d; -} - -static void libsais_partial_sorting_scan_left_to_right_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT induction_bucket) -{ - const ptrdiff_t prefetch_distance = 32; - - SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); - - ptrdiff_t i, j; - for (i = 0, j 
= (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - - int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } - - int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | ((T[p0 - 2] < T[p0 - 1]) << (INT_BIT - 1)); } - int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | ((T[p1 - 2] < T[p1 - 1]) << (INT_BIT - 1)); } - } - - for (j += 2 * prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | ((T[p - 2] < T[p - 1]) << (INT_BIT - 1)); } - } -} - -static void libsais_partial_sorting_shift_markers_8u(int * RESTRICT SA, const int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - const int * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; - - ptrdiff_t c; - for (c = BUCKETS_INDEX2(UCHAR_MAX, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) - { - ptrdiff_t i, j; int s = INT_MIN; - for (i = (ptrdiff_t)temp_bucket[c] - 1, j = (ptrdiff_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) - { - libsais_prefetchw(&SA[i - prefetch_distance]); - - int p0 = SA[i - 0], q0 = (p0 & INT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; - int p1 = SA[i - 1], q1 = (p1 & INT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; - int p2 = SA[i - 2], q2 = (p2 & INT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; - int p3 = SA[i - 3], q3 = 
(p3 & INT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; - } - - for (j -= 3; i >= j; i -= 1) - { - int p = SA[i], q = (p & INT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; - } - } -} - -static void libsais_partial_sorting_shift_markers_32s_6k(int * RESTRICT SA, int k, const int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - const int * RESTRICT temp_bucket = &buckets[4 * k]; - - ptrdiff_t c; - for (c = BUCKETS_INDEX2((ptrdiff_t)k - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0)) - { - ptrdiff_t i, j; int s = INT_MIN; - for (i = (ptrdiff_t)temp_bucket[c] - 1, j = (ptrdiff_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4) - { - libsais_prefetchw(&SA[i - prefetch_distance]); - - int p0 = SA[i - 0], q0 = (p0 & INT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0; - int p1 = SA[i - 1], q1 = (p1 & INT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1; - int p2 = SA[i - 2], q2 = (p2 & INT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2; - int p3 = SA[i - 3], q3 = (p3 & INT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3; - } - - for (j -= 3; i >= j; i -= 1) - { - int p = SA[i], q = (p & INT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q; - } - } -} - -static void libsais_partial_sorting_shift_markers_32s_4k(int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i; int s = SUFFIX_GROUP_MARKER; - for (i = (ptrdiff_t)n - 1; i >= 3; i -= 4) - { - libsais_prefetchw(&SA[i - prefetch_distance]); - - int p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0; - int p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1; - int p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2; - int p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3; - } - - for (; i >= 0; i -= 1) 
- { - int p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q; - } -} - -static int libsais_partial_sorting_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; - int * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; - - ptrdiff_t i, j; - for (i = (ptrdiff_t)n - (ptrdiff_t)first_lms_suffix - 1, j = (ptrdiff_t)left_suffixes_count + 1 + prefetch_distance + 1; i >= j; i -= 2) - { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); - - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 0] & INT_MAX] - 2); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i - prefetch_distance - 1] & INT_MAX] - 2); - - int p0 = SA[i - 0]; d += (p0 < 0); p0 &= INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); - SA[--induction_bucket[v0]] = (p0 - 1) | ((distinct_names[v0] != d) << (INT_BIT - 1)); distinct_names[v0] = d; - - int p1 = SA[i - 1]; d += (p1 < 0); p1 &= INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); - SA[--induction_bucket[v1]] = (p1 - 1) | ((distinct_names[v1] != d) << (INT_BIT - 1)); distinct_names[v1] = d; - } - - for (j -= prefetch_distance + 1; i >= j; i -= 1) - { - int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); - SA[--induction_bucket[v]] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; - } - - return d; -} - -static int libsais_partial_sorting_scan_right_to_left_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT 
induction_bucket = &buckets[0 * k]; - int * RESTRICT distinct_names = &buckets[2 * k]; - - ptrdiff_t i, j; - for (i = (ptrdiff_t)n - (ptrdiff_t)first_lms_suffix - 1, j = (ptrdiff_t)left_suffixes_count + 1 + 2 * prefetch_distance + 1; i >= j; i -= 2) - { - libsais_prefetch(&SA[i - 3 * prefetch_distance]); - - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0] & INT_MAX] - 2); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & INT_MAX] - 1); - libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1] & INT_MAX] - 2); - - int p0 = SA[i - prefetch_distance - 0] & INT_MAX; int v0 = BUCKETS_INDEX2(T[p0 - (p0 > 0)], 0); - libsais_prefetchw(&induction_bucket[v0]); libsais_prefetchw(&distinct_names[v0]); - - int p1 = SA[i - prefetch_distance - 1] & INT_MAX; int v1 = BUCKETS_INDEX2(T[p1 - (p1 > 0)], 0); - libsais_prefetchw(&induction_bucket[v1]); libsais_prefetchw(&distinct_names[v1]); - - int p2 = SA[i - 0]; d += (p2 < 0); p2 &= INT_MAX; int v2 = BUCKETS_INDEX2(T[p2 - 1], T[p2 - 2] > T[p2 - 1]); - SA[--induction_bucket[v2]] = (p2 - 1) | ((distinct_names[v2] != d) << (INT_BIT - 1)); distinct_names[v2] = d; - - int p3 = SA[i - 1]; d += (p3 < 0); p3 &= INT_MAX; int v3 = BUCKETS_INDEX2(T[p3 - 1], T[p3 - 2] > T[p3 - 1]); - SA[--induction_bucket[v3]] = (p3 - 1) | ((distinct_names[v3] != d) << (INT_BIT - 1)); distinct_names[v3] = d; - } - - for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) - { - int p = SA[i]; d += (p < 0); p &= INT_MAX; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); - SA[--induction_bucket[v]] = (p - 1) | ((distinct_names[v] != d) << (INT_BIT - 1)); distinct_names[v] = d; - } - - return d; -} - -static int libsais_partial_sorting_scan_right_to_left_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int d) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[3 * k]; - int * RESTRICT distinct_names = 
&buckets[0 * k]; - - ptrdiff_t i; - for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) - { - libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - - int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1 & ~SUFFIX_GROUP_MARKER] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const ptrdiff_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } - int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const ptrdiff_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } - - int p0 = SA[i - 0]; - if (p0 > 0) - { - SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; int v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); - SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((T[p0 - 2] > T[p0 - 1]) << (INT_BIT - 1)) | ((distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; - } - - int p1 = SA[i - 1]; - if (p1 > 0) - { - SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; int v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); - SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((T[p1 - 2] > T[p1 - 1]) << (INT_BIT - 1)) | ((distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; - } - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; - if (p > 0) - { - SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; int v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); - SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((T[p - 2] > T[p - 1]) << (INT_BIT - 1)) | 
((distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; - } - } - - return d; -} - -static void libsais_partial_sorting_scan_right_to_left_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT induction_bucket) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i; - for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) - { - libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - - int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } - - int p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | ((T[p0 - 2] > T[p0 - 1]) << (INT_BIT - 1)); } - int p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | ((T[p1 - 2] > T[p1 - 1]) << (INT_BIT - 1)); } - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | ((T[p - 2] > T[p - 1]) << (INT_BIT - 1)); } - } -} - -static void libsais_partial_sorting_gather_lms_suffixes_32s_4k(int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i, j, l; - for (i = 0, j = (ptrdiff_t)n - 3, l = 0; i < j; i += 4) - { - libsais_prefetch(&SA[i + prefetch_distance]); - - int s0 = SA[i + 0]; SA[l] = (s0 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s0 < 0); - int s1 = SA[i + 1]; SA[l] = (s1 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s1 < 0); - int s2 = SA[i + 2]; SA[l] = (s2 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s2 < 0); - int s3 = 
SA[i + 3]; SA[l] = (s3 - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s3 < 0); - } - - for (j += 3; i < j; i += 1) - { - int s = SA[i]; SA[l] = (s - SUFFIX_GROUP_MARKER) & (~SUFFIX_GROUP_MARKER); l += (s < 0); - } -} - -static void libsais_partial_sorting_gather_lms_suffixes_32s_1k(int * RESTRICT SA, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i, j, l; - for (i = 0, j = (ptrdiff_t)n - 3, l = 0; i < j; i += 4) - { - libsais_prefetch(&SA[i + prefetch_distance]); - - int s0 = SA[i + 0]; SA[l] = s0 & INT_MAX; l += (s0 < 0); - int s1 = SA[i + 1]; SA[l] = s1 & INT_MAX; l += (s1 < 0); - int s2 = SA[i + 2]; SA[l] = s2 & INT_MAX; l += (s2 < 0); - int s3 = SA[i + 3]; SA[l] = s3 & INT_MAX; l += (s3 < 0); - } - - for (j += 3; i < j; i += 1) - { - int s = SA[i]; SA[l] = s & INT_MAX; l += (s < 0); - } -} - -static void libsais_induce_partial_order_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) -{ - memset(&buckets[2 * ALPHABET_SIZE], 0, 2 * ALPHABET_SIZE * sizeof(int)); - - int d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, n, buckets, left_suffixes_count, 0); - libsais_partial_sorting_shift_markers_8u(SA, buckets); - libsais_partial_sorting_scan_right_to_left_8u(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d); -} - -static void libsais_induce_partial_order_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets, int first_lms_suffix, int left_suffixes_count) -{ - memset(&buckets[2 * k], 0, 2 * (size_t)k * sizeof(int)); - - int d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, n, k, buckets, left_suffixes_count, 0); - libsais_partial_sorting_shift_markers_32s_6k(SA, k, buckets); - libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d); -} - -static void libsais_induce_partial_order_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int 
k, int * RESTRICT buckets) -{ - memset(buckets, 0, 2 * (size_t)k * sizeof(int)); - - int d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, n, k, buckets, 0); - libsais_partial_sorting_shift_markers_32s_4k(SA, n); - libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, n, k, buckets, d); - libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, n); -} - -static void libsais_induce_partial_order_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, n, k, &buckets[1 * k]); - libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, n, k, &buckets[0 * k]); - libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, n); -} - -static void libsais_induce_partial_order_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_start_32s_1k(k, buckets); - libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, n, k, buckets); - - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_end_32s_1k(k, buckets); - libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, n, k, buckets); - - libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, n); -} - -static int libsais_renumber_and_gather_lms_suffixes_8u(int * RESTRICT SA, int n, int m) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT SAm = &SA[m]; - - memset(SAm, 0, ((size_t)n >> 1) * sizeof(int)); - - ptrdiff_t i, j; int name = 0; - for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); - - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & INT_MAX) >> 1]); - - int p0 = SA[i + 0]; 
SAm[(p0 & INT_MAX) >> 1] = name | INT_MIN; name += p0 < 0; - int p1 = SA[i + 1]; SAm[(p1 & INT_MAX) >> 1] = name | INT_MIN; name += p1 < 0; - int p2 = SA[i + 2]; SAm[(p2 & INT_MAX) >> 1] = name | INT_MIN; name += p2 < 0; - int p3 = SA[i + 3]; SAm[(p3 & INT_MAX) >> 1] = name | INT_MIN; name += p3 < 0; - } - - for (j += prefetch_distance + 3; i < j; i += 1) - { - int p = SA[i]; SAm[(p & INT_MAX) >> 1] = name | INT_MIN; name += p < 0; - } - - if (name < m) - { - ptrdiff_t l; - for (i = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 1, j = (ptrdiff_t)m + 3, l = (ptrdiff_t)n - 1; i >= j; i -= 4) - { - libsais_prefetch(&SA[i - prefetch_distance]); - - int s0 = SA[i - 0]; SA[l] = s0 & INT_MAX; l -= s0 < 0; - int s1 = SA[i - 1]; SA[l] = s1 & INT_MAX; l -= s1 < 0; - int s2 = SA[i - 2]; SA[l] = s2 & INT_MAX; l -= s2 < 0; - int s3 = SA[i - 3]; SA[l] = s3 & INT_MAX; l -= s3 < 0; - } - - for (j -= 3; i >= j; i -= 1) - { - int s = SA[i]; SA[l] = s & INT_MAX; l -= s < 0; - } - } - else - { - for (i = 0; i < m; i += 1) { SA[i] &= INT_MAX; } - } - - return name; -} - -static int libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(int * RESTRICT SA, int n, int m) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT SAm = &SA[m]; - - memset(SAm, 0, ((size_t)n >> 1) * sizeof(int)); - - ptrdiff_t i, j; int p0, p1, p2, p3 = -1, name = 1; - for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & INT_MAX) >> 1]); - libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & INT_MAX) >> 1]); - - p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & INT_MAX) >> 1] = name | (p0 & p3 & INT_MIN); name += p0 < 0; - p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & INT_MAX) >> 1] = name | (p1 & p0 & INT_MIN); name += p1 < 0; - p2 = SA[i + 2]; 
SAm[(SA[i + 2] = p2 & INT_MAX) >> 1] = name | (p2 & p1 & INT_MIN); name += p2 < 0; - p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & INT_MAX) >> 1] = name | (p3 & p2 & INT_MIN); name += p3 < 0; - } - - for (j += prefetch_distance + 3; i < j; i += 1) - { - p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & INT_MAX) >> 1] = name | (p3 & p2 & INT_MIN); name += p3 < 0; - } - - if (name <= m) - { - p3 = -1; - for (i = m, j = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 3; i < j; i += 4) - { - libsais_prefetchw(&SA[i + prefetch_distance]); - - p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | INT_MAX); p0 = (p0 == 0) ? p3 : p0; - p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | INT_MAX); p1 = (p1 == 0) ? p0 : p1; - p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | INT_MAX); p2 = (p2 == 0) ? p1 : p2; - p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; - } - - for (j += 3; i < j; i += 1) - { - p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; - } - } - - return name - 1; -} - -static int libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(int * RESTRICT T, int * RESTRICT SA, int n, int m) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT SAm = &SA[m]; - - { - libsais_gather_lms_suffixes_32s(T, SA, n); - - memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(int)); - - ptrdiff_t i, j; - for (i = (ptrdiff_t)n - (ptrdiff_t)m, j = (ptrdiff_t)n - 1 - prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); - - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 0]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 1]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 2]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 3]) >> 1]); - - SAm[((unsigned int)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + INT_MIN; - SAm[((unsigned int)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + INT_MIN; - SAm[((unsigned int)SA[i + 2]) >> 1] = SA[i + 3] - 
SA[i + 2] + 1 + INT_MIN; - SAm[((unsigned int)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + INT_MIN; - } - - for (j += prefetch_distance + 3; i < j; i += 1) - { - SAm[((unsigned int)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + INT_MIN; - } - - SAm[((unsigned int)SA[n - 1]) >> 1] = 1 + INT_MIN; - } - - { - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)(n >> 1) - 3; i < j; i += 4) - { - libsais_prefetchw(&SAm[i + prefetch_distance]); - - SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & INT_MAX; - SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & INT_MAX; - SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & INT_MAX; - SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & INT_MAX; - } - - for (j += 3; i < j; i += 1) - { - SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & INT_MAX; - } - } - - int name = 1; - - { - ptrdiff_t i, j, p = SA[0], plen = SAm[p >> 1]; int pdiff = INT_MIN; - for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetch(&SA[i + 2 * prefetch_distance]); - - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((unsigned int)SA[i + prefetch_distance + 0])]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((unsigned int)SA[i + prefetch_distance + 1])]); - - ptrdiff_t q = SA[i + 0], qlen = SAm[q >> 1]; int qdiff = INT_MIN; - if (plen == qlen) { ptrdiff_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (l - qlen) & INT_MIN; } - SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); - - p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = INT_MIN; - if (qlen == plen) { ptrdiff_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (l - plen) & INT_MIN; } - SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0); - } - - for (j += prefetch_distance + 1; i < j; i += 1) - { - ptrdiff_t q = SA[i], qlen = SAm[q >> 1]; int qdiff = INT_MIN; - if (plen == qlen) { ptrdiff_t l = 0; do { if (T[p + l] != T[q + l]) { break; } 
} while (++l < plen); qdiff = (l - plen) & INT_MIN; } - SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); - - p = q; plen = qlen; pdiff = qdiff; - } - - SAm[p >> 1] = name | pdiff; name++; - } - - if (name <= m) - { - ptrdiff_t i, j; int p0, p1, p2, p3 = -1; - for (i = m, j = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 3; i < j; i += 4) - { - libsais_prefetchw(&SA[i + prefetch_distance]); - - p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | INT_MAX); p0 = (p0 == 0) ? p3 : p0; - p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | INT_MAX); p1 = (p1 == 0) ? p0 : p1; - p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | INT_MAX); p2 = (p2 == 0) ? p1 : p2; - p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; - } - - for (j += 3; i < j; i += 1) - { - p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | INT_MAX); p3 = (p3 == 0) ? p2 : p3; - } - } - - return name - 1; -} - -static void libsais_reconstruct_lms_suffixes(int * RESTRICT SA, int n, int m) -{ - const ptrdiff_t prefetch_distance = 32; - - const int * RESTRICT SAnm = &SA[n - m]; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)m - prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 0]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 1]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 2]]); - libsais_prefetch(&SAnm[SA[i + prefetch_distance + 3]]); - - SA[i + 0] = SAnm[SA[i + 0]]; - SA[i + 1] = SAnm[SA[i + 1]]; - SA[i + 2] = SAnm[SA[i + 2]]; - SA[i + 3] = SAnm[SA[i + 3]]; - } - - for (j += prefetch_distance + 3; i < j; i += 1) - { - SA[i] = SAnm[SA[i]]; - } -} - -static void libsais_place_lms_suffixes_interval_8u(int * RESTRICT SA, int n, int m, const int * RESTRICT buckets) -{ - const int * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; - - ptrdiff_t c, j = n; - for (c = UCHAR_MAX - 1; c >= 0; --c) - { - ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; - 
if (l > 0) - { - ptrdiff_t i = bucket_end[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_place_lms_suffixes_interval_32s_4k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) -{ - const int * RESTRICT bucket_end = &buckets[3 * k]; - - ptrdiff_t c, j = n; - for (c = (ptrdiff_t)k - 2; c >= 0; --c) - { - ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; - if (l > 0) - { - ptrdiff_t i = bucket_end[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_place_lms_suffixes_interval_32s_2k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) -{ - ptrdiff_t c, j = n; - for (c = BUCKETS_INDEX2((ptrdiff_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) - { - ptrdiff_t l = (ptrdiff_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (ptrdiff_t)buckets[c + BUCKETS_INDEX2(0, 1)]; - if (l > 0) - { - ptrdiff_t i = buckets[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_place_lms_suffixes_interval_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int c = k - 1; ptrdiff_t i, l = buckets[c]; - for (i = (ptrdiff_t)m - 1; i >= prefetch_distance + 3; i -= 4) - { - libsais_prefetch(&SA[i - 2 * prefetch_distance]); - - libsais_prefetch(&T[SA[i - prefetch_distance - 0]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 1]]); - 
libsais_prefetch(&T[SA[i - prefetch_distance - 2]]); - libsais_prefetch(&T[SA[i - prefetch_distance - 3]]); - - int p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p0; - int p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p1; - int p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p2; - int p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p3; - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(int)); l = buckets[c]; } SA[--l] = p; - } - - memset(&SA[0], 0, (size_t)l * sizeof(int)); -} - -static void libsais_place_lms_suffixes_histogram_32s_6k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) -{ - const int * RESTRICT bucket_end = &buckets[5 * k]; - - ptrdiff_t c, j = n; - for (c = (ptrdiff_t)k - 2; c >= 0; --c) - { - ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX4(c, 1)]; - if (l > 0) - { - ptrdiff_t i = bucket_end[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_place_lms_suffixes_histogram_32s_4k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) -{ - const int * RESTRICT bucket_end = &buckets[3 * k]; - - ptrdiff_t c, j = n; - for (c = (ptrdiff_t)k - 2; c >= 0; --c) - { - ptrdiff_t l = (ptrdiff_t)buckets[BUCKETS_INDEX2(c, 1)]; - if (l > 0) - { - ptrdiff_t i = bucket_end[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= 
(int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_place_lms_suffixes_histogram_32s_2k(int * RESTRICT SA, int n, int k, int m, const int * RESTRICT buckets) -{ - ptrdiff_t c, j = n; - for (c = BUCKETS_INDEX2((ptrdiff_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) - { - ptrdiff_t l = (ptrdiff_t)buckets[c + BUCKETS_INDEX2(0, 1)]; - if (l > 0) - { - ptrdiff_t i = buckets[c]; - if (j - i > 0) - { - memset(&SA[i], 0, (size_t)(j - i) * sizeof(int)); - } - - memmove(&SA[j = (i - l)], &SA[m -= (int)l], (size_t)l * sizeof(int)); - } - } - - memset(&SA[0], 0, (size_t)j * sizeof(int)); -} - -static void libsais_final_bwt_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[6 * ALPHABET_SIZE]; - - SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - - int s0 = SA[i + prefetch_distance + 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i + prefetch_distance + 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - - int p0 = SA[i + 0]; SA[i + 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | INT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } - int p1 = SA[i + 1]; SA[i + 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | INT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } - } - - for (j += prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[i] = T[p] | INT_MIN; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } - } -} - -static void libsais_final_sorting_scan_left_to_right_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[6 * ALPHABET_SIZE]; - - SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetchw(&SA[i + 2 * prefetch_distance]); - - int s0 = SA[i + prefetch_distance + 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i + prefetch_distance + 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - - int p0 = SA[i + 0]; SA[i + 0] = p0 ^ INT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } - int p1 = SA[i + 1]; SA[i + 1] = p1 ^ INT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } - } - - for (j += prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; SA[i] = p ^ INT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } - } -} - -static void libsais_final_sorting_scan_left_to_right_32s(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT induction_bucket) -{ - const ptrdiff_t prefetch_distance = 32; - - SA[induction_bucket[T[n - 1]]++] = (n - 1) | ((T[n - 2] < T[n - 1]) << (INT_BIT - 1)); - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - 2 * prefetch_distance - 1; i < j; i += 2) - { - libsais_prefetchw(&SA[i + 3 * prefetch_distance]); - - int s0 = SA[i + 2 * prefetch_distance + 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i + 2 * prefetch_distance + 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - int s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - int s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } - - int p0 = SA[i + 0]; SA[i + 0] = p0 ^ INT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (((T[p0 - (p0 > 0)] < T[p0])) << (INT_BIT - 1)); } - int p1 = SA[i + 1]; SA[i + 1] = p1 ^ INT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (((T[p1 - (p1 > 0)] < T[p1])) << (INT_BIT - 1)); } - } - - for (j += 2 * prefetch_distance + 1; i < j; i += 1) - { - int p = SA[i]; SA[i] = p ^ INT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (((T[p - (p > 0)] < T[p])) << (INT_BIT - 1)); } - } -} - -static int libsais_final_bwt_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[7 * ALPHABET_SIZE]; - - ptrdiff_t i; int index = -1; - for (i = (ptrdiff_t)n - 1; i >= prefetch_distance + 1; i -= 2) - { - libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - - int s0 = SA[i - prefetch_distance - 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i - prefetch_distance - 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? Ts1 : NULL); - - int p0 = SA[i - 0]; index = (p0 == 0) ? (int)(i - 0) : index; - SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; unsigned char c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } - - int p1 = SA[i - 1]; index = (p1 == 0) ? 
(int)(i - 1) : index; - SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; unsigned char c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; index = (p == 0) ? (int)i : index; - SA[i] = p & INT_MAX; if (p > 0) { p--; unsigned char c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; int t = c0 | INT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } - } - - return index; -} - -static void libsais_final_sorting_scan_right_to_left_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT buckets) -{ - const ptrdiff_t prefetch_distance = 32; - - int * RESTRICT induction_bucket = &buckets[7 * ALPHABET_SIZE]; - - ptrdiff_t i; - for (i = (ptrdiff_t)n - 1; i >= prefetch_distance + 1; i -= 2) - { - libsais_prefetchw(&SA[i - 2 * prefetch_distance]); - - int s0 = SA[i - prefetch_distance - 0]; const unsigned char * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); Ts0--; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i - prefetch_distance - 1]; const unsigned char * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? Ts1 : NULL); Ts1--; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - - int p0 = SA[i - 0]; SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (((T[p0 - (p0 > 0)] > T[p0])) << (INT_BIT - 1)); } - int p1 = SA[i - 1]; SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (((T[p1 - (p1 > 0)] > T[p1])) << (INT_BIT - 1)); } - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (((T[p - (p > 0)] > T[p])) << (INT_BIT - 1)); } - } -} - -static void libsais_final_sorting_scan_right_to_left_32s(const int * RESTRICT T, int * RESTRICT SA, int n, int * RESTRICT induction_bucket) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i; - for (i = (ptrdiff_t)n - 1; i >= 2 * prefetch_distance + 1; i -= 2) - { - libsais_prefetchw(&SA[i - 3 * prefetch_distance]); - - int s0 = SA[i - 2 * prefetch_distance - 0]; const int * Ts0 = &T[s0] - 1; libsais_prefetch(s0 > 0 ? Ts0 : NULL); - int s1 = SA[i - 2 * prefetch_distance - 1]; const int * Ts1 = &T[s1] - 1; libsais_prefetch(s1 > 0 ? 
Ts1 : NULL); - int s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetch(&T[s2] - 2); } - int s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetch(&T[s3] - 2); } - - int p0 = SA[i - 0]; SA[i - 0] = p0 & INT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (((T[p0 - (p0 > 0)] > T[p0])) << (INT_BIT - 1)); } - int p1 = SA[i - 1]; SA[i - 1] = p1 & INT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (((T[p1 - (p1 > 0)] > T[p1])) << (INT_BIT - 1)); } - } - - for (; i >= 0; i -= 1) - { - int p = SA[i]; SA[i] = p & INT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (((T[p - (p > 0)] > T[p])) << (INT_BIT - 1)); } - } -} - -static int libsais_induce_final_order_8u(const unsigned char * RESTRICT T, int * RESTRICT SA, int n, int bwt, int * RESTRICT buckets) -{ - if (bwt) - { - libsais_final_bwt_scan_left_to_right_8u(T, SA, n, buckets); - return libsais_final_bwt_scan_right_to_left_8u(T, SA, n, buckets); - } - else - { - libsais_final_sorting_scan_left_to_right_8u(T, SA, n, buckets); - libsais_final_sorting_scan_right_to_left_8u(T, SA, n, buckets); - return 0; - } -} - -static void libsais_induce_final_order_32s_6k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_final_sorting_scan_left_to_right_32s(T, SA, n, &buckets[4 * k]); - libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[5 * k]); -} - -static void libsais_induce_final_order_32s_4k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_final_sorting_scan_left_to_right_32s(T, SA, n, &buckets[2 * k]); - libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[3 * k]); -} - -static void libsais_induce_final_order_32s_2k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_final_sorting_scan_left_to_right_32s(T, 
SA, n, &buckets[1 * k]); - libsais_final_sorting_scan_right_to_left_32s(T, SA, n, &buckets[0 * k]); -} - -static void libsais_induce_final_order_32s_1k(const int * RESTRICT T, int * RESTRICT SA, int n, int k, int * RESTRICT buckets) -{ - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_start_32s_1k(k, buckets); - libsais_final_sorting_scan_left_to_right_32s(T, SA, n, buckets); - - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_end_32s_1k(k, buckets); - libsais_final_sorting_scan_right_to_left_32s(T, SA, n, buckets); -} - -static int libsais_compact_lms_suffixes_32s(int * RESTRICT T, int * RESTRICT SA, int n, int m, int fs) -{ - const ptrdiff_t prefetch_distance = 32; - - int f = 0; - - { - int * RESTRICT SAm = &SA[m]; - - int i, j; - for (i = 0, j = m - 2 * (int)prefetch_distance - 3; i < j; i += 4) - { - libsais_prefetch(&SA[i + 3 * prefetch_distance]); - - libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 0]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 1]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 2]) >> 1]); - libsais_prefetchw(&SAm[((unsigned int)SA[i + 2 * prefetch_distance + 3]) >> 1]); - - unsigned int q0 = (unsigned int)SA[i + prefetch_distance + 0]; const int * Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : NULL); - unsigned int q1 = (unsigned int)SA[i + prefetch_distance + 1]; const int * Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : NULL); - unsigned int q2 = (unsigned int)SA[i + prefetch_distance + 2]; const int * Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : NULL); - unsigned int q3 = (unsigned int)SA[i + prefetch_distance + 3]; const int * Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? 
Tq3 : NULL); - - unsigned int p0 = (unsigned int)SA[i + 0]; int s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= INT_MIN; f++; s0 = i + 0 + INT_MIN + f; } SAm[p0 >> 1] = s0 - f; - unsigned int p1 = (unsigned int)SA[i + 1]; int s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= INT_MIN; f++; s1 = i + 1 + INT_MIN + f; } SAm[p1 >> 1] = s1 - f; - unsigned int p2 = (unsigned int)SA[i + 2]; int s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= INT_MIN; f++; s2 = i + 2 + INT_MIN + f; } SAm[p2 >> 1] = s2 - f; - unsigned int p3 = (unsigned int)SA[i + 3]; int s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= INT_MIN; f++; s3 = i + 3 + INT_MIN + f; } SAm[p3 >> 1] = s3 - f; - } - - for (j += 2 * (int)prefetch_distance + 3; i < j; i += 1) - { - unsigned int p = (unsigned int)SA[i]; int s = SAm[p >> 1]; if (s < 0) { T[p] |= INT_MIN; f++; s = i + INT_MIN + f; } SAm[p >> 1] = s - f; - } - } - - { - int * RESTRICT SAl = &SA[0]; - int * RESTRICT SAr = &SA[0]; - - ptrdiff_t i, j, l = (ptrdiff_t)m - 1, r = (ptrdiff_t)n + (ptrdiff_t)fs - 1; - for (i = (ptrdiff_t)m + ((ptrdiff_t)n >> 1) - 1, j = (ptrdiff_t)m + 3; i >= j; i -= 4) - { - libsais_prefetch(&SA[i - prefetch_distance]); - - int p0 = SA[i - 0]; SAl[l] = p0 & INT_MAX; l -= p0 < 0; SAr[r] = p0 - 1; r -= p0 > 0; - int p1 = SA[i - 1]; SAl[l] = p1 & INT_MAX; l -= p1 < 0; SAr[r] = p1 - 1; r -= p1 > 0; - int p2 = SA[i - 2]; SAl[l] = p2 & INT_MAX; l -= p2 < 0; SAr[r] = p2 - 1; r -= p2 > 0; - int p3 = SA[i - 3]; SAl[l] = p3 & INT_MAX; l -= p3 < 0; SAr[r] = p3 - 1; r -= p3 > 0; - } - - for (j -= 3; i >= j; i -= 1) - { - int p = SA[i]; SAl[l] = p & INT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; - } - - memcpy(&SA[(ptrdiff_t)n + (ptrdiff_t)fs - (ptrdiff_t)m], &SA[(ptrdiff_t)l + 1], (size_t)f * sizeof(int)); - } - - return f; -} - -static void libsais_merge_compacted_lms_suffixes_32s(int * RESTRICT T, int * RESTRICT SA, int n, int m, int f) -{ - const ptrdiff_t prefetch_distance = 32; - - const int * RESTRICT SAnm = &SA[n - m - 1]; - - { - int i, j, l = 0, tmp = 
SAnm[l]; - for (i = 0, j = n - 6; i < j; i += 4) - { - libsais_prefetch(&T[i + prefetch_distance]); - - int c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & INT_MAX; SA[tmp] = i + 0; i++; tmp = SAnm[++l]; } - int c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & INT_MAX; SA[tmp] = i + 1; i++; tmp = SAnm[++l]; } - int c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & INT_MAX; SA[tmp] = i + 2; i++; tmp = SAnm[++l]; } - int c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & INT_MAX; SA[tmp] = i + 3; i++; tmp = SAnm[++l]; } - } - - for (j += 6; i < j; i += 1) - { - int c0 = T[i]; if (c0 < 0) { T[i] = c0 & INT_MAX; SA[tmp] = i; i++; tmp = SAnm[++l]; } - } - } - - { - ptrdiff_t i, j, l = f; int tmp = SAnm[l]; - for (i = 0, j = (ptrdiff_t)m - 3; i < j; i += 4) - { - libsais_prefetch(&SA[i + prefetch_distance]); - - if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = SAnm[++l]; } - if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = SAnm[++l]; } - if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = SAnm[++l]; } - if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = SAnm[++l]; } - } - - for (j += 3; i < j; i += 1) - { - if (SA[i] == 0) { SA[i] = tmp; tmp = SAnm[++l]; } - } - } -} - -static void libsais_reconstruct_compacted_lms_suffixes_32s_2k(int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int fs, int f, int * RESTRICT buckets) -{ - if (f > 0) - { - memcpy(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(int)); - - libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets); - libsais_reconstruct_lms_suffixes(SA, n, m - f); - - memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(int)); - memset(&SA[0], 0, (size_t)m * sizeof(int)); - - libsais_merge_compacted_lms_suffixes_32s(T, SA, n, m, f); - } - else - { - libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); - libsais_reconstruct_lms_suffixes(SA, n, m); - } -} - -static void libsais_reconstruct_compacted_lms_suffixes_32s_1k(int * RESTRICT T, int * RESTRICT SA, int n, int k, int m, int fs, int f) -{ - if (f > 
0) - { - memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(int)); - - libsais_gather_compacted_lms_suffixes_32s(T, SA, n); - libsais_reconstruct_lms_suffixes(SA, n, m - f); - - memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(int)); - memset(&SA[0], 0, (size_t)m * sizeof(int)); - - libsais_merge_compacted_lms_suffixes_32s(T, SA, n, m, f); - } - else - { - libsais_gather_lms_suffixes_32s(T, SA, n); - libsais_reconstruct_lms_suffixes(SA, n, m); - } -} - -static int libsais_main_32s(int * RESTRICT T, int * RESTRICT SA, int n, int k, int fs) -{ - if (k > 0 && fs / k >= 6) - { - int alignment = (fs - 1024) / k >= 6 ? 1024 : 16; - int * RESTRICT buckets = (fs - alignment) / k >= 6 ? (int *)libsais_align_up(&SA[n + fs - 6 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 6 * k]; - - int m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets); - if (m > 1) - { - memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(int)); - - int first_lms_suffix = SA[n - m]; - int left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); - - libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[4 * k]); - libsais_radix_sort_set_markers_32s(SA, k, &buckets[4 * k], INT_MIN); - - libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); - libsais_induce_partial_order_32s_6k(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count); - - int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(SA, n, m); - if (names < m) - { - int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); - - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) - { - return -2; - } - - libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); - } - else - { - libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); - } - - libsais_initialize_buckets_start_and_end_32s_4k(k, 
buckets); - libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); - libsais_induce_final_order_32s_4k(T, SA, n, k, buckets); - } - else - { - SA[0] = SA[n - 1]; - - libsais_initialize_buckets_start_and_end_32s_6k(k, buckets); - libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); - libsais_induce_final_order_32s_6k(T, SA, n, k, buckets); - } - - return 0; - } - else if (k > 0 && fs / k >= 4) - { - int alignment = (fs - 1024) / k >= 4 ? 1024 : 16; - int * RESTRICT buckets = (fs - alignment) / k >= 4 ? (int *)libsais_align_up(&SA[n + fs - 4 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 4 * k]; - - int m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); - if (m > 1) - { - libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); - - libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[1]); - libsais_radix_sort_set_markers_32s(SA, k, &buckets[1], SUFFIX_GROUP_MARKER); - - libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); - libsais_induce_partial_order_32s_4k(T, SA, n, k, buckets); - - int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k(SA, n, m); - if (names < m) - { - int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); - - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) - { - return -2; - } - - libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); - } - else - { - libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); - } - } - else - { - SA[0] = SA[n - 1]; - } - - libsais_initialize_buckets_start_and_end_32s_4k(k, buckets); - libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); - libsais_induce_final_order_32s_4k(T, SA, n, k, buckets); - - return 0; - } - else if (k > 0 && fs / k >= 2) - { - int alignment = (fs - 1024) / k >= 2 ? 1024 : 16; - int * RESTRICT buckets = (fs - alignment) / k >= 2 ? 
(int *)libsais_align_up(&SA[n + fs - 2 * k - alignment], (size_t)alignment * sizeof(int)) : &SA[n + fs - 2 * k]; - - int m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets); - if (m > 1) - { - libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); - - libsais_radix_sort_lms_suffixes_32s_2k(T, SA, n, m, &buckets[1]); - libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); - - libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); - libsais_induce_partial_order_32s_2k(T, SA, n, k, buckets); - - int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(T, SA, n, m); - if (names < m) - { - int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); - - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) - { - return -2; - } - - libsais_reconstruct_compacted_lms_suffixes_32s_2k(T, SA, n, k, m, fs, f, buckets); - } - else - { - libsais_count_lms_suffixes_32s_2k(T, n, k, buckets); - } - } - else - { - SA[0] = SA[n - 1]; - } - - libsais_initialize_buckets_end_32s_2k(k, buckets); - libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); - - libsais_initialize_buckets_start_and_end_32s_2k(k, buckets); - libsais_induce_final_order_32s_2k(T, SA, n, k, buckets); - - return 0; - } - else - { - int * buffer = fs < k ? (int *)libsais_aligned_malloc((size_t)k * sizeof(int), 4096) : (int *)NULL; - - int alignment = fs - 1024 >= k ? 1024 : 16; - int * RESTRICT buckets = fs - alignment >= k ? (int *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(int)) : fs >= k ? 
&SA[n + fs - k] : buffer; - - if (buckets == NULL) { return -2; } - - memset(SA, 0, (size_t)n * sizeof(int)); - - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_end_32s_1k(k, buckets); - - int m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); - if (m > 1) - { - libsais_induce_partial_order_32s_1k(T, SA, n, k, buckets); - - int names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k(T, SA, n, m); - if (names < m) - { - if (buffer != NULL) { libsais_aligned_free(buffer); buckets = NULL; } - - int f = libsais_compact_lms_suffixes_32s(T, SA, n, m, fs); - - if (libsais_main_32s(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f) != 0) - { - return -2; - } - - libsais_reconstruct_compacted_lms_suffixes_32s_1k(T, SA, n, k, m, fs, f); - - if (buckets == NULL) { buckets = buffer = (int *)libsais_aligned_malloc((size_t)k * sizeof(int), 4096); } - if (buckets == NULL) { return -2; } - } - - libsais_count_suffixes_32s(T, n, k, buckets); - libsais_initialize_buckets_end_32s_1k(k, buckets); - libsais_place_lms_suffixes_interval_32s_1k(T, SA, n, k, m, buckets); - } - - libsais_induce_final_order_32s_1k(T, SA, n, k, buckets); - libsais_aligned_free(buffer); - - return 0; - } -} - -static int libsais_main_8u(const unsigned char * T, int * SA, int n, int bwt) -{ - int * RESTRICT buckets = (int *)libsais_aligned_malloc(8 * ALPHABET_SIZE * sizeof(int), 4096); - - if (buckets != NULL) - { - int m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets); - - libsais_initialize_buckets_start_and_end_8u(buckets); - - if (m > 0) - { - int first_lms_suffix = SA[n - m]; - int left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); - - libsais_radix_sort_lms_suffixes_8u(T, SA, n, m, buckets); - libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); - libsais_induce_partial_order_8u(T, SA, n, buckets, first_lms_suffix, 
left_suffixes_count); - - int names = libsais_renumber_and_gather_lms_suffixes_8u(SA, n, m); - if (names < m) - { - if (libsais_main_32s(SA + n - m, SA, m, names, n - 2 * m) != 0) - { - libsais_aligned_free(buckets); - return -2; - } - - libsais_gather_lms_suffixes_8u(T, SA, n); - libsais_reconstruct_lms_suffixes(SA, n, m); - } - - libsais_place_lms_suffixes_interval_8u(SA, n, m, buckets); - } - else - { - memset(SA, 0, (size_t)n * sizeof(int)); - } - - int index = libsais_induce_final_order_8u(T, SA, n, bwt, buckets); - - libsais_aligned_free(buckets); - return index; - } - - return -2; -} - -static void libsais_bwt_copy_8u(unsigned char * RESTRICT U, int * RESTRICT A, int n) -{ - const ptrdiff_t prefetch_distance = 32; - - ptrdiff_t i, j; - for (i = 0, j = (ptrdiff_t)n - 7; i < j; i += 8) - { - libsais_prefetch(&A[i + prefetch_distance]); - - U[i + 0] = (unsigned char)A[i + 0]; - U[i + 1] = (unsigned char)A[i + 1]; - U[i + 2] = (unsigned char)A[i + 2]; - U[i + 3] = (unsigned char)A[i + 3]; - U[i + 4] = (unsigned char)A[i + 4]; - U[i + 5] = (unsigned char)A[i + 5]; - U[i + 6] = (unsigned char)A[i + 6]; - U[i + 7] = (unsigned char)A[i + 7]; - } - - for (j += 7; i < j; i += 1) - { - U[i] = (unsigned char)A[i]; - } -} - -int libsais(const unsigned char * T, int * SA, int n) -{ - if ((T == NULL) || (SA == NULL) || (n < 0)) - { - return -1; - } - else if (n < 2) - { - if (n == 1) { SA[0] = 0; } - return 0; - } - - return libsais_main_8u(T, SA, n, 0); -} - -int libsais_bwt(const unsigned char * T, unsigned char * U, int * A, int n) -{ - if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0)) - { - return -1; - } - else if (n <= 1) - { - if (n == 1) { U[0] = T[0]; } - return n; - } - - int index = libsais_main_8u(T, A, n, 1); - if (index >= 0) - { - U[0] = T[n - 1]; - libsais_bwt_copy_8u(U + 1, A, index); - libsais_bwt_copy_8u(U + 1 + index, A + 1 + index, n - index - 1); - - index++; - } - - return index; -} diff --git a/src/libsais.h b/src/libsais.h deleted file mode 
100644 index 6e815e0..0000000 --- a/src/libsais.h +++ /dev/null @@ -1,54 +0,0 @@ -/*-- - -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. - - Copyright (c) 2021 Ilya Grebnov - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -Please see the file LICENSE for full copyright information. - ---*/ - -#ifndef LIBSAIS_H -#define LIBSAIS_H 1 - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * Constructs the suffix array of a given string. - * @param T [0..n-1] The input string. - * @param SA [0..n-1] The output array of suffixes. - * @param n The length of the given string. - * @return 0 if no error occurred, -1 or -2 otherwise. - */ - int libsais(const unsigned char * T, int * SA, int n); - - /** - * Constructs the burrows-wheeler transformed string of a given string. - * @param T [0..n-1] The input string. - * @param U [0..n-1] The output string. (can be T) - * @param A [0..n-1] The temporary array. - * @param n The length of the given string. - * @return The primary index if no error occurred, -1 or -2 otherwise. 
- */ - int libsais_bwt(const unsigned char * T, unsigned char * U, int * A, int n); - -#ifdef __cplusplus -} -#endif - -#endif From b6a0a094fa973793d754de8b3a1e54b7e59de261 Mon Sep 17 00:00:00 2001 From: zvezdochiot Date: Sat, 22 May 2021 23:20:58 +0300 Subject: [PATCH 34/34] 1.65: update for libsais 2.2.0 --- README.md | 8 +- src/bcm.cpp | 254 ++++++++++++++++++++++++++-------------------------- 2 files changed, 133 insertions(+), 129 deletions(-) diff --git a/README.md b/README.md index 27cde5a..b77f9d5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,9 @@ +![GitHub release (latest by date)](https://img.shields.io/github/v/release/FS-make-simple/bcm) +![GitHub Release Date](https://img.shields.io/github/release-date/FS-make-simple/bcm) +![GitHub repo size](https://img.shields.io/github/repo-size/FS-make-simple/bcm) +![GitHub all releases](https://img.shields.io/github/downloads/FS-make-simple/bcm/total) +![GitHub](https://img.shields.io/github/license/FS-make-simple/bcm) + # BCM ### Description @@ -10,7 +16,7 @@ Original | 100,000,000 bytes | GZIP -9 | 36,445,248 bytes | BZIP2 -9 | 29,008,758 bytes | 7-Zip Ultra | 24,864,804 bytes | -BCM -9 | 20,789,667 bytes | +BCM -b100 | 20,789,667 bytes | [1]:http://mattmahoney.net/dc/text.html diff --git a/src/bcm.cpp b/src/bcm.cpp index aef2706..a2373de 100644 --- a/src/bcm.cpp +++ b/src/bcm.cpp @@ -4,28 +4,29 @@ BCM - A BWT-based file compressor Copyright (C) 2008-2021 Ilya Muravyov -*/ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at -#ifndef _MSC_VER -# define _FILE_OFFSET_BITS 64 + http://www.apache.org/licenses/LICENSE-2.0 -# define _fseeki64 fseeko -# define _ftelli64 ftello -# define _stati64 stat +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. -# ifdef HAVE_GETC_UNLOCKED -# undef getc -# define getc getc_unlocked -# endif -# ifdef HAVE_PUTC_UNLOCKED -# undef putc -# define putc putc_unlocked -# endif -#endif +*/ + +#ifdef _MSC_VER +# define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 +# define _CRT_SECURE_NO_WARNINGS +# define _CRT_DISABLE_PERFCRIT_LOCKS -#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 -#define _CRT_SECURE_NO_WARNINGS -#define _CRT_DISABLE_PERFCRIT_LOCKS +# define fseeko64 _fseeki64 +# define ftello64 _ftelli64 +# define stat64 _stati64 +#endif #include #include @@ -43,7 +44,7 @@ Copyright (C) 2008-2021 Ilya Muravyov # endif #endif -#include +#include "libsais.h" typedef unsigned char U8; typedef unsigned short U16; @@ -53,7 +54,7 @@ typedef signed long long S64; // Globals -const char magic[]="BCM!"; +#define BCM_ID 0x214D4342 // "BCM!" 
FILE* in; FILE* out; @@ -67,7 +68,7 @@ struct Encoder Encoder() { low=0; - high=U32(-1); + high=0xFFFFFFFF; code=0; } @@ -83,7 +84,7 @@ struct Encoder void Init() { for (int i=0; i<4; ++i) - code=(code<<8)+getc(in); + code=(code<<8)|getc(in); } template @@ -101,7 +102,7 @@ struct Encoder { putc(low>>24, out); low<<=8; - high=(high<<8)+255; + high=(high<<8)|255; } } @@ -120,8 +121,8 @@ struct Encoder while ((low^high)<(1<<24)) { low<<=8; - high=(high<<8)+255; - code=(code<<8)+getc(in); + high=(high<<8)|255; + code=(code<<8)|getc(in); } return bit; @@ -292,42 +293,64 @@ struct CM: Encoder struct CRC { - U32 tab[256]; + U32 tab[8][256]; U32 crc; CRC() { for (int i=0; i<256; ++i) { - U32 r=i; + U32 x=i; for (int j=0; j<8; ++j) - r=(r>>1)^(0xEDB88320&-int(r&1)); - tab[i]=r; + x=(x>>1)^(0xEDB88320&-int(x&1)); + tab[0][i]=x; } - crc=U32(-1); + for (int i=0; i<256; ++i) + { + tab[1][i]=(tab[0][i]>>8)^tab[0][tab[0][i]&255]; + tab[2][i]=(tab[1][i]>>8)^tab[0][tab[1][i]&255]; + tab[3][i]=(tab[2][i]>>8)^tab[0][tab[2][i]&255]; + tab[4][i]=(tab[3][i]>>8)^tab[0][tab[3][i]&255]; + tab[5][i]=(tab[4][i]>>8)^tab[0][tab[4][i]&255]; + tab[6][i]=(tab[5][i]>>8)^tab[0][tab[5][i]&255]; + tab[7][i]=(tab[6][i]>>8)^tab[0][tab[6][i]&255]; + } + crc=0xFFFFFFFF; } U32 operator()() const { - return crc^U32(-1); + return crc^0xFFFFFFFF; } - void Update(int c) + void Update(U8* s, int n) { - crc=(crc>>8)^tab[(crc^c)&255]; - } - - void Update(U8* buf, int n) - { - for (int i=0; i>8)^tab[(crc^buf[i])&255]; + U32 x=crc; + while (n>=8) + { + x^=*reinterpret_cast(s); + const U32 t=*reinterpret_cast(s+4); + x=tab[0][t>>24] + ^tab[1][(t>>16)&255] + ^tab[2][(t>>8)&255] + ^tab[3][t&255] + ^tab[4][x>>24] + ^tab[5][(x>>16)&255] + ^tab[6][(x>>8)&255] + ^tab[7][x&255]; + s+=8; + n-=8; + } + while (n--) + x=(x>>8)^tab[0][(x^*s++)&255]; + crc=x; } } crc; template inline T* MemAlloc(size_t n) { - T* p=(T*)malloc(n*sizeof(T)); + T* p=reinterpret_cast(malloc(n*sizeof(T))); if (!p) { perror("Malloc() failed"); @@ 
-336,29 +359,14 @@ inline T* MemAlloc(size_t n) return p; } -void Compress(int level) +void Compress(int bsize) { - const int tab[10]= - { - 0, - 1<<20, // -1 - 1 MB - 1<<22, // -2 - 4 MB - 1<<23, // -3 - 8 MB - 0x00FFFFFF, // -4 - ~16 MB (Default) - 1<<25, // -5 - 32 MB - 1<<26, // -6 - 64 MB - 1<<27, // -7 - 128 MB - 1<<28, // -8 - 256 MB - 0x7FFFFFFF, // -9 - ~2 GB - }; - int bsize=tab[level]; // Block size - - if (_fseeki64(in, 0, SEEK_END)) + if (fseeko64(in, 0, SEEK_END)) { perror("Fseek() failed"); exit(1); } - const S64 flen=_ftelli64(in); + const S64 flen=ftello64(in); if (flen<0) { perror("Ftell() failed"); @@ -377,10 +385,10 @@ void Compress(int level) { crc.Update(buf, n); - const int idx=libsais_bwt(buf, buf, ptr, n); + const int idx=libsais_bwt(buf, buf, ptr, n, 0); if (idx<1) { - fprintf(stderr, "BWT() failed: idx = %d\n", idx); + perror("Libsais_bwt() failed"); exit(1); } @@ -390,7 +398,7 @@ void Compress(int level) for (int i=0; i %lld\r", _ftelli64(in), _ftelli64(out)); + fprintf(stderr, "%lld -> %lld\r", ftello64(in), ftello64(out)); } cm.Put32(0); // EOF @@ -407,8 +415,8 @@ void Decompress() int cnt[257]; int bsize=0; - U8* buf=nullptr; - U32* ptr=nullptr; + U8* buf=NULL; + int* ptr=NULL; cm.Init(); @@ -417,9 +425,9 @@ void Decompress() { if (!bsize) { - if ((bsize=n)>=(1<<24)) // 5*N - buf=MemAlloc(bsize); - ptr=MemAlloc(bsize); + bsize=n; + buf=MemAlloc(bsize); + ptr=MemAlloc(bsize); } const int idx=cm.Get32(); @@ -431,52 +439,39 @@ void Decompress() // Inverse BW-transform - if (n>=(1<<24)) // 5*N + memset(cnt, 0, sizeof(cnt)); + for (int i=0; i=idx)]; - crc.Update(c); - putc(c, out); + if (cnt[c+half]<=p) + c+=half+1; + half>>=1; } + buf[i]=c; + p=ptr[p]; } - else // 4*N + + crc.Update(buf, n); + + if (fwrite(buf, 1, n, out)!=n) { - memset(cnt, 0, sizeof(cnt)); - for (int i=0; i>8; - const int c=ptr[p-(p>=idx)]; - crc.Update(c); - putc(c, out); - } + perror("Fwrite() failed"); + exit(1); } - fprintf(stderr, "%lld -> %lld\r", _ftelli64(in), 
_ftelli64(out)); + fprintf(stderr, "%lld -> %lld\r", ftello64(in), ftello64(out)); } if (cm.Get32()!=crc()) @@ -485,8 +480,7 @@ void Decompress() exit(1); } - if (buf) - free(buf); + free(buf); free(ptr); } @@ -494,7 +488,7 @@ int main(int argc, char** argv) { const clock_t start=clock(); - int level=4; + int bsize=1<<24; // 16 MB int decompress=0; int overwrite=0; @@ -504,6 +498,7 @@ int main(int argc, char** argv) { switch (argv[1][i]) { + case '0': case '1': case '2': case '3': @@ -513,7 +508,14 @@ int main(int argc, char** argv) case '7': case '8': case '9': - level=argv[1][i]-'0'; + break; // Skip + case 'b': + bsize=atoi(&argv[1][i+1])<<20; + if (bsize<1) + { + fprintf(stderr, "Block size is out of range\n"); + exit(1); + } break; case 'd': decompress=1; @@ -526,7 +528,6 @@ int main(int argc, char** argv) exit(1); } } - --argc; ++argv; } @@ -534,15 +535,15 @@ int main(int argc, char** argv) if (argc<2) { fprintf(stderr, - "BCM - A BWT-based file compressor, v1.60\n" + "BCM - A BWT-based file compressor, v1.65\n" "Copyright (C) 2008-2021 Ilya Muravyov\n" "\n" - "Usage: BCM [options] infile [outfile]\n" + "Usage: bcm [options] infile [outfile]\n" "\n" "Options:\n" - " -1 .. -9 Set block size to 1 MB .. 
2 GB\n" - " -d Decompress\n" - " -f Force overwrite of output file\n"); + " -b# Set block size to # MB (default: 16)\n" + " -d Decompress\n" + " -f Force overwrite of output file\n"); exit(1); } @@ -591,10 +592,9 @@ int main(int argc, char** argv) if (decompress) { - if (getc(in)!=magic[0] - || getc(in)!=magic[1] - || getc(in)!=magic[2] - || getc(in)!=magic[3]) + int id; + fread(&id, 1, sizeof(id), in); + if (id!=BCM_ID) { fprintf(stderr, "%s: Not in BCM format\n", argv[1]); exit(1); @@ -620,25 +620,23 @@ int main(int argc, char** argv) exit(1); } - putc(magic[0], out); - putc(magic[1], out); - putc(magic[2], out); - putc(magic[3], out); + const int id=BCM_ID; + fwrite(&id, 1, sizeof(id), out); fprintf(stderr, "Compressing '%s':\n", argv[1]); - Compress(level); + Compress(bsize); } - fprintf(stderr, "%lld -> %lld in %1.1f sec\n", - _ftelli64(in), _ftelli64(out), double(clock()-start)/CLOCKS_PER_SEC); + fprintf(stderr, "%lld -> %lld in %.1f sec\n", + ftello64(in), ftello64(out), double(clock()-start)/CLOCKS_PER_SEC); fclose(in); fclose(out); #ifndef NO_UTIME - struct _stati64 sb; - if (_stati64(argv[1], &sb)) + struct stat64 sb; + if (stat64(argv[1], &sb)) { perror("Stat() failed"); exit(1);