(yet unused) text detection preset
diff --git a/Makefile b/Makefile
index ab7b001..e4e8a8b 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
.PHONY: all clean format install
OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
- obj/cm.o obj/libbz3.o
+ obj/cm.o obj/libbz3.o obj/txt.o
all: bzip3
@@ -14,7 +14,7 @@ obj/%.o: src/%.c
$(CC) $(CFLAGS) -c $< -o $@
bzip3: $(OBJECTS)
- $(CC) $(CFLAGS) -o $@ $^
+ $(CC) $(CFLAGS) -o $@ $^ -lm
clean:
rm -f bzip3 obj/*.o
diff --git a/include/txt.h b/include/txt.h
new file mode 100644
index 0000000..ef92a03
--- /dev/null
+++ b/include/txt.h
@@ -0,0 +1,9 @@
+/* Declaration of the text-detection routine implemented in src/txt.c. */
+#ifndef _TXT_H
+#define _TXT_H
+
+#include "common.h"
+/* Returns 1 when the `length` bytes at `buffer` pass the text heuristics, 0 otherwise. */
+int is_text(const u8 * buffer, s32 length);
+
+#endif
diff --git a/src/libbz3.c b/src/libbz3.c
index 46c6193..460569b 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -11,6 +11,7 @@
#include "crc32.h"
#include "libsais.h"
#include "mtf.h"
+#include "txt.h"
#include "rle.h"
#include "srt.h"
diff --git a/src/srt.c b/src/srt.c
index 1703034..c5c1e38 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -30,10 +30,9 @@ static s32 preprocess(const u32 * freqs, u8 * symbols) {
while (1) {
h /= 3;
for (u32 i = h; i < nb_symbols; i++) {
- const s32 t = symbols[i] & 0xFF;
+ const s32 t = symbols[i];
s32 b = i - h;
- while ((b >= 0) && freqs[symbols[b]] < freqs[t] ||
- (freqs[t] == freqs[symbols[b]]) && t < symbols[b]) {
+ while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
symbols[b + h] = symbols[b];
b -= h;
}
diff --git a/src/txt.c b/src/txt.c
new file mode 100644
index 0000000..7968733
--- /dev/null
+++ b/src/txt.c
@@ -0,0 +1,118 @@
+
+#include "txt.h"
+
+#include <math.h>
+
+#ifdef STANDALONE
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+/*
+ * Heuristically decides whether a buffer holds plain text.
+ *
+ * A buffer is accepted only when all three criteria hold:
+ *   1. The Shannon entropy of its byte histogram is between 4.5 and 5.4 bits.
+ *   2. At least 60% of its bytes are ASCII letters, digits or whitespace.
+ *   3. There are between 2 and 9 letters/digits per whitespace byte.
+ *
+ * data: bytes to inspect; len: number of bytes in data.
+ * Returns 1 when the buffer looks like text, 0 otherwise. Empty buffers
+ * (len <= 0) are never reported as text.
+ */
+int is_text(const u8 * data, s32 len) {
+#ifdef STANDALONE
+    printf("Data of length %d.\n", len);
+#endif
+
+    // Nothing to measure; this also keeps every division by len below well defined.
+    if (len <= 0) return 0;
+
+    s32 histogram[256] = { 0 };
+    for (s32 i = 0; i < len; i++) histogram[data[i]]++;
+
+    // Step 1: Shannon entropy, H = -sum(p * log2(p)) over the byte values that occur.
+    double entropy = 0;
+    for (s32 i = 0; i < 256; i++) {
+        if (histogram[i] == 0) continue;  // absent byte values contribute nothing to the sum
+        double p = (double)histogram[i] / len;
+        entropy -= p * log2(p);
+    }
+
+#ifdef STANDALONE
+    printf("Shannon entropy: %lf\n", entropy);
+#endif
+
+    if (entropy > 5.4 || entropy < 4.5) return 0;
+
+    // Step 2: letters, digits and whitespace must make up most of the data.
+    s32 letters = 0;
+    s32 whitespace = 0;
+    for (s32 i = 0; i < 256; i++) {
+        if ((i >= 'A' && i <= 'Z') || (i >= 'a' && i <= 'z') || (i >= '0' && i <= '9'))
+            letters += histogram[i];
+        else if (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
+            whitespace += histogram[i];
+    }
+
+#ifdef STANDALONE
+    printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace,
+           (double)(letters + whitespace) / len);
+#endif
+
+    if ((double)(letters + whitespace) / len < 0.6) return 0;
+
+    // Step 3: the amount of whitespace must be plausible for text.
+    // Without any whitespace the ratio is unbounded, which is outside the accepted range.
+    if (whitespace == 0) return 0;
+
+    double letters_ratio = (double)letters / whitespace;
+
+#ifdef STANDALONE
+    printf("Letter to whitespace ratio: %lf\n", letters_ratio);
+#endif
+
+    if (letters_ratio < 2 || letters_ratio > 9) return 0;
+
+    return 1;
+}
+
+#ifdef STANDALONE
+/* Debug driver: prints the is_text() verdict (0 or 1) for the file named by argv[1]. */
+int main(int argc, char * argv[]) {
+    if (argc < 2) {
+        fputs("Expected a file name as the only argument.\n", stderr);
+        return 1;
+    }
+
+    FILE * f = fopen(argv[1], "rb");
+    if (f == NULL) {
+        perror(argv[1]);
+        return 1;
+    }
+
+    u8 * data = NULL;
+    int status = 1;
+
+    long size = fseek(f, 0, SEEK_END) == 0 ? ftell(f) : -1;
+    if (size < 0 || size > 0x7FFFFFFF || fseek(f, 0, SEEK_SET) != 0) {
+        fprintf(stderr, "%s: cannot determine a usable file size\n", argv[1]);
+        goto done;
+    }
+
+    // malloc(0) may legitimately return NULL, so always request at least one byte.
+    data = malloc(size > 0 ? (size_t)size : 1);
+    if (data == NULL || fread(data, 1, (size_t)size, f) != (size_t)size) {
+        fprintf(stderr, "%s: cannot read file\n", argv[1]);
+        goto done;
+    }
+
+    printf("%d\n", is_text(data, (s32)size));
+    status = 0;
+
+done:
+    free(data);
+    fclose(f);
+    return status;
+}
+#endif
