:: commit f0091e81ce05168e66da6d78ba84baa65b9e8564

Kamila Szewczyk <kspalaiologos@gmail.com> — 2022-05-02 18:32

parents: 0b96721f0b

(yet unused) text detection preset

diff --git a/Makefile b/Makefile
index ab7b001..e4e8a8b 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ PREFIX?=/usr/local
 .PHONY: all clean format install
 
 OBJECTS=obj/main.o obj/libsais.o obj/crc32.o obj/mtf.o obj/srt.o obj/rle.o \
-        obj/cm.o obj/libbz3.o
+        obj/cm.o obj/libbz3.o obj/txt.o
 
 all: bzip3
 
@@ -14,7 +14,7 @@ obj/%.o: src/%.c
 	$(CC) $(CFLAGS) -c $< -o $@
 
 bzip3: $(OBJECTS)
-	$(CC) $(CFLAGS) -o $@ $^
+	$(CC) $(CFLAGS) -o $@ $^ -lm
 
 clean:
 	rm -f bzip3 obj/*.o
diff --git a/include/txt.h b/include/txt.h
new file mode 100644
index 0000000..ef92a03
--- /dev/null
+++ b/include/txt.h
@@ -0,0 +1,9 @@
+
+#ifndef _TXT_H
+#define _TXT_H
+
+#include "common.h"
+
+/*
+ * Heuristic plain-text detector.
+ *
+ * buffer: bytes to inspect (only read, never modified).
+ * length: number of bytes in buffer.
+ *
+ * Returns 1 when the byte statistics of the buffer (entropy, share of
+ * ASCII alphanumerics and whitespace, alphanumeric-to-whitespace ratio)
+ * look like text, 0 otherwise.
+ */
+int is_text(const u8 * buffer, s32 length);
+
+#endif
diff --git a/src/libbz3.c b/src/libbz3.c
index 46c6193..460569b 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -11,6 +11,7 @@
 #include "crc32.h"
 #include "libsais.h"
 #include "mtf.h"
+#include "txt.h"
 #include "rle.h"
 #include "srt.h"
 
diff --git a/src/srt.c b/src/srt.c
index 1703034..c5c1e38 100644
--- a/src/srt.c
+++ b/src/srt.c
@@ -30,10 +30,9 @@ static s32 preprocess(const u32 * freqs, u8 * symbols) {
     while (1) {
         h /= 3;
         for (u32 i = h; i < nb_symbols; i++) {
-            const s32 t = symbols[i] & 0xFF;
+            const s32 t = symbols[i];
             s32 b = i - h;
-            while ((b >= 0) && freqs[symbols[b]] < freqs[t] ||
-                   (freqs[t] == freqs[symbols[b]]) && t < symbols[b]) {
+            while ((b >= 0) && (freqs[symbols[b]] < freqs[t] || (freqs[t] == freqs[symbols[b]] && t < symbols[b]))) {
                 symbols[b + h] = symbols[b];
                 b -= h;
             }
diff --git a/src/txt.c b/src/txt.c
new file mode 100644
index 0000000..7968733
--- /dev/null
+++ b/src/txt.c
@@ -0,0 +1,85 @@
+
+#include "txt.h"
+
+#include <math.h>
+
+#ifdef STANDALONE
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+/*
+ * Heuristically decide whether a buffer looks like plain text.
+ *
+ * The decision is based on a byte histogram of the input and three criteria:
+ *   1. The Shannon entropy (bits per byte) lies within [4.5, 5.4].
+ *   2. ASCII letters, digits and whitespace together make up at least 60%
+ *      of the buffer.
+ *   3. The ratio of alphanumerics to whitespace lies within [2, 9].
+ *
+ * data: bytes to inspect (only read).
+ * len:  number of bytes in data.
+ *
+ * Returns 1 if all criteria hold, 0 otherwise. An empty (or negative-length)
+ * buffer is never considered text.
+ */
+int is_text(const u8 * data, s32 len) {
+#ifdef STANDALONE
+    printf("Data of length %d.\n", (int)len);
+#endif
+
+    // Nothing to analyse. Without this guard every ratio below would be
+    // 0/0 = NaN, all range checks would compare false, and the empty buffer
+    // would be reported as text.
+    if(len <= 0)
+        return 0;
+
+    s32 histogram[256] = { 0 };
+    for(s32 i = 0; i < len; i++)
+        histogram[data[i]]++;
+
+    // Step 1: Shannon entropy of the byte distribution.
+    double entropy = 0;
+    for(s32 i = 0; i < 256; i++) {
+        // Absent symbols contribute nothing (and log2(0) is undefined).
+        if(histogram[i] == 0) continue;
+        double p = (double)histogram[i] / len;
+        entropy += p * log2(p);
+    }
+    entropy = -entropy;
+
+#ifdef STANDALONE
+    printf("Shannon entropy: %lf\n", entropy);
+#endif
+
+    if(entropy > 5.4 || entropy < 4.5)
+        return 0;
+
+    // Step 2: count alphanumerics ("letters") and whitespace.
+    s32 letters = 0;
+    s32 whitespace = 0;
+    for(s32 i = 0; i < 256; i++) {
+        if(i >= 'A' && i <= 'Z')
+            letters += histogram[i];
+        else if(i >= 'a' && i <= 'z')
+            letters += histogram[i];
+        else if(i >= '0' && i <= '9')
+            letters += histogram[i];
+        else if(i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\v')
+            whitespace += histogram[i];
+    }
+
+#ifdef STANDALONE
+    printf("Letters: %d, whitespace: %d, text to other ratio: %lf\n", letters, whitespace, (double)(letters+whitespace) / len);
+#endif
+
+    // The majority of the buffer must be alphanumerics or whitespace.
+    if((double)(letters+whitespace) / len < 0.6)
+        return 0;
+
+    // Step 3: the buffer must contain a plausible amount of whitespace.
+    // No whitespace at all means the ratio is unbounded, i.e. above the
+    // upper limit; reject explicitly instead of dividing by zero.
+    if(whitespace == 0)
+        return 0;
+
+    double letters_ratio = (double)letters / whitespace;
+    if(letters_ratio < 2 || letters_ratio > 9)
+        return 0;
+
+#ifdef STANDALONE
+    printf("Letter to whitespace ratio: %lf\n", letters_ratio);
+#endif
+
+    return 1;
+}
+
+#ifdef STANDALONE
+/*
+ * Standalone driver: reads the file named by argv[1] into memory and prints
+ * the result of is_text() for it (1 = text, 0 = not text).
+ *
+ * Returns 0 on success, 1 on a usage or I/O error.
+ */
+int main(int argc, char * argv[]) {
+    if(argc < 2) {
+        fprintf(stderr, "Usage: %s <file>\n", argc > 0 ? argv[0] : "txt");
+        return 1;
+    }
+
+    FILE * f = fopen(argv[1], "rb");
+    if(f == NULL) {
+        perror(argv[1]);
+        return 1;
+    }
+
+    // Determine the file size by seeking to the end.
+    if(fseek(f, 0, SEEK_END) != 0) {
+        perror("fseek");
+        fclose(f);
+        return 1;
+    }
+    long size = ftell(f);
+    // is_text takes a 32-bit signed length, so larger files cannot be passed.
+    if(size < 0 || size > 0x7FFFFFFFL) {
+        fprintf(stderr, "%s: cannot determine size or file too large\n", argv[1]);
+        fclose(f);
+        return 1;
+    }
+    if(fseek(f, 0, SEEK_SET) != 0) {
+        perror("fseek");
+        fclose(f);
+        return 1;
+    }
+
+    // Allocate at least one byte so an empty file does not look like an
+    // allocation failure (malloc(0) may return NULL).
+    u8 * data = malloc(size > 0 ? (size_t)size : 1);
+    if(data == NULL) {
+        fprintf(stderr, "Out of memory\n");
+        fclose(f);
+        return 1;
+    }
+
+    size_t got = fread(data, 1, (size_t)size, f);
+    fclose(f);
+    if(got != (size_t)size) {
+        fprintf(stderr, "%s: short read\n", argv[1]);
+        free(data);
+        return 1;
+    }
+
+    printf("%d\n", is_text(data, (s32)size));
+    free(data);
+    return 0;
+}
+#endif
tab: 248 wrap: offon