Reading multiple directories in transcript browser

This commit is contained in:
Krzosa Karol
2024-06-19 10:40:20 +02:00
parent 055be9b058
commit 7fe6aa2a97
5 changed files with 88 additions and 173 deletions

View File

@@ -1,11 +1,12 @@
#define BASIC_IMPL
#include "basic.h"
#include "pdfio.h"
#include <string.h>
#include "read_pdf.cpp"
// Program entry point. The command-line arguments are not used.
int main(int argc, char *argv[]) {
    // InitScratch() runs before the first Scratch object is constructed.
    // NOTE(review): presumably it prepares the storage Scratch draws from -
    // confirm in basic.h.
    InitScratch();
    Scratch scratch;
    return 0;
}

View File

@@ -1,8 +1,3 @@
// One page of text extracted from a PDF, as filled in by ReadPDF.
struct PdfPage {
// Text of the page: ReadPDF merges the page's string tokens with an empty
// separator and stores the result here.
String content;
// 1-based page number (ReadPDF stores the zero-based page index + 1).
int64_t number;
};
String Merge(Allocator allocator, Array<String> list, String separator = " ") {
int64_t char_count = 0;
For(list) char_count += it.len;
@@ -27,52 +22,3 @@ String Merge(Allocator allocator, Array<String> list, String separator = " ") {
string.data[size] = 0;
return string;
}
// Extracts the text of every page of the PDF at `filename`.
//
// For each page, every token that pdfioStreamGetToken reports with a leading
// '(' (a string token) is collected from all of the page's content streams,
// with the '(' dropped, and the pieces are merged with an empty separator.
// Other tokens - including the text-positioning operators Td, TD, T*, ' and "
// - are ignored, so no spaces or line breaks are inserted between strings.
//
// allocator: receives the returned array and each page's merged `content`.
// filename:  path of the PDF to open.
//
// Returns one PdfPage per page object that could be fetched. Pages whose
// object is NULL are skipped, so `number` (1-based) may have gaps. Returns an
// empty array when the file cannot be opened.
Array<PdfPage> ReadPDF(Allocator allocator, String filename) {
    Scratch scratch;

    char *filename_char = NullTerminate(scratch, filename);
    pdfio_file_t *file = pdfioFileOpen(filename_char, NULL, NULL, NULL, NULL);
    if (file == NULL) return {};
    defer { pdfioFileClose(file); };

    Array<PdfPage> pages = {allocator};
    char buffer[1024]; // receives one token at a time

    // pdfio reports page and stream counts as size_t; keep the indices in the
    // same type to avoid narrowing and signed/unsigned comparisons.
    size_t num_pages = pdfioFileGetNumPages(file);
    for (size_t i = 0; i < num_pages; i++) {
        pdfio_obj_t *obj = pdfioFileGetPage(file, i);
        if (obj == NULL) continue;

        PdfPage *page = pages.alloc();
        page->number = (int64_t)i + 1;

        // Intermediate pieces live in scratch; only the merged result is
        // allocated from the caller's allocator.
        Array<String> content = {scratch};
        size_t num_streams = pdfioPageGetNumStreams(obj);
        for (size_t j = 0; j < num_streams; j++) {
            pdfio_stream_t *st = pdfioPageOpenStream(obj, j, true);
            if (st == NULL) continue;
            defer { pdfioStreamClose(st); };

            while (pdfioStreamGetToken(st, buffer, sizeof(buffer))) {
                // Keep string tokens only, without the leading '('.
                if (buffer[0] == '(') content.add(Copy(scratch, buffer + 1));
            }
        }

        page->content = Merge(allocator, content, "");
    }
    return pages;
}

View File

@@ -0,0 +1,16 @@
import pypdf
# Dump the text of every page of a PDF into "asd.txt" (UTF-8).
# Each page is written as a ">>>>>>>>N<<<<<<<<<" header line (N is the
# 1-based page number) followed by the page text on a single line.
reader = pypdf.PdfReader("C:/Users/Karol/Desktop/Hegels-Logic.pdf")

# `with` closes the output file even if text extraction raises part-way.
with open("asd.txt", "w", encoding="utf-8") as f:
    for number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Flatten the page to one line and drop every hyphen.
        # NOTE(review): this presumably targets end-of-line hyphenation, but
        # it also removes hyphens inside ordinary words - confirm intent.
        text = text.replace("\n", "").replace("-", "")
        f.write(f">>>>>>>>{number}<<<<<<<<<\n")
        f.write(text)
        f.write("\n")

View File

@@ -8,6 +8,8 @@
#include <semaphore>
#include <mutex>
Arena Perm;
struct TimeString {
uint16_t hour;
uint16_t minute;
@@ -87,21 +89,17 @@ struct XToTimeString {
uint16_t second;
String filepath;
};
Arena XArena;
Arena Perm;
Array<XToTimeString> XToTimeStringArray = {Perm};
bool AppInitializedWithFolder;
Arena XArena;
Array<String> InitForFolder(String folder) {
void AddFolder(String folder, Array<String> *filenames, Array<XToTimeString> *x_to_time_string) {
Scratch scratch;
Array<String> filenames = {Perm};
Array<String> srt_files = {scratch};
for (FileIter iter = IterateFiles(scratch, folder); IsValid(iter); Advance(&iter)) {
filenames.add(Copy(Perm, iter.absolute_path));
String file = Copy(Perm, iter.absolute_path);
filenames->add(file);
if (EndsWith(iter.filename, ".srt")) {
srt_files.add(Copy(scratch, iter.absolute_path));
srt_files.add(file);
}
}
@@ -135,14 +133,11 @@ Array<String> InitForFolder(String folder) {
For(it_time_file.time_strings) {
String s = Copy(XArena, it.string);
s.data[s.len] = ' ';
XToTimeStringArray.add({s, it.hour, it.minute, it.second, it_time_file.file});
x_to_time_string->add({s, it.hour, it.minute, it.second, it_time_file.file});
}
}
Release(it_io.arena);
}
AppInitializedWithFolder = true;
return filenames;
}
//
@@ -204,13 +199,14 @@ int main() {
Arena *frame_arena = AllocArena();
XArena.align = 0;
String start_string = "C:/video";
String start_string = "read=D:/zizek";
For(start_string) Prompt.add(it);
std::thread search_thread(SearchThreadEntry);
int64_t chosen_text = 0;
int64_t match_search_offset = 0;
Array<String> filenames = {};
std::thread search_thread(SearchThreadEntry);
int64_t chosen_text = 0;
int64_t match_search_offset = 0;
Array<String> filenames = {};
Array<XToTimeString> x_to_time_string = {};
InitWindow(1920, 1080, "Transcript Browser");
SetWindowState(FLAG_WINDOW_RESIZABLE);
@@ -261,9 +257,10 @@ int main() {
match_search_offset = Clamp(match_search_offset, (int64_t)0, Max(Matches.len - 1 - 10, (int64_t)0));
if (IsKeyPressed(KEY_ENTER)) {
if (!AppInitializedWithFolder) {
String prompt = {Prompt.data, Prompt.len};
if (StartsWith(prompt, "read=")) {
Prompt.add('\0');
filenames = InitForFolder(Prompt.data);
AddFolder(prompt.skip(5), &filenames, &x_to_time_string);
Prompt.clear();
} else if (ItemFound) {
String base = ChopLastPeriod(ItemFound->filepath); // .srt
@@ -284,7 +281,7 @@ int main() {
}
} else if (Matches.len) {
String string = Matches[chosen_text];
For(XToTimeStringArray) {
For(x_to_time_string) {
uintptr_t begin = (uintptr_t)(it.string.data);
uintptr_t end = (uintptr_t)(it.string.data + it.string.len);
uintptr_t needle = (uintptr_t)string.data;
@@ -303,12 +300,7 @@ int main() {
float y = 0;
int xwidth = MeasureTextEx(font, "_", font_size, 1).x;
if (!AppInitializedWithFolder) {
Prompt.add('\0');
DrawTextEx(font, "> ", {0, y}, font_size, 1, BLACK);
DrawTextEx(font, Prompt.data, {(float)xwidth * 3, y}, font_size, 1, BLACK);
Prompt.len -= 1;
} else if (ItemFound) {
if (ItemFound) {
uintptr_t begin_region = (uintptr_t)XArena.data;
uintptr_t end_region = (uintptr_t)XArena.data + XArena.len;