From bd97ecb9eb047e9fc58706616d70672668a38e08 Mon Sep 17 00:00:00 2001 From: Krzosa Karol Date: Fri, 12 Jul 2024 08:27:24 +0200 Subject: [PATCH] Big update --- src/transcript_browser/read_pdf.cpp | 76 +++++++++++++++++++++++++++++ src/transcript_browser/read_srt.cpp | 63 ++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 src/transcript_browser/read_pdf.cpp create mode 100644 src/transcript_browser/read_srt.cpp diff --git a/src/transcript_browser/read_pdf.cpp b/src/transcript_browser/read_pdf.cpp new file mode 100644 index 0000000..938bb21 --- /dev/null +++ b/src/transcript_browser/read_pdf.cpp @@ -0,0 +1,76 @@ +struct PDFPage { + String string; + int number; +}; + +struct PDF { + String filename; + Array pages; +}; + +// @todo: pull request the object close +// @todo: something needs to be done with unicode codepoints +PDF pdfioReadPDF(Allocator allocator, String filename) { + Assert(filename.data[filename.len] == 0); + PDF result = {}; + result.pages = {allocator}; + result.filename = filename; + + pdfio_file_t *file = pdfioFileOpen(filename.data, NULL, NULL, NULL, NULL); + Assert(file); + defer { pdfioFileClose(file); }; + + char buffer[1024]; + size_t page_count = pdfioFileGetNumPages(file); + result.pages.reserve(page_count); + + for (size_t page_i = 0; page_i < page_count; page_i += 1) { + pdfio_obj_t *obj = pdfioFileGetPage(file, page_i); + if (obj == NULL) continue; + defer { pdfioObjClose(obj); }; + PDFPage *page = result.pages.alloc(); + page->number = (int)page_i; + + Scratch scratch((Arena *)allocator.object); + Array strings = {scratch}; + size_t num_streams = pdfioPageGetNumStreams(obj); + for (size_t stream_i = 0; stream_i < num_streams; stream_i += 1) { + pdfio_stream_t *st = pdfioPageOpenStream(obj, stream_i, true); + if (st == NULL) continue; + defer { pdfioStreamClose(st); }; + + bool first = true; + while (pdfioStreamGetToken(st, buffer, sizeof(buffer))) { + if (buffer[0] == '(') { + if (first) { + first = false; + } + + strings.add(Copy(scratch, buffer + 1)); + } else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || !strcmp(buffer, "\'") || !strcmp(buffer, "\"")) { + first = true; + } + } + } + page->string = Merge(allocator, strings, ""); + } + + return result; +} + +#if 1 +extern "C" void OutputDebugStringA(const char *); +void Printf(const char *string, ...) { + Scratch scratch; + STRING_FORMAT(scratch, string, result); + OutputDebugStringA(result.data); +} +#else + #define Printf(...) (0) +#endif + +void PrintPDF(PDF pdf) { + For(pdf.pages) { + Printf("%d, %.*s\n", it.number, FmtString(it.string)); + } +} \ No newline at end of file diff --git a/src/transcript_browser/read_srt.cpp b/src/transcript_browser/read_srt.cpp new file mode 100644 index 0000000..b198dec --- /dev/null +++ b/src/transcript_browser/read_srt.cpp @@ -0,0 +1,63 @@ +struct TimeString { + uint16_t hour; + uint16_t minute; + uint16_t second; + String string; +}; + +Array ParseSrtFile(Arena *arena, String filename) { + String content = ReadFile(*arena, filename); + Array lines = Split(*arena, content, "\n"); + + IterRemove(lines) { + IterRemovePrepare(lines); + it = Trim(it); + if (it.len == 0) remove_item = true; + } + + long section_number = 1; + Array time_strings = {*arena}; + for (int i = 0; i < lines.len;) { + String it0 = lines[i++]; + long num = strtol(it0.data, NULL, 10); + Assert(section_number == num); + section_number += 1; + + TimeString item = {}; + String it1 = lines[i++]; + item.hour = (uint16_t)strtol(it1.data, NULL, 10); + item.minute = (uint16_t)strtol(it1.data + 3, NULL, 10); + item.second = (uint16_t)strtol(it1.data + 6, NULL, 10); + + String next_section_number = Format(*arena, "%d", section_number); + while (i < lines.len && lines[i] != next_section_number) { + String it = lines[i]; + item.string = lines[i]; + time_strings.add(item); + i += 1; + } + } + + IterRemove(time_strings) { + IterRemovePrepare(time_strings); + if (i > 0 && AreEqual(time_strings[i - 1].string, time_strings[i].string, true)) { + remove_item = true; + } + } + + return time_strings; +} + +String FindVideoForSRT(Array &filenames, String srt_path) { + String base = ChopLastPeriod(srt_path); // .srt + base = ChopLastPeriod(base); // .en + + For(filenames) { + if (StartsWith(it, base)) { + if (EndsWith(it, ".mkv") || EndsWith(it, ".webm") || EndsWith(it, ".mp4")) { + return it; + } + } + } + return {}; +}