Files
imgui_transcript_browser/src/pdf_browser/read_pdf.py
2024-06-20 08:17:54 +02:00

37 lines
1.0 KiB
Python

import os, sys
import pypdf
# Usage check: the script expects exactly one command-line argument, which is
# either a single PDF file or a folder containing PDF files.
if len(sys.argv) != 2:
    print("expected single argument with filename or folder to extract text from")
    # Use sys.exit rather than the bare `exit` helper: the latter is injected
    # by the `site` module for interactive use and is not guaranteed to exist.
    # A usage error is reported with a non-zero status so callers can detect it.
    sys.exit(1)
def write_pdf_for(filename: str) -> None:
    """Extract the text of a PDF into a sibling ``<filename>.txt`` file.

    Each page is written as a marker line ``>>>>>>>>N<<<<<<<<<`` (N is the
    1-based page number) followed by the page text on a single line.

    If the output file already exists, nothing is read or written and a
    "skipping" message is printed instead.

    Args:
        filename: Path to the PDF file to read. The output path is this
            value with ``.txt`` appended.
    """
    extract_filename = filename + ".txt"

    # Check for existing output first so a skipped file never pays the cost
    # of (or fails on) parsing the PDF.
    if os.path.exists(extract_filename):
        print(f"skipping {extract_filename}, file exists")
        return

    reader = pypdf.PdfReader(filename)

    # Collect all output before touching the disk: if extraction fails half
    # way, no partial .txt is left behind to be silently skipped next run.
    chunks = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Flatten the page onto one line.
        text = text.replace("\n", "")
        # NOTE(review): this removes every hyphen, not only line-break
        # hyphenation -- kept as in the original; confirm that is intended.
        text = text.replace("-", "")
        chunks.append(f">>>>>>>>{page_number}<<<<<<<<<\n")
        chunks.append(text)
        chunks.append("\n")

    # The context manager guarantees the handle is closed even if a write fails.
    with open(extract_filename, "w", encoding="utf-8") as f:
        f.writelines(chunks)

    print(f"generated: {extract_filename}")
# Dispatch on the argument: convert every *.pdf directly inside a folder, or
# convert a single file; anything else is reported as an invalid argument.
if os.path.isdir(sys.argv[1]):
    for file in os.listdir(sys.argv[1]):
        if file.endswith(".pdf"):
            # os.listdir yields bare entry names, so they must be joined with
            # the folder; otherwise they resolve against the current working
            # directory instead of the folder that was passed in.
            write_pdf_for(os.path.join(sys.argv[1], file))
elif os.path.isfile(sys.argv[1]):
    write_pdf_for(sys.argv[1])
else:
    print(f"argument you passed in: {sys.argv[1]} is not a filename or folder")