37 lines
1.0 KiB
Python
37 lines
1.0 KiB
Python
import os, sys
|
|
import pypdf
|
|
|
|
# The script takes exactly one positional argument: a PDF file or a folder
# containing PDF files.
if len(sys.argv) != 2:
    print("expected single argument with filename or folder to extract text from")
    # Exit with a non-zero status so shells and calling scripts can detect the
    # usage error; sys.exit is used because the bare `exit` helper is only
    # installed by the `site` module and is not guaranteed to exist.
    sys.exit(1)
|
|
|
|
|
|
def write_pdf_for(filename: str) -> None:
    """Extract the text of a PDF into a sibling ``<filename>.txt`` file.

    Each page is written as a ``>>>>>>>>N<<<<<<<<<`` marker line (N is the
    1-based page number) followed by the page text on a single line, with all
    newline and ``-`` characters removed from the text.

    If the target ``.txt`` file already exists, nothing is read or written
    and a "skipping" message is printed instead.

    Args:
        filename: Path of the PDF file to read.
    """
    extract_filename = filename + ".txt"

    # Check for existing output first so a skipped file costs no PDF parsing.
    if os.path.exists(extract_filename):
        print(f"skipping {extract_filename}, file exists")
        return

    reader = pypdf.PdfReader(filename)

    # Extract every page before creating the output file: if extraction
    # raises, no partial .txt is left behind to be skipped on the next run.
    chunks = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        text = text.replace("\n", "").replace("-", "")
        chunks.append(f">>>>>>>>{page_number}<<<<<<<<<\n{text}\n")

    # The context manager guarantees the handle is closed even on write errors.
    with open(extract_filename, "w", encoding="utf-8") as f:
        f.writelines(chunks)

    print(f"generated: {extract_filename}")
|
|
|
|
# Dispatch on the kind of path given: a folder converts every ".pdf" entry
# directly inside it, a regular file is converted on its own.
target = sys.argv[1]
if os.path.isdir(target):
    for entry in os.listdir(target):
        if entry.endswith(".pdf"):
            # os.listdir() yields bare entry names, so join each one back onto
            # the folder; otherwise it would be resolved against the current
            # working directory instead of the folder that was passed in.
            write_pdf_for(os.path.join(target, entry))
elif os.path.isfile(target):
    write_pdf_for(target)
else:
    print(f"argument you passed in: {target} is not a filename or folder")