
Jeff R. answered 07/28/22
Tutor
New to Wyzant
Principal Engineer II @ Fortune 100 Company, B.S. Comp Science & Math
Here's the solution, with explanations given as comments in the code:
from os import listdir
from os.path import isfile, join
import PyPDF2 # python3 -m pip install PyPDF2
# Convert every PDF file in one folder into a plain-text file stored next to it.
#
# Change `path` to the local location of your folder, for example:
#   path = r'C:\Users\FooBar\Desktop\test-folder'  # for Windows (raw string)
#   path = '/Users/FooBar/Desktop/test-folder'     # for Mac
#
# The examples are written as comments, not as a triple-quoted string: inside
# a normal (non-raw) string literal, '\U' starts a Unicode escape, so a
# Windows path such as the one above makes the script fail with a SyntaxError.
path = ''  # must be set before running; listdir('') raises FileNotFoundError

PDF_SUFFIX = '.pdf'

# Make a list of paths (the folder joined with each file name) for all of the
# PDF files in the target folder. The extension check is case-insensitive so
# files named like 'REPORT.PDF' are picked up as well.
files = [
    join(path, f)
    for f in listdir(path)
    if isfile(join(path, f)) and f.lower().endswith(PDF_SUFFIX)
]

# Iterate through the list of PDF files using each PDF's path
for f in files:
    with open(f, 'rb') as file_handle:
        # Set strict=False to allow PDF files that don't comply to the PDF spec: https://www.pdfa.org/resource/pdf-specification-index/
        # PdfReader / .pages replace the legacy PdfFileReader / getNumPages /
        # getPage names, which newer PyPDF2 releases no longer provide.
        pdf_reader = PyPDF2.PdfReader(file_handle, strict=False)
        # Extract the text of every page while the PDF is still open and
        # concatenate it into one plain-text string.
        page_text = ''.join(page.extract_text() for page in pdf_reader.pages)

    # Swap only the trailing extension; str.replace would also rewrite any
    # '.pdf' that happens to appear earlier in the path (e.g. a folder name).
    text_path = f[:-len(PDF_SUFFIX)] + '.txt'

    # Write the plain-text string to a file with the same base name.
    # 'w' (not 'a+') so re-running the script does not append duplicate text;
    # UTF-8 so characters outside the platform default encoding can be saved.
    with open(text_path, 'w', encoding='utf-8') as text_file_handle:
        text_file_handle.write(page_text)