There are many different Python packages for processing PDF files. I looked at a few of them. These examples all use Python 3.6.
- Dockerfile
- requirements.txt
Here are some examples for extracting information out of PDF files using Python.
Python packages for manipulating PDF
pdfminer.six - Python library that focuses on getting and analyzing text data, including a PDF converter to transform PDF files into text or HTML.
pdfrw - pure Python library that reads and writes PDFs
Split into pages
The PyPDF2 package works well for splitting the file into pages.
import sys
from subprocess import call
import PyPDF2
def split_into_pages(filename, outfile_prefix):
    """Write every page of a PDF to its own single-page PDF file.

    Page ``i`` (counting from 0) is saved as
    ``<outfile_prefix>_page_<i>.pdf``.

    Args:
        filename: Path of the PDF file to split.
        outfile_prefix: Prefix used to build each output file name.
    """
    with open(filename, "rb") as source:
        reader = PyPDF2.PdfFileReader(source)
        for page_number in range(reader.getNumPages()):
            # A fresh writer per page, so each output holds exactly one page.
            writer = PyPDF2.PdfFileWriter()
            writer.addPage(reader.getPage(page_number))
            target_name = outfile_prefix + "_page_" + str(page_number) + ".pdf"
            with open(target_name, "wb") as target:
                writer.write(target)
Create jpg for each page
- Split into pages and then convert to jpg
def split_into_jpg_pages(filename, outfile_prefix):
    """Save each page of a PDF as a one-page PDF, then convert it to a JPG.

    For page ``i`` (counting from 0) this writes
    ``<outfile_prefix>_page_<i>.pdf`` and then runs the external ``convert``
    program to produce ``<outfile_prefix>_page_<i>.jpg``. The intermediate
    PDF files are left on disk.

    Args:
        filename: Path of the PDF file to split.
        outfile_prefix: Prefix used to build each output file name.

    Returns:
        The JPG file names in page order. The exit status of ``convert`` is
        not checked, so a name is listed even if the conversion failed.
    """
    jpg_names = []
    with open(filename, "rb") as source:
        reader = PyPDF2.PdfFileReader(source)
        for index in range(reader.getNumPages()):
            stem = f"{outfile_prefix}_page_{index}"
            pdf_name = stem + ".pdf"
            jpg_name = stem + ".jpg"

            # Write the single page out as its own PDF first ...
            writer = PyPDF2.PdfFileWriter()
            writer.addPage(reader.getPage(index))
            with open(pdf_name, "wb") as pdf_out:
                writer.write(pdf_out)

            # ... then hand that PDF to the external 'convert' program.
            call(["convert", pdf_name, jpg_name])
            jpg_names.append(jpg_name)
    return jpg_names
Extract all text via PyPDF2
def extract_all_text(filename):
    """Return the text of every page of a PDF, one string per page.

    The text of each page is taken from PyPDF2's ``extractText()``. Then
    non-breaking spaces are replaced by ordinary spaces and every run of
    whitespace is collapsed to a single space.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A list of normalized page texts, in page order.
    """
    with open(filename, "rb") as source:
        reader = PyPDF2.PdfFileReader(source)
        raw_pages = [
            reader.getPage(number).extractText() + "\n"
            for number in range(reader.getNumPages())
        ]
    # split() with no arguments drops leading/trailing whitespace and
    # collapses internal runs, so joining with " " yields single spaces.
    return [
        " ".join(raw.replace(u"\xa0", u" ").strip().split())
        for raw in raw_pages
    ]
Extract all text via pdfminer
- Use tools/pdf2txt.py
This produces a cleaner text output.
Extracting JPGs from PDF, Quick and Dirty
From https://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html (Converted to Python3)
Process a PDF file and create a jpg file for each image, named jpg0.jpg, jpg1.jpg, and so on.
import sys
# Read the whole PDF into memory; the scan below works on the raw bytes.
with open("source.pdf", "rb") as file:
    pdf = file.read()

# Byte sequences searched for as the start and end of JPG data. The "fix"
# values are added to the found offsets: 0 keeps the start marker in the
# slice, 2 moves the end offset past the two-byte end marker.
startmark, startfix = b"\xff\xd8", 0
endmark, endfix = b"\xff\xd9", 2

i = 0  # offset at which the next search for b"stream" begins
njpg = 0  # number of JPG files written so far

while True:
    istream = pdf.find(b"stream", i)
    print("istream " + str(istream))
    if istream < 0:
        # No further b"stream" occurrence: the scan is finished.
        break

    # Only look for the start marker within the 20 bytes beginning at the
    # b"stream" match.
    istart = pdf.find(startmark, istream, istream + 20)
    print(" istart " + str(istart))
    if istart < 0:
        # No start marker found there: resume searching after this window.
        i = istream + 20
        continue

    iendstream = pdf.find(b"endstream", istart)
    if iendstream < 0:
        raise Exception("Didn't find end of stream!")

    # Search for the end marker starting 20 bytes before b"endstream".
    iend = pdf.find(endmark, iendstream - 20)
    if iend < 0:
        raise Exception("Didn't find end of JPG!")

    istart += startfix
    iend += endfix
    print("JPG %d from %d to %d" % (njpg, istart, iend))

    with open("jpg%s.jpg" % njpg, "wb") as jpgfile:
        jpgfile.write(pdf[istart:iend])

    njpg += 1
    i = iend
Extracting text and images
Example from https://stackoverflow.com/questions/20327681/extract-images-from-pdf-using-python-pypdf2
import sys
import PyPDF2
from PIL import Image
if __name__ == '__main__':
input1 = PyPDF2.PdfFileReader(open("source.pdf", "rb"))
page0 = input1.getPage(0)
content = ""
pages = []
for i in range(0, input1.getNumPages()):
pcontent = input1.getPage(i).extractText() + "\n"
pcontent = " ".join(pcontent.replace(u"\xa0", u" ").strip().split())
content += pcontent
pages.append(pcontent)
content = content.replace("IRB Number:","\n\nIRB Number:")
print(content)
#sys.exit()
xObject = page0['/Resources']['/XObject'].getObject()
for obj in xObject:
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj].getData()
if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
mode = "RGB"
else:
mode = "P"
if xObject[obj]['/Filter'] == '/FlateDecode':
img = Image.frombytes(mode, size, data)
img.save(obj[1:] + ".png")
elif xObject[obj]['/Filter'] == '/DCTDecode':
img = open(obj[1:] + ".jpg", "wb")
img.write(data)
img.close()
elif xObject[obj]['/Filter'] == '/JPXDecode':
img = open(obj[1:] + ".jp2", "wb")
img.write(data)