Pdf manipulation in Python
June 5, 2009 · Python · 3 Comments
For the past few months, apart from the data issue, I have been involved in merging multiple PDFs to create a book of readings for the university. Refer to my post: Pdf merging. I am using two different libraries to complete this project: pyPdf and reportlab.
Here is a very simple example of using the reportlab library to create pages with or without content and save them to an output file (note: createPdfPage only returns a buffer holding the generated PDF, so we need PdfFileReader to read the pages back from it):
from pyPdf import PdfFileWriter, PdfFileReader
from reportlab.lib import pagesizes
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm, mm, inch
from StringIO import StringIO
#default page size, used by createPdfPage when the caller passes no pagesize
PAGESIZE = pagesizes.A4
def createPdfPage(nPages, pagesize=None, content=None):
    """Render nPages pages with reportlab and return them as a StringIO.

    pagesize falls back to the module-level PAGESIZE when it is None.
    When content is truthy it is drawn on every page at (9 cm, 22 cm);
    otherwise the pages are left empty. The returned buffer is rewound
    to position 0 so it can be handed straight to a reader.
    """
    if pagesize is None:
        pagesize = PAGESIZE

    # No filename is given: the PDF bytes are fetched with getpdfdata().
    pdfCanvas = canvas.Canvas(None)
    pdfCanvas.setPageSize(pagesize)
    pdfCanvas.showOutline()

    for _ in range(nPages):
        if content:
            pdfCanvas.drawString(9*cm, 22*cm, content)
        # showPage() finishes the current page, drawn on or not
        pdfCanvas.showPage()

    pdfStream = StringIO()
    pdfStream.write(pdfCanvas.getpdfdata())
    pdfStream.seek(0)
    return pdfStream
#simple PdfFileWriter class
class PdfWriter(object):
    """Collect pages in a pyPdf PdfFileWriter and save them to one file.

    outputFile is the path the collected pages are written to by savePdf().
    """

    def __init__(self, outputFile):
        self.outputWriter = PdfFileWriter()
        self.__outputFile = outputFile

    def savePdf(self):
        """Write all pages added so far to the output file (binary mode)."""
        # The with-block closes the handle even if write() raises.
        with open(self.__outputFile, "wb") as outputStream:
            self.outputWriter.write(outputStream)

    def addPage(self, page):
        """Append page to the underlying writer."""
        self.outputWriter.addPage(page)
#write a two-page pdf: the first page is empty, the second carries text
outputFile = "test.pdf"
#create PdfWriter
pdfWriter = PdfWriter(outputFile)
#None gives a page without any content, a string is drawn onto the page
for pageContent in (None, "more content"):
    pageReader = PdfFileReader(createPdfPage(1, content=pageContent))
    #one page was requested, so take page 0 and append it to the output
    pdfWriter.addPage(pageReader.getPage(0))
#save the pdf
pdfWriter.savePdf()
Here is another example that merges two files with an additional blank page in between them:
fileOne = "test.pdf"
fileTwo = "test2.pdf"
outputFile = "outputFile.pdf"
#create the writer for the merged output; writing to fileOne would
#overwrite one of the inputs
pdfWriter = PdfWriter(outputFile)
#both inputs stay open until savePdf() has run: pyPdf reads page objects
#from the source stream while writing, so closing earlier breaks the save
with open(fileOne, "rb") as fileOneStream:
    with open(fileTwo, "rb") as fileTwoStream:
        #append every page of test.pdf
        pdfReader = PdfFileReader(fileOneStream)
        for page in range(pdfReader.getNumPages()):
            pdfWriter.addPage(pdfReader.getPage(page))
        #create a page without any content
        emptyPageBuffer = createPdfPage(1)
        emptyPageReader = PdfFileReader(emptyPageBuffer)
        #get the page and append it to the output stream
        pdfWriter.addPage(emptyPageReader.getPage(0))
        #append every page of test2.pdf
        pdfReader = PdfFileReader(fileTwoStream)
        for page in range(pdfReader.getNumPages()):
            pdfWriter.addPage(pdfReader.getPage(page))
        #save the pdf
        pdfWriter.savePdf()
There are a lot of cases where users need to split a PDF file using tools like Adobe or other available tools. Although the split PDFs can be viewed in a PDF viewer, some of them might be corrupted, e.g. missing the end-of-file marker (%%EOF) at the end of the file. PdfFileReader will not be able to read a PDF if the EOF marker is not found. To fix this:
#try to fix a pdf that lacks its end-of-file marker
def fixPdf(pdfFile):
    """Append the %%EOF marker to pdfFile and report the outcome.

    The file is opened in binary append mode, so it is created when it
    does not exist. If the existing content does not end with a line
    break, one is written first so that the marker starts its own line.

    Returns "Fixed" on success, otherwise a message that names the file
    and the error. No exception is propagated to the caller.
    """
    try:
        with open(pdfFile, "ab+") as fileOpen:
            fileOpen.seek(0, 2)
            if fileOpen.tell() > 0:
                fileOpen.seek(-1, 2)
                lastByte = fileOpen.read(1)
            else:
                # empty file: nothing the marker could be glued onto
                lastByte = b"\n"
            # reposition before switching from reading to writing
            fileOpen.seek(0, 2)
            if lastByte not in (b"\n", b"\r"):
                fileOpen.write(b"\n")
            fileOpen.write(b"%%EOF")
        return "Fixed"
    except Exception as e:
        return "Unable to open file: %s with error: %s" % (pdfFile, str(e))
corruptedFile = "corrupted.pdf"
#stays None when open() itself fails, so the handler knows there is
#nothing to close
fileStream = None
try:
    fileStream = open(corruptedFile, "rb")
    pdfReader = PdfFileReader(fileStream)
except Exception:
    if fileStream is not None:
        fileStream.close()
    print('error in opeing pdf file, try to fix it')
    print(fixPdf(corruptedFile))
    #try to reopen the pdf file again
    fileStream = None
    try:
        fileStream = open(corruptedFile, "rb")
        pdfReader = PdfFileReader(fileStream)
        print('number of pages:  %s' % pdfReader.getNumPages())
    except Exception:
        print('this pdf file cannot be fixed')
    finally:
        #close the retry stream whether or not the fix worked
        if fileStream is not None:
            fileStream.close()
Below is an example that gets the details of each individual page in a PDF file; this can be useful for finding inconsistent page sizes within the PDF:
#get page detail
def getpageBox(page):
    """Return the trimBox attribute of the given page."""
    trimBox = page.trimBox
    return trimBox
def rectangle2box(pdfPage):
    """Describe a rectangle as a dict of size, offset and unit.

    pdfPage must expose lowerLeft and upperRight as (x, y) pairs; the
    demo code passes a page's trim box. width and height are the extents
    between the two corners, so a box whose lower-left corner is not at
    the origin still reports its real size.

    Returns a dict with the keys width, height, offset_x, offset_y,
    unit ('pt'), units_x and units_y.
    """
    left, bottom = pdfPage.lowerLeft[0], pdfPage.lowerLeft[1]
    right, top = pdfPage.upperRight[0], pdfPage.upperRight[1]
    width = right - left
    height = top - bottom
    return {
        'width'   : width,
        'height'  : height,
        'offset_x': left,
        'offset_y': bottom,
        'unit'    : 'pt',
        # NOTE(review): units_x/units_y mirror width/height as before;
        # their intended meaning is assumed -- confirm with the consumer
        'units_x' : width,
        'units_y' : height,
    }
testFile = "test2.pdf"
#pdf data is binary, so open with "rb"; the with-block closes the stream
#once every page has been reported
with open(testFile, "rb") as fileStream:
    pdfReader = PdfFileReader(fileStream)
    for page in range(pdfReader.getNumPages()):
        pageBox = getpageBox(pdfReader.getPage(page))
        rectangleDetail = rectangle2box(pageBox)
        #page indices are 0-based, the printed page number is 1-based
        print('--- page number:  %s' % (page + 1))
        for key in rectangleDetail:
            print("%s\t: %s" % (key, rectangleDetail[key]))
