Command line concatenate PDFs on OS X

A great post at http://gotofritz.net/blog/howto/joining-pdf-files-in-os-x-from-the-command-line/ which does what it says on the tin.

He explains how to link to a python script included in OS X since some years ago so as to make concatenating PDFs as easy as typing

pdfconcat -o output.pdf *.pdf

The heavy lifting is all done with calls to OS X Quartz.CoreGraphics module so this isn't going to work on other platforms, but for the curious it demonstrates how easily you can do such stuff on OS X.

#! /usr/bin/python
#
# join
#   Joing pages from a a collection of PDF files into a single PDF file.
#
#   join [--output <file>] [--shuffle] [--verbose]"
#
#   Parameter:
#
#   --shuffle
#	Take a page from each PDF input file in turn before taking another from each file.
#	If this option is not specified then all of the pages from a PDF file are appended
#	to the output PDF file before the next input PDF file is processed.
#
#   --verbose
#   Write information about the doings of this tool to stderr.
#
import sys
import os
import getopt
import tempfile
import shutil
from CoreFoundation import *
from Quartz.CoreGraphics import *

verbose = False

def createPDFDocumentWithPath(path):
	global verbose
	if verbose:
		print "Creating PDF document from file %s" % (path)
	return CGPDFDocumentCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, path, len(path), False))

def writePageFromDoc(writeContext, doc, pageNum):

	global verbose
	page = CGPDFDocumentGetPage(doc, pageNum)
	if page:
		mediaBox = CGPDFPageGetBoxRect(page, kCGPDFMediaBox)
		if CGRectIsEmpty(mediaBox):
			mediaBox = None
			
		CGContextBeginPage(writeContext, mediaBox)
		CGContextDrawPDFPage(writeContext, page)
		CGContextEndPage(writeContext)
		if verbose:
			print "Copied page %d from %s" % (pageNum, doc)

def shufflePages(writeContext, docs, maxPages):
	
	for pageNum in xrange(1, maxPages + 1):
		for doc in docs:
			writePageFromDoc(writeContext, doc, pageNum)
				
def append(writeContext, docs, maxPages):

	for doc in docs:
		for pageNum in xrange(1, maxPages + 1) :
			writePageFromDoc(writeContext, doc, pageNum)

def main(argv):

	global verbose

	# The PDF context we will draw into to create a new PDF
	writeContext = None

	# If True then generate more verbose information
	source = None
	shuffle = False
	
	# Parse the command line options
	try:
		options, args = getopt.getopt(argv, "o:sv", ["output=", "shuffle", "verbose"])

	except getopt.GetoptError:
		usage()
		sys.exit(2)

	for option, arg in options:

		if option in ("-o", "--output") :
			if verbose:
				print "Setting %s as the destination." % (arg)
			writeContext = CGPDFContextCreateWithURL(CFURLCreateFromFileSystemRepresentation(kCFAllocatorDefault, arg, len(arg), False), None, None)

		elif option in ("-s", "--shuffle") :
			if verbose :
				print "Shuffle pages to the output file."
			shuffle = True

		elif option in ("-v", "--verbose") :
			print "Verbose mode enabled."
			verbose = True

		else :
			print "Unknown argument: %s" % (option)
	
	if writeContext:
		# create PDFDocuments for all of the files.
		docs = map(createPDFDocumentWithPath, args)
		
		# find the maximum number of pages.
		maxPages = 0
		for doc in docs:
			if CGPDFDocumentGetNumberOfPages(doc) > maxPages:
				maxPages = CGPDFDocumentGetNumberOfPages(doc)
	
		if shuffle:
			shufflePages(writeContext, docs, maxPages)
		else:
			append(writeContext, docs, maxPages)
		
		CGPDFContextClose(writeContext)
		del writeContext
		#CGContextRelease(writeContext)
    
def usage():
	print "Usage: join [--output <file>] [--shuffle] [--verbose]"

if __name__ == "__main__":
	main(sys.argv[1:])

A Very Small Editable PDF for Testing

A Small PDF

Ever wanted a small PDF file, or to tweak your own PDF for testing? Below is a very small PDF which you can paste into a text editor and save as MySmallPdf.PDF.
Two things are easy to edit:

  • The page size. Look for the line /MediaBox [0 0 612 144]. Leave the first (0,0) pair and edit the (612,144) to be (width of page,height of page) in 1/72ths of an inch.
  • One line of text. Look for the line (This is a small text editable pdf) Tj and replace the text with your own.

This is particularly helpful if you are testing http://itextpdf.com/ or itextsharp or some other PDF generator or API, and you want a couple of small identifiable test files to play with.

The Editable PDF

%PDF-1.4
1 0 obj
<< /Type /Catalog
/Outlines 2 0 R
/Pages 3 0 R
>>
endobj
2 0 obj
<< /Type /Outlines
/Count 0
>>
endobj
3 0 obj
<< /Type /Pages
/Kids [4 0 R]
/Count 1
>>
endobj
4 0 obj
<< /Type /Page
/Parent 3 0 R
/MediaBox [0 0 612 144]
/Contents 5 0 R
/Resources << /ProcSet 6 0 R
/Font << /F1 7 0 R >>
>>
>>
endobj
5 0 obj
<< /Length 73 >>
stream
BT
/F1 24 Tf
100 100 Td
(This is a small text editable pdf) Tj
ET
endstream
endobj
6 0 obj
[/PDF /Text]
endobj
7 0 obj
<< /Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
xref
0 8
0000000000 65535 f
0000000009 00000 n
0000000074 00000 n
0000000120 00000 n
0000000179 00000 n
0000000364 00000 n
0000000466 00000 n
0000000496 00000 n
trailer
<< /Size 8
/Root 1 0 R
>>
startxref
625
%%EOF

The PDF ISO32000 standard

The PDF reference is available for free from Adobe at http://www.adobe.com/content/dotcom/en/devnet/pdf/pdf_reference.html The one you want is the copy of the ISO reference at http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf