# OLPC Research Paper Reader
#
# From Catholicpenguin

import getopt
import os
import shutil
import sys
import tempfile
from subprocess import Popen, PIPE

import Image

'''
Simple script which takes a two-column PDF in standard research-paper format,
and converts it into a format suitable for reading on the OLPC. Requires PIL,
and xpdf with pdftoppm to be in the PATH.
(Get a Windows version from http://www.foolabs.com/xpdf/download.html)
'''
'''
    -v:  debug (default off)
    -s:  single column (default off, i.e., two column)
    -o:  offset (in pixels, at whatever DPI rendered) (default 0; with -s the
         default is 115 when -d is not given)
    -d:  DPI to render (default 240; 155 with -s)
    -f:  PDF file (required)

Usage:
For a typical two-column pdf that is properly aligned,
    ./pdf2olpc.py -f file.pdf
Maybe it is shifted a bit? Bigger offsets move the column split to the right
    ./pdf2olpc.py -f file.pdf -o 50
Grumble. It's a single-column article 
    ./pdf2olpc.py -s -f file.pdf
Grumble. It's a single-column article that needs shifting (bigger == higher up)
  and also has tiny margins (reduce DPI to get more of the page)
    ./pdf2olpc.py -s -f file.pdf -o 75 -d 130
'''

# Run-time settings. Each one can be overridden from the command line:
#   -o OFFSET, -v DEBUG, -d DPI, -s SINGLE, -f FILE
OFFSET = 0 # In pixels, for stupid offcentered papers
DEBUG = False
DPI = -1 # Negative means "not given on the command line"; a per-mode default
# is filled in below. Controls the amount of border removed - 240 seems about
# right for most research papers. Bigger == more border removed
SINGLE = False
FILE = ''

USAGE = 'Usage: %s [-v] [-s] [-o offset] [-d dpi] -f file.pdf\n' % sys.argv[0]

# Remember whether -o was passed, so the per-mode defaults below never
# overwrite an offset the user asked for explicitly.
_offset_given = False

try:
    optlist, argv = getopt.getopt(sys.argv[1:], 'vso:d:f:')
    for k, v in optlist:
        if k == '-v':
            DEBUG = True
        elif k == '-s':
            SINGLE = True
        elif k == '-o':
            OFFSET = int(v)
            _offset_given = True
        elif k == '-d':
            DPI = int(v)
        elif k == '-f':
            FILE = v
except (getopt.GetoptError, ValueError):
    # Unknown option, missing argument, or a non-numeric -o/-d value.
    # sys.exc_info() is used instead of "except ... as" so the script still
    # parses on old Python 2 interpreters.
    sys.stderr.write('%s\n' % (sys.exc_info()[1],))
    sys.stderr.write(USAGE)
    sys.exit(2)

# The raw arguments are only interesting when debugging.
if DEBUG:
    print(sys.argv)
    print('%s %s' % (optlist, argv))

# -f is mandatory; without it there is nothing to convert.
if not FILE:
    sys.stderr.write(USAGE)
    sys.exit(2)

# Defaults which worked for me. They only apply when -d was not given, and the
# matching default offset is used only when -o was not given either.
if DPI < 0:
    if SINGLE:
        DPI = 155
        if not _offset_given:
            OFFSET = 115
    else:
        DPI = 240
    

# Base name of the PDF (no directory, no extension). It is reused as the
# prefix of the rendered page images and as the name of the output directory.
pdf = FILE
pdf_fn,pdf_ext = os.path.splitext(os.path.split(pdf)[1])

# First, convert the PDF into images.
# The scratch directory gets a unique name, so a directory left behind by an
# earlier failed run cannot make this run fail. It is created in the current
# directory (as before) rather than in the system temp location, because the
# rendered pages can be large.
pdir = tempfile.mkdtemp(prefix='pdftmp', dir=os.curdir)
args = ['pdftoppm','-r',str(DPI),pdf,os.path.join(pdir,pdf_fn)]
if DEBUG: print('Running pdftoppm with args:'+str(args))
try:
    proc = Popen(args,stdout=PIPE)
except OSError:
    # pdftoppm could not be started (e.g. not in the PATH): do not leave the
    # empty scratch directory behind.
    os.rmdir(pdir)
    raise
o = proc.communicate()[0]
if DEBUG: print('Done. Got result:'+str(o))
if proc.returncode != 0:
    # Keep going so that any pages that were rendered are still processed and
    # the scratch directory is still cleaned up, but do not fail silently.
    sys.stderr.write('Warning: pdftoppm exited with status %d; '
                     'output may be incomplete\n' % proc.returncode)

# Then, find out how many files we have
pdf_pages = sorted(os.listdir(pdir))
if DEBUG: print('Got pages:' + str(pdf_pages))

# Make final directory for the book (reuse it if it is already there, so the
# script can be re-run on the same PDF)
bookdir = pdf_fn
if not os.path.isdir(bookdir):
    os.mkdir(bookdir)

# Cut every rendered page image into pieces sized exactly to the OLPC screen
# (900x1200) and store them as consecutively numbered PNG files in bookdir.
bookpagecnt = 0
for pdf_page in pdf_pages:
    im = Image.open(os.path.join(pdir, pdf_page))
    if DEBUG:
        print('Opened image: %s %s %s %s'
              % (pdf_page, im.format, "%dx%d" % im.size, im.mode))

    def crop(box):
        """Save the `box` region of the current page as the next book page."""
        global bookpagecnt
        tile = im.crop(box)
        bookpage = os.path.join(bookdir, '%s%.4d.png' % (pdf_fn, bookpagecnt))
        if DEBUG:
            print('Saving page: %s' % (bookpage,))
        tile.save(bookpage, 'PNG')
        bookpagecnt += 1

    width, height = im.size

    if SINGLE:
        # Single column: one screen-sized window around the page centre,
        # moved up by OFFSET pixels.
        centre_x = width // 2
        centre_y = height // 2 - OFFSET
        boxes = [
            (centre_x - 450, centre_y - 600, centre_x + 450, centre_y + 600),
        ]
    else:
        # Double column: four screen-sized tiles that meet at the page centre,
        # with the vertical split moved right by OFFSET pixels. The order is
        # reading order: left column top/bottom, then right column top/bottom.
        centre_x = width // 2 + OFFSET
        centre_y = height // 2
        boxes = [
            (centre_x - 900, centre_y - 1200, centre_x, centre_y),
            (centre_x - 900, centre_y, centre_x, centre_y + 1200),
            (centre_x, centre_y - 1200, centre_x + 900, centre_y),
            (centre_x, centre_y, centre_x + 900, centre_y + 1200),
        ]

    for box in boxes:
        crop(box)

# Remove the temp directory together with the intermediate page images that
# pdftoppm wrote into it; only the cropped PNGs in bookdir are kept.
shutil.rmtree(pdir)