Commit e67f5783 authored by Merlijn Wajer's avatar Merlijn Wajer
Browse files

add hocr-split-pages

parent 53af1790
#!/usr/bin/env python
import sys
import argparse
from lxml import etree
from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat
from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer
def process_file(filepath, outfmt):
    """Split a multi-page hOCR file into one output file per page.

    Each page yielded by ``hocr_page_iterator`` is serialised with lxml and
    written between the header and footer returned by
    ``get_header_footer``.

    Args:
        filepath: Input passed to ``open_if_required``.
        outfmt: ``%``-style format string for output paths; it is
            formatted with the zero-based page index (e.g. ``'p%04d.hocr'``).
    """
    # NOTE(review): whether ``fd`` should be closed here depends on what
    # open_if_required returns (it may hand back a caller-owned object) --
    # left open as in the original; confirm against hocr.util.
    fd = open_if_required(filepath)
    # Header/footer are written verbatim to a binary-mode file below, so
    # they are expected to be bytes-like.
    top, bottom = get_header_footer(fd)

    for pageno, page in enumerate(hocr_page_iterator(fd)):
        # Serialise before opening the output so a serialisation failure
        # does not leave a header-only file behind.
        body = etree.tostring(page, pretty_print=True, method='xml',
                              encoding='utf-8')
        # Drop the namespace declaration lxml emits on the page element.
        # The pattern is pure ASCII, so replacing on the UTF-8 bytes is
        # equivalent to decode -> replace -> encode.
        body = b' ' + body.replace(
            b' xmlns="http://www.w3.org/1999/xhtml"', b'')

        # ``with`` guarantees the handle is closed even if a write fails.
        with open(outfmt % pageno, 'wb') as fp:
            fp.write(top)
            fp.write(body)
            fp.write(bottom)
if __name__ == '__main__':
    # Command-line entry point: parse the input path and the output filename
    # pattern, then split the input into per-page files.
    parser = argparse.ArgumentParser(description='hOCR single page extractor')
    # Both options are mandatory: process_file cannot run with None for
    # either, so let argparse report a usage error instead of crashing later.
    parser.add_argument('-f', '--infile', help='Filename to read',
                        type=str, required=True)
    parser.add_argument('-o', '--out-format', help='Outfile format - make sure it '
                                                   'takes an int as format',
                        type=str, required=True)
    args = parser.parse_args()
    process_file(args.infile, args.out_format)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment