Commit a7d074de authored by Merlijn Wajer's avatar Merlijn Wajer
Browse files

pdf-to-hocr: fix ocr_line bug and add scaler

This commit fixes a bug where ocr_line elements would not have any
title for some PDFs (like ones created by OCRMyPDF).

This commit also adds PDF Metadata JSON as a requirement to make the
hOCR files, using the information contained within to estimate the DPI
and to scale the hOCR coordinates.
parent c09452ca
......@@ -2,6 +2,7 @@
import sys
import argparse
import json
from lxml import etree
......@@ -121,7 +122,7 @@ def pdf_baseline_from_charboxes(charboxes):
return float(m), int(c)
def pdf_process_text_block(pageno, block, parent_block):
def pdf_process_text_block(pageno, block, parent_block, scaler=lambda x: x):
parelem = etree.Element('p', attrib={'class': 'ocr_par'})
parelem.attrib['id'] = get_id(pageno, 'par')
......@@ -137,11 +138,11 @@ def pdf_process_text_block(pageno, block, parent_block):
lineelem = etree.Element('span', attrib={'class': 'ocr_line'})
kv = {}
kv['bbox'] = [str(int(x)) for x in line['bbox']]
kv['bbox'] = get_bbox(line['bbox'], scaler)
last_wordelem, cboxes = pdf_process_characters(pageno, line, lineelem)
cboxes = pdf_process_characters(pageno, line, lineelem, scaler)
if last_wordelem is not None:
if cboxes:
char_boxes += cboxes
# TODO: Just use the dictionary properties?
......@@ -151,17 +152,20 @@ def pdf_process_text_block(pageno, block, parent_block):
lineelem.attrib['title'] = assemble_hocr_title_element(kv)
lineelem.attrib['id'] = get_id(pageno, 'line')
if last_wordelem is not None:
lineelem.append(last_wordelem)
parelem.append(lineelem)
kv = {}
kv = {'bbox': [str(int(x)) for x in block['bbox']]}
kv = {'bbox': get_bbox(block['bbox'], scaler)}
parelem.attrib['title'] = assemble_hocr_title_element(kv)
parent_block.append(parelem)
def get_bbox(bbox_data, scaler):
    """Scale a bounding box and format it for an hOCR ``title`` attribute.

    Args:
        bbox_data: Iterable of numeric coordinates (typically
            ``[x0, y0, x1, y1]`` from a PyMuPDF ``bbox``).
        scaler: Callable mapping one PDF coordinate to the target
            (rendered-image) coordinate space.

    Returns:
        List of coordinate strings, each value truncated to an integer and
        clamped to a minimum of 0 — hOCR bboxes must not be negative, and
        some PDFs report slightly negative coordinates.
    """
    # Single pass: scale, truncate, clamp negatives to zero, stringify.
    return [str(max(int(scaler(coord)), 0)) for coord in bbox_data]
def _gather_word_data(pageno, wordelem, wordchar_bboxes, span):
# Turn these calculations into a function
word_bbox = [
......@@ -179,8 +183,11 @@ def _gather_word_data(pageno, wordelem, wordchar_bboxes, span):
wordelem.attrib['title'] = \
assemble_hocr_title_element(word_data)
def pdf_process_characters(pageno, line, lineelem):
def pdf_process_characters(pageno, line, lineelem, scaler):
"""
Process spans in a line, searching for spaces so that we can split the
characters into words.
"""
wordelem = etree.Element('span', attrib={'class': 'ocrx_word'})
charelem = None
word_start = True
......@@ -221,8 +228,8 @@ def pdf_process_characters(pageno, line, lineelem):
conf = float(100)
bbox = [int(x) for x in char['bbox']]
#bbox = [str(int(x)) for x in char['bbox']]
bbox = get_bbox(char['bbox'], scaler)
bbox = [int(x) for x in bbox] # temporary go back to int
wordchar_bboxes.append(bbox)
all_wordchar_bboxes.append(bbox)
......@@ -240,21 +247,26 @@ def pdf_process_characters(pageno, line, lineelem):
# wordelem[-1].text += chr(0x200e)
# Sometimes we find no legit chars in a word, in which case charelem is None
if found_any and len(wordchar_bboxes):
if found_any:
if not word_start:
lineelem.append(wordelem)
_gather_word_data(pageno, wordelem, wordchar_bboxes, span)
return wordelem, all_wordchar_bboxes
return all_wordchar_bboxes
return None, None
return None
def pdf_page_to_hocr_page(page, pageno=None):
def pdf_page_to_hocr_page(page, page_metadata, pageno=None):
pagedata = page.get_text(option='rawdict')
# TODO: left to right, right to left
width, height = page_metadata['estimated_default_render_res'][2:]
kv = {}
w = str(int(pagedata['width']))
h = str(int(pagedata['height']))
w = str(int(width))
h = str(int(height))
kv['bbox'] = ['0', '0', w, h]
if pageno is not None:
......@@ -262,10 +274,13 @@ def pdf_page_to_hocr_page(page, pageno=None):
else:
kv['ppageno'] = '0'
kv['image'] = 'https://archive.org/todo' # TODO: some image path?
kv['image'] = 'todo://todo' # TODO: some image path?
#kv['image'] = 'https://archive.org/todo' # TODO: some image path?
# TODO
dpi = 300
dpi = int(page_metadata['estimated_dpi'])
scaler = lambda x: int(x * page_metadata['estimated_scale'])
#dpi = 300
kv['scan_res'] = '%d %d' % (dpi, dpi)
pageelem = etree.Element('div', attrib={'class': 'ocr_page',
......@@ -274,24 +289,28 @@ def pdf_page_to_hocr_page(page, pageno=None):
})
for block in pagedata['blocks']:
# TODO: Look into negative bbox values
if block['type'] != 0:
# TODO: Skip blocks that are not text, for now
continue
kv = {}
kv['bbox'] = [str(int(x)) for x in block['bbox']]
kv['bbox'] = get_bbox(block['bbox'], scaler)
blockelem = etree.Element('div', attrib={'class': 'ocr_carea'})
blockelem.attrib['title'] = assemble_hocr_title_element(kv)
blockelem.attrib['id'] = get_id(pageno, 'block')
pdf_process_text_block(pageno, block, blockelem)
pdf_process_text_block(pageno, block, blockelem, scaler=scaler)
pageelem.append(blockelem)
return pageelem
def process_files(filename):
def process_files(filename, json_metadata_file):
metadata = json.load(open(json_metadata_file))
print('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
......@@ -306,7 +325,9 @@ def process_files(filename):
doc = fitz.open(filename)
for idx, page in enumerate(doc):
hocr_page = pdf_page_to_hocr_page(page, pageno=idx)
page_metadata = metadata['page_data'][idx]
hocr_page = pdf_page_to_hocr_page(page, page_metadata, pageno=idx)
s = etree.tostring(hocr_page, pretty_print=True, method='xml',
encoding='utf-8').decode('utf-8')
print(s)
......@@ -319,6 +340,8 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Page to hOCR converter')
parser.add_argument('-f', '--infile', help='Input file',
type=str, default=None)
parser.add_argument('-J', '--json-metadata-file', help='Input json metadata file',
type=str, default=None)
args = parser.parse_args()
process_files(args.infile)
process_files(args.infile, args.json_metadata_file)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment