Commit a7d074de authored by Merlijn Wajer
Browse files

pdf-to-hocr: fix ocr_line bug and add scaler

This commit fixes a bug where ocr_line elements were left without a
title attribute for some PDFs (such as those created by OCRmyPDF).

This commit also makes the PDF metadata JSON a required input for
generating the hOCR files; the information it contains is used to
estimate the DPI and to scale the hOCR coordinates.
parent c09452ca
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
import sys import sys
import argparse import argparse
import json
from lxml import etree from lxml import etree
...@@ -121,7 +122,7 @@ def pdf_baseline_from_charboxes(charboxes): ...@@ -121,7 +122,7 @@ def pdf_baseline_from_charboxes(charboxes):
return float(m), int(c) return float(m), int(c)
def pdf_process_text_block(pageno, block, parent_block): def pdf_process_text_block(pageno, block, parent_block, scaler=lambda x: x):
parelem = etree.Element('p', attrib={'class': 'ocr_par'}) parelem = etree.Element('p', attrib={'class': 'ocr_par'})
parelem.attrib['id'] = get_id(pageno, 'par') parelem.attrib['id'] = get_id(pageno, 'par')
...@@ -137,11 +138,11 @@ def pdf_process_text_block(pageno, block, parent_block): ...@@ -137,11 +138,11 @@ def pdf_process_text_block(pageno, block, parent_block):
lineelem = etree.Element('span', attrib={'class': 'ocr_line'}) lineelem = etree.Element('span', attrib={'class': 'ocr_line'})
kv = {} kv = {}
kv['bbox'] = [str(int(x)) for x in line['bbox']] kv['bbox'] = get_bbox(line['bbox'], scaler)
last_wordelem, cboxes = pdf_process_characters(pageno, line, lineelem) cboxes = pdf_process_characters(pageno, line, lineelem, scaler)
if last_wordelem is not None: if cboxes:
char_boxes += cboxes char_boxes += cboxes
# TODO: Just use the dictionary properties? # TODO: Just use the dictionary properties?
...@@ -151,17 +152,20 @@ def pdf_process_text_block(pageno, block, parent_block): ...@@ -151,17 +152,20 @@ def pdf_process_text_block(pageno, block, parent_block):
lineelem.attrib['title'] = assemble_hocr_title_element(kv) lineelem.attrib['title'] = assemble_hocr_title_element(kv)
lineelem.attrib['id'] = get_id(pageno, 'line') lineelem.attrib['id'] = get_id(pageno, 'line')
if last_wordelem is not None: parelem.append(lineelem)
lineelem.append(last_wordelem)
parelem.append(lineelem)
kv = {} kv = {}
kv = {'bbox': [str(int(x)) for x in block['bbox']]} kv = {'bbox': get_bbox(block['bbox'], scaler)}
parelem.attrib['title'] = assemble_hocr_title_element(kv) parelem.attrib['title'] = assemble_hocr_title_element(kv)
parent_block.append(parelem) parent_block.append(parelem)
def get_bbox(bbox_data, scaler):
    """
    Convert raw bounding box coordinates to hOCR-ready strings.

    Each value in ``bbox_data`` is passed through ``scaler``, truncated to an
    integer with ``int()``, clamped so that it is never negative, and finally
    converted to ``str``.

    Args:
        bbox_data: iterable of numeric coordinates.
        scaler: callable applied to every coordinate before truncation.

    Returns:
        A list of strings, one per input coordinate, in the same order.
    """
    # Negative coordinates are clamped to 0 after scaling and truncation.
    return [str(max(int(scaler(x)), 0)) for x in bbox_data]
def _gather_word_data(pageno, wordelem, wordchar_bboxes, span): def _gather_word_data(pageno, wordelem, wordchar_bboxes, span):
# Turn these calculations into a function # Turn these calculations into a function
word_bbox = [ word_bbox = [
...@@ -179,8 +183,11 @@ def _gather_word_data(pageno, wordelem, wordchar_bboxes, span): ...@@ -179,8 +183,11 @@ def _gather_word_data(pageno, wordelem, wordchar_bboxes, span):
wordelem.attrib['title'] = \ wordelem.attrib['title'] = \
assemble_hocr_title_element(word_data) assemble_hocr_title_element(word_data)
def pdf_process_characters(pageno, line, lineelem, scaler):
def pdf_process_characters(pageno, line, lineelem): """
Process spans in a line, searching for spaces so that we can split the
characters into words.
"""
wordelem = etree.Element('span', attrib={'class': 'ocrx_word'}) wordelem = etree.Element('span', attrib={'class': 'ocrx_word'})
charelem = None charelem = None
word_start = True word_start = True
...@@ -221,8 +228,8 @@ def pdf_process_characters(pageno, line, lineelem): ...@@ -221,8 +228,8 @@ def pdf_process_characters(pageno, line, lineelem):
conf = float(100) conf = float(100)
bbox = [int(x) for x in char['bbox']] bbox = get_bbox(char['bbox'], scaler)
#bbox = [str(int(x)) for x in char['bbox']] bbox = [int(x) for x in bbox] # temporary go back to int
wordchar_bboxes.append(bbox) wordchar_bboxes.append(bbox)
all_wordchar_bboxes.append(bbox) all_wordchar_bboxes.append(bbox)
...@@ -240,21 +247,26 @@ def pdf_process_characters(pageno, line, lineelem): ...@@ -240,21 +247,26 @@ def pdf_process_characters(pageno, line, lineelem):
# wordelem[-1].text += chr(0x200e) # wordelem[-1].text += chr(0x200e)
# Sometimes we find no legit chars in a word, in which case charelem is None # Sometimes we find no legit chars in a word, in which case charelem is None
if found_any and len(wordchar_bboxes): if found_any:
_gather_word_data(pageno, wordelem, wordchar_bboxes, span) if not word_start:
lineelem.append(wordelem)
_gather_word_data(pageno, wordelem, wordchar_bboxes, span)
return wordelem, all_wordchar_bboxes return all_wordchar_bboxes
return None, None return None
def pdf_page_to_hocr_page(page, pageno=None): def pdf_page_to_hocr_page(page, page_metadata, pageno=None):
pagedata = page.get_text(option='rawdict') pagedata = page.get_text(option='rawdict')
# TODO: left to right, right to left # TODO: left to right, right to left
width, height = page_metadata['estimated_default_render_res'][2:]
kv = {} kv = {}
w = str(int(pagedata['width'])) w = str(int(width))
h = str(int(pagedata['height'])) h = str(int(height))
kv['bbox'] = ['0', '0', w, h] kv['bbox'] = ['0', '0', w, h]
if pageno is not None: if pageno is not None:
...@@ -262,10 +274,13 @@ def pdf_page_to_hocr_page(page, pageno=None): ...@@ -262,10 +274,13 @@ def pdf_page_to_hocr_page(page, pageno=None):
else: else:
kv['ppageno'] = '0' kv['ppageno'] = '0'
kv['image'] = 'https://archive.org/todo' # TODO: some image path? kv['image'] = 'todo://todo' # TODO: some image path?
#kv['image'] = 'https://archive.org/todo' # TODO: some image path?
# TODO # TODO
dpi = 300 dpi = int(page_metadata['estimated_dpi'])
scaler = lambda x: int(x * page_metadata['estimated_scale'])
#dpi = 300
kv['scan_res'] = '%d %d' % (dpi, dpi) kv['scan_res'] = '%d %d' % (dpi, dpi)
pageelem = etree.Element('div', attrib={'class': 'ocr_page', pageelem = etree.Element('div', attrib={'class': 'ocr_page',
...@@ -274,24 +289,28 @@ def pdf_page_to_hocr_page(page, pageno=None): ...@@ -274,24 +289,28 @@ def pdf_page_to_hocr_page(page, pageno=None):
}) })
for block in pagedata['blocks']: for block in pagedata['blocks']:
# TODO: Look into negative bbox values
if block['type'] != 0: if block['type'] != 0:
# TODO: Skip blocks that are not text, for now # TODO: Skip blocks that are not text, for now
continue continue
kv = {} kv = {}
kv['bbox'] = [str(int(x)) for x in block['bbox']] kv['bbox'] = get_bbox(block['bbox'], scaler)
blockelem = etree.Element('div', attrib={'class': 'ocr_carea'}) blockelem = etree.Element('div', attrib={'class': 'ocr_carea'})
blockelem.attrib['title'] = assemble_hocr_title_element(kv) blockelem.attrib['title'] = assemble_hocr_title_element(kv)
blockelem.attrib['id'] = get_id(pageno, 'block') blockelem.attrib['id'] = get_id(pageno, 'block')
pdf_process_text_block(pageno, block, blockelem) pdf_process_text_block(pageno, block, blockelem, scaler=scaler)
pageelem.append(blockelem) pageelem.append(blockelem)
return pageelem return pageelem
def process_files(filename): def process_files(filename, json_metadata_file):
metadata = json.load(open(json_metadata_file))
print('''<?xml version="1.0" encoding="UTF-8"?> print('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
...@@ -306,7 +325,9 @@ def process_files(filename): ...@@ -306,7 +325,9 @@ def process_files(filename):
doc = fitz.open(filename) doc = fitz.open(filename)
for idx, page in enumerate(doc): for idx, page in enumerate(doc):
hocr_page = pdf_page_to_hocr_page(page, pageno=idx) page_metadata = metadata['page_data'][idx]
hocr_page = pdf_page_to_hocr_page(page, page_metadata, pageno=idx)
s = etree.tostring(hocr_page, pretty_print=True, method='xml', s = etree.tostring(hocr_page, pretty_print=True, method='xml',
encoding='utf-8').decode('utf-8') encoding='utf-8').decode('utf-8')
print(s) print(s)
...@@ -319,6 +340,8 @@ if __name__ == '__main__': ...@@ -319,6 +340,8 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Page to hOCR converter') parser = argparse.ArgumentParser(description='Page to hOCR converter')
parser.add_argument('-f', '--infile', help='Input file', parser.add_argument('-f', '--infile', help='Input file',
type=str, default=None) type=str, default=None)
parser.add_argument('-J', '--json-metadata-file', help='Input json metadata file',
type=str, default=None)
args = parser.parse_args() args = parser.parse_args()
process_files(args.infile) process_files(args.infile, args.json_metadata_file)
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment