Commit 423e5950 authored by Aram Verstegen
parents d8a83abd e197ff68
COPYING
\ No newline at end of file
@@ -31,7 +31,7 @@ import io
 # X Add scan_res (from image or item metadata? meh)
 # X word confidence wordFromDictionary (?) + char confs?, 'suspicious' attribute
 # - Add lots of (strict) assertions to prevent silent bugs (unknown areas/types, etc)
-# - ocr_page image property - point to the some url for convience (where do we
+# - ocr_page image property - point to the some url for convenience (where do we
 #   get it from?)
 # Tested versions:
@@ -203,6 +203,7 @@ def abbyy_process_text_block(pageno, NS, block, parent_block, dpi):
     # (if they are different from paragraph)
     for par in block.xpath('./x:text/x:par', namespaces={'x': NS}):
+        par_has_text = False
         parelem = etree.Element('p', attrib={'class': 'ocr_par'})
         parelem.attrib['id'] = get_id(pageno, 'par')
@@ -272,6 +273,7 @@ def abbyy_process_text_block(pageno, NS, block, parent_block, dpi):
             last_wordelem, cboxes = abbyy_process_characters(pageno, NS, line, lineelem, formatting, line_fontsize)
             # We want to at least find a word, otherwise, discard
             if last_wordelem is not None:
+                par_has_text = True
                 char_boxes += cboxes
                 m, c = abbyy_baseline_from_charboxes(char_boxes)
@@ -283,9 +285,9 @@ def abbyy_process_text_block(pageno, NS, block, parent_block, dpi):
             if last_wordelem is not None:
                 lineelem.append(last_wordelem)
             parelem.append(lineelem)
-        if leftm is None:
+        if leftm is None or not par_has_text:
             print('paragraph has no lines/boundingbox', file=sys.stderr)
         else:
             kv = {'bbox': list(map(str, [leftm, topm, rightm, bottomm]))}
...
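Taken together, the three hunks above add a `par_has_text` flag so that paragraphs whose lines never yielded a word are discarded along with paragraphs lacking a bounding box. A minimal self-contained sketch of that guard, using plain lists where the real code builds lxml elements:

def process_paragraph(lines):
    # Sketch of the new guard; 'lines' are pre-tokenized strings here.
    par_has_text = False
    words = []
    for line in lines:
        line_words = line.split()
        if line_words:
            par_has_text = True
            words.extend(line_words)
    if not par_has_text:
        return None  # previously an empty paragraph could still be emitted
    return words

assert process_paragraph(['', '   ']) is None
assert process_paragraph(['two words']) == ['two', 'words']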
@@ -10,14 +10,15 @@ from hocr.util import open_if_required
 from hocr.fts import find_matches

-def process_file(hocrfile, textfile, tablepath):
+def process_file(hocrfile, textfile, tablepath, es_workaround):
     lookup_table = hocr_load_lookup_table(tablepath)
     hocrfp = open_if_required(hocrfile)
     textfp = open_if_required(textfile)

     text = textfp.read().decode('utf-8')

-    for word_results in find_matches(lookup_table, hocrfp, text):
+    for word_results in find_matches(lookup_table, hocrfp, text,
+                                     es_whitespace_fixup_required=es_workaround):
         json.dump(word_results, sys.stdout)
         sys.stdout.write('\n')
@@ -32,6 +33,9 @@ if __name__ == '__main__':
                         default=None)
     parser.add_argument('--table', help='Table to use',
                         type=str, default=None)
+    parser.add_argument('--es-workaround', help='Flag to enable working around '
+                        'ES stripping leading whitespace',
+                        default=False, action='store_true')

     args = parser.parse_args()
-    process_file(args.hocr, args.annotated_text, args.table)
+    process_file(args.hocr, args.annotated_text, args.table, args.es_workaround)
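The help text says the workaround compensates for Elasticsearch stripping leading whitespace from the stored text. Purely as an illustration of the offset drift that causes (this is not the hocr.fts implementation):

# If ES strips leading whitespace, character offsets into the stored text
# drift relative to the original by the number of stripped characters.
original = '   leading spaces gone'
stored = original.lstrip()
shift = len(original) - len(stored)
assert original.index('spaces') == stored.index('spaces') + shift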
@@ -61,7 +61,7 @@ def process_files(files_to_process):
         # Let's Remove the xmlns in the div.
         # Yes, this is horrible, but cleanup_namespaces doesn't help
         # since as far as tostring knows, this is the root.
-        # Let's also add two spaces for indendation for the first
+        # Let's also add two spaces for indentation for the first
         # page, and one for all the other pages.
         if page_no == 1:
             s = '  ' + \
@@ -85,7 +85,6 @@ if __name__ == '__main__':
     files_to_process = glob(args.glob)
     if not len(files_to_process):
-        import sys
         print('No files to process!', file=sys.stderr)
         sys.exit(1)
...
@@ -36,7 +36,7 @@ def process_files(infile):
         # Let's Remove the xmlns in the div.
         # Yes, this is horrible, but cleanup_namespaces doesn't help
         # since as far as tostring knows, this is the root.
-        # Let's also add two spaces for indendation for the first
+        # Let's also add two spaces for indentation for the first
         # page, and one for all the other pages.
         if page_no == 1:
             s = '  ' + \
...
#!/usr/bin/env python
import sys
import argparse

from lxml import etree

from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat
from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer


def process_file(filepath, outfmt):
    fd = open_if_required(filepath)
    top, bottom = get_header_footer(fd)

    for pageno, page in enumerate(hocr_page_iterator(fd)):
        fp = open(outfmt % pageno, 'bw+')
        fp.write(top)
        s = etree.tostring(page, pretty_print=True, method='xml',
                           encoding='utf-8').decode('utf-8')
        s = '  ' + s.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')
        s = s.encode('utf-8')
        fp.write(s)
        fp.write(bottom)
        fp.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='hOCR single page extractor')
    parser.add_argument('-f', '--infile', help='Filename to read',
                        type=str, default=None)
    parser.add_argument('-o', '--out-format', help='Outfile format - make sure it '
                        'takes an int as format',
                        type=str, default=None)
    args = parser.parse_args()
    process_file(args.infile, args.out_format)
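The extractor writes one file per page; the output format string must contain an integer conversion. A hypothetical invocation (the script and file names are placeholders):

    python hocr-extract-page.py -f book_hocr.html.gz -o page_%06d.hocr

Each page is wrapped in the hOCR header and footer recovered by get_header_footer, so every output file is itself a complete hOCR document.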
@@ -6,7 +6,6 @@ from collections import OrderedDict
 import hocr.parse

 from ebooklib import epub
-from abbyy_to_epub3.verify_epub import EpubVerify
 from derivermodule.metadata import parse_item_metadata

 from internetarchivepdf.scandata import *
@@ -532,25 +531,6 @@ class EpubGenerator(object):
         self.book.spine = ['cover', 'nav', ] + pages_epub
         epub.write_epub(self.epub_zip_file_path, self.book, {})

-    def verify(self):
-        self.verifier = EpubVerify()
-        result = self.verifier.run_epubcheck(self.epub_zip_file_path)
-        #print(result.messages)
-        for err in result.messages:
-            try:
-                print(err.level)
-                print(err.message)
-                #print(err.suggestion)
-                print()
-            except:
-                pass
-        return
-
-        errors = [err for err in result.messages if
-                  # only keep desired_levels
-                  err.level.lower() in ['error', 'fatal']]
-        if errors:
-            raise RuntimeError(errors)

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='hOCR to ePUB converter')
...
#!/usr/bin/env python3
import sys
import argparse

from lxml import etree
from xml.sax.saxutils import escape as xmlescape

import fitz

from hocr.util import open_if_required

import numpy as np


# Functions related to unique id="" generation
def page_id(pageno):
    check_page_reset(pageno)
    return 'page_%.06d' % pageno


LAST_PAGE = None

def check_page_reset(pageno):
    global LAST_PAGE

    if LAST_PAGE is None:
        LAST_PAGE = pageno

    if pageno != LAST_PAGE:
        LAST_PAGE = pageno
        reset_ids()


__IDS = {'block': 0,
         'par': 0,
         'line': 0,
         'word': 0,
         'photo': 0,
         'table': 0,
         'separator': 0}

def reset_ids():
    global __IDS
    for x in __IDS:
        __IDS[x] = 0


def get_id(pageno, name):
    global __IDS
    check_page_reset(pageno)

    ret = '%s_%.06d_%.06d' % (name, pageno, __IDS[name])
    __IDS[name] += 1
    return ret
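
# Editor's note (illustrative, not in the original file): ids are per-page,
# per-class counters, reset whenever the page number changes:
#   get_id(0, 'word')  -> 'word_000000_000000'
#   get_id(0, 'word')  -> 'word_000000_000001'
#   get_id(1, 'word')  -> 'word_000001_000000'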
def assemble_hocr_title_element(keyvals):
    """
    Create a title="<...>" string from key, value pairs

    Args:

    * keyvals (dict): key value pairs

    Returns: string to insert as title (without the surrounding double quotes)
    """
    r = ''

    for key, val in keyvals.items():
        tot = [key]

        if isinstance(val, list):
            tot += val
        else:
            tot.append(val)

        r += xmlescape(' '.join(tot))
        r += '; '

    if r:
        # Strip off last '; '
        return r[:-2]

    return r
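
# Editor's note (illustrative): with a list value the parts are joined by
# spaces, and entries are separated by '; ':
#   assemble_hocr_title_element({'bbox': ['0', '0', '10', '20'], 'x_wconf': '95'})
#   -> 'bbox 0 0 10 20; x_wconf 95'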
# TODO: Perhaps update/sync with abbyy code
def pdf_baseline_from_charboxes(charboxes):
    """
    Calculates the baseline of characters part of a single line segment using
    least squares on the center ((left+right)/2) of the bottom of every
    bounding box.

    Args:

    * charboxes: list of character bounding boxes (which are a list of 4
      entries)

    Returns:

    Tuple of m, c (float, int) where m is the slope and c is the offset of the
    fitted baseline.
    """
    x = []
    y = []
    for charbox in charboxes:
        # (Left+Right)/2
        x.append((charbox[0] + charbox[2])/2)
        # Bottom
        y.append(charbox[3])

    x = np.array(x)
    y = np.array(y)

    # Normalise to minimal coordinate, maybe we ought to normalise to the
    # first coordinate?
    y -= y.min()

    A = np.vstack([x, np.ones(len(x))]).T
    r = np.linalg.lstsq(A, y, rcond=None)

    m, c = r[0]
    return float(m), int(c)
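
# Editor's note: a quick synthetic check (not part of the original file).
# Character boxes whose bottom edges follow y = 0.5 * x_center + 100 should
# fit a baseline with slope ~0.5:
#   boxes = [[i*12, 0.5*(i*12 + 5) + 92, i*12 + 10, 0.5*(i*12 + 5) + 100]
#            for i in range(10)]
#   m, c = pdf_baseline_from_charboxes(boxes)  # m == 0.5 (within float error)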
def pdf_process_text_block(pageno, block, parent_block):
    parelem = etree.Element('p', attrib={'class': 'ocr_par'})
    parelem.attrib['id'] = get_id(pageno, 'par')

    leftm = None
    topm = None
    rightm = None
    bottomm = None

    for line in block['lines']:
        char_boxes = []
        lineelem = etree.Element('span', attrib={'class': 'ocr_line'})

        kv = {}
        kv['bbox'] = [str(int(x)) for x in line['bbox']]

        last_wordelem, cboxes = pdf_process_characters(pageno, line, lineelem)
        if last_wordelem is not None:
            char_boxes += cboxes

            # TODO: Just use the dictionary properties?
            m, c = pdf_baseline_from_charboxes(char_boxes)
            kv['baseline'] = '%f %d' % (m, c)

        lineelem.attrib['title'] = assemble_hocr_title_element(kv)
        lineelem.attrib['id'] = get_id(pageno, 'line')

        if last_wordelem is not None:
            lineelem.append(last_wordelem)

        parelem.append(lineelem)

    kv = {'bbox': [str(int(x)) for x in block['bbox']]}
    parelem.attrib['title'] = assemble_hocr_title_element(kv)
    parent_block.append(parelem)
def _gather_word_data(pageno, wordelem, wordchar_bboxes, span):
    # Turn these calculations into a function
    word_bbox = [
        str(min(x[0] for x in wordchar_bboxes)),
        str(min(x[1] for x in wordchar_bboxes)),
        str(max(x[2] for x in wordchar_bboxes)),
        str(max(x[3] for x in wordchar_bboxes)),
    ]
    word_data = {'bbox': word_bbox, 'x_wconf': '100', }
    word_data['x_fsize'] = str(int(span['size']))

    wordelem.attrib['id'] = get_id(pageno, 'word')
    wordelem.attrib['title'] = \
        assemble_hocr_title_element(word_data)
def pdf_process_characters(pageno, line, lineelem):
    wordelem = etree.Element('span', attrib={'class': 'ocrx_word'})
    charelem = None
    word_start = True
    found_any = False

    wordchar_bboxes = []
    all_wordchar_bboxes = []

    for span in line['spans']:
        for char in span['chars']:
            # TODO: What do we do with multiple spaces after each other?
            # I guess if there's more than one we could consider adding them to
            # the word, but I think it's better to just ignore them for hOCR
            # data purposes.
            if char['c'] == ' ':
                if len(wordchar_bboxes) == 0:
                    # multiple repeating spaces
                    continue

                if wordelem is not None:
                    lineelem.append(wordelem)
                    #if 'dir' in wordelem.attrib and wordelem.attrib['dir'] == 'rtl':
                    #    wordelem[-1].text += chr(0x200e)
                    _gather_word_data(pageno, wordelem, wordchar_bboxes, span)

                wordelem = etree.Element('span', attrib={'class': 'ocrx_word'})
                word_start = True
                wordchar_bboxes = []
                continue

            found_any = True

            charelem = etree.Element('span', attrib={'class': 'ocrx_cinfo'})
            charelem.text = char['c']

            conf = float(100)
            bbox = [int(x) for x in char['bbox']]
            #bbox = [str(int(x)) for x in char['bbox']]
            wordchar_bboxes.append(bbox)
            all_wordchar_bboxes.append(bbox)

            title_data = {}
            title_data['x_bboxes'] = [str(x) for x in bbox]
            title_data['x_confs'] = str(conf)
            charelem.attrib['title'] = assemble_hocr_title_element(title_data)
            wordelem.append(charelem)

            word_start = False

    ## If word is rtl, let's add <200e>
    #if 'dir' in wordelem.attrib and wordelem.attrib['dir'] == 'rtl':
    #    wordelem[-1].text += chr(0x200e)

    # Sometimes we find no legit chars in a word, in which case charelem is None
    if found_any and len(wordchar_bboxes):
        _gather_word_data(pageno, wordelem, wordchar_bboxes, span)
        return wordelem, all_wordchar_bboxes

    return None, None
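
# Editor's note: a space closes the current word and repeated spaces are
# skipped; a line containing no real characters returns (None, None), which is
# why callers guard on `last_wordelem is not None` before appending.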
def pdf_page_to_hocr_page(page, pageno=None):
    pagedata = page.get_text(option='rawdict')

    # TODO: left to right, right to left

    kv = {}
    w = str(int(pagedata['width']))
    h = str(int(pagedata['height']))
    kv['bbox'] = ['0', '0', w, h]

    if pageno is not None:
        kv['ppageno'] = str(pageno)
    else:
        kv['ppageno'] = '0'

    kv['image'] = 'https://archive.org/todo'  # TODO: some image path?

    # TODO
    dpi = 300
    kv['scan_res'] = '%d %d' % (dpi, dpi)

    pageelem = etree.Element('div', attrib={'class': 'ocr_page',
                                            'id': page_id(pageno),
                                            'title': assemble_hocr_title_element(kv),
                                            })

    for block in pagedata['blocks']:
        if block['type'] != 0:
            # TODO: Skip blocks that are not text, for now
            continue

        kv = {}
        kv['bbox'] = [str(int(x)) for x in block['bbox']]

        blockelem = etree.Element('div', attrib={'class': 'ocr_carea'})
        blockelem.attrib['title'] = assemble_hocr_title_element(kv)
        blockelem.attrib['id'] = get_id(pageno, 'block')

        pdf_process_text_block(pageno, block, blockelem)
        pageelem.append(blockelem)

    return pageelem
def process_files(filename):
    print('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
    <title></title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
    <meta name="ocr-system" content="%s" />
    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf ocrp_lang ocrp_dir ocrp_font ocrp_fsize" />
  </head>
  <body>
''' % xmlescape('TODO PDF Producer'))

    doc = fitz.open(filename)
    for idx, page in enumerate(doc):
        hocr_page = pdf_page_to_hocr_page(page, pageno=idx)
        s = etree.tostring(hocr_page, pretty_print=True, method='xml',
                           encoding='utf-8').decode('utf-8')
        print(s)

    print('''  </body>
</html>
''')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Page to hOCR converter')
    parser.add_argument('-f', '--infile', help='Input file',
                        type=str, default=None)

    args = parser.parse_args()
    process_files(args.infile)
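A hypothetical end-to-end invocation (script and file names are placeholders); the converter prints a complete hOCR document to stdout:

    python pdf-to-hocr.py -f input.pdf > output.hocr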
@@ -28,8 +28,8 @@ ArmenianWestern,hyw,Western Armenian
 Assamese,asm,Assamese
 Awar,aya,Awar
 Aymara,aym,Aymara
-AzeriCyrillic,,
-AzeriLatin,,
+AzeriCyrillic,aze,Azerbaijani
+AzeriLatin,aze,Azerbaijani
 Bashkir,bak,Bashkir
 Basque,eus,Basque
 Belarusian,bel,Belarusian
@@ -99,7 +99,7 @@ French,fra,French
 FrenchBelgian,fra,French
 FrenchCanadian,fra,French
 FrenchLuxembourg,fra,French
-FrenchMonaco,,
+FrenchMonaco,fra,French
 FrenchProperNames,,
 FrenchStandard,fra,French
 FrenchSwiss,fra,French
@@ -136,13 +136,14 @@ Hausa,hau,Hausa
 Hausa_Legacy,,
 Hawaiian,haw,Hawaiian
 Hawaiian_Legacy,,
+Hebrew,yid,Yiddish
 Hindi,hin,Hindi
 Hungarian,hun,Hungarian
 Icelandic,isl,Icelandic
 Ido,ido,Ido
 Indonesian,ind,Indonesian
 Ingush,inh,Ingush
-Interlingua,,
+Interlingua,ina,Interlingua (International Auxiliary Language Association)
 Irish,gle,Irish
 Irish_Legacy,,
 Italian,ita,Italian
@@ -153,7 +154,7 @@ Japanese,jpn,Japanese
 Kabardian,kbd,Kabardian
 Kachin,kac,Kachin
 Kalmyk,xal,Kalmyk
-KarachayBalkar,,
+KarachayBalkar,krc,Karachay-Balkar
 Karakalpak,kaa,Kara-Kalpak
 Kashmiri,kas,Kashmiri
 KashmiriIndia,,
@@ -183,7 +184,7 @@ Latvian,lav,Latvian
 LatvianGothic,,
 Lezgin,lez,Lezghian
 Lithuanian,lit,Lithuanian
-LithuanianClassic,,
+LithuanianClassic,olt,Old Lithuanian
 Luba,lua,Luba-Lulua
 Macedonian,mkd,Macedonian
 Malagasy,mlg,Malagasy
@@ -230,29 +231,29 @@ Pinyin,pny,Pinyin
 Polish,pol,Polish
 PortugueseBrazilian,por,Portuguese
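The table maps ABBYY language names to an ISO 639-3 code plus an English name, with empty fields for languages that have no mapping yet. A hedged sketch of how such a CSV can be consumed (the file name and dict shape are assumptions, not this repository's loader):

import csv

def load_language_map(path='abbyy_languages.csv'):
    # ABBYY name -> (ISO 639-3 code or None, English name or None)
    mapping = {}
    with open(path, newline='') as fp:
        for name, code, english in csv.reader(fp):
            mapping[name] = (code or None, english or None)
    return mapping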