Commit 0259bad6 authored by Merlijn Wajer
Browse files

hocr/fts: workaround elastic search bugs

Some search/match algorithms of Elastic Search seem to strip leading
(and trailing) document whitespace. This throws off our page index
(offset table), so account for leading whitespace being stripped, if required.
parent 8cbf5151
......@@ -10,14 +10,15 @@ from hocr.util import open_if_required
from hocr.fts import find_matches
def process_file(hocrfile, textfile, tablepath):
def process_file(hocrfile, textfile, tablepath, es_workaround):
lookup_table = hocr_load_lookup_table(tablepath)
hocrfp = open_if_required(hocrfile)
textfp = open_if_required(textfile)
text ='utf-8')
for word_results in find_matches(lookup_table, hocrfp, text):
for word_results in find_matches(lookup_table, hocrfp, text,
json.dump(word_results, sys.stdout)
......@@ -32,6 +33,9 @@ if __name__ == '__main__':
parser.add_argument('--table', help='Table to use',
type=str, default=None)
parser.add_argument('--es-workaround', help='Flag to enable working around'
'ES stripping leading whitespace',
default=False, action='store_true')
args = parser.parse_args()
process_file(args.hocr, args.annotated_text, args.table)
process_file(args.hocr, args.annotated_text, args.table, args.es_workaround)
......@@ -2,7 +2,7 @@ from hocr.parse import hocr_page_to_word_data_fast, hocr_page_get_dimensions
from hocr.searching import hocr_lookup_page_by_dat, \
from hocr.text import get_paragraph_hocr_words, hocr_paragraph_text, \
get_paragraph_hocr_words, hocr_page_text_from_word_data
Highly experimental and unstable interface to retrieve page indexes and
......@@ -187,11 +187,32 @@ def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
return results
def find_matches(lookup_table, hocrfp, text):
def find_matches(lookup_table, hocrfp, text, es_whitespace_fixup_required=False):
text_byte_count = 0
current_dat = None
page_number = 0
if es_whitespace_fixup_required:
# There might be faster ways of doing this (e.g. read the _searchtext
# file and count the amount of 'whitespace' bytes)
done = False
for dat in lookup_table:
page = hocr_lookup_page_by_dat(hocrfp, dat)
word_data = hocr_page_to_word_data_fast(page)
page_text = hocr_page_text_from_word_data(word_data)
for line in page_text:
if line.strip() == '':
# Add counted bytes, one for newline
text_byte_count += len(line) + 1
done = True
if done:
# For every line in the highlighted text, let's find matches...
for line in text[:-1].split('\n'):
# Line should contain both '{{{' and '}}}'
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment