Commit 0259bad6 authored by Merlijn Wajer
Browse files

hocr/fts: work around Elasticsearch bugs

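Some of Elasticsearch's search/match algorithms appear to strip leading
(and trailing) whitespace from a document. That shifts every offset relative
to our page index (offset table), so when the caller requests it (the
`--es-workaround` flag, passed down as `es_whitespace_fixup_required`),
account for the stripped leading whitespace before matching.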
parent 8cbf5151
...@@ -10,14 +10,15 @@ from hocr.util import open_if_required ...@@ -10,14 +10,15 @@ from hocr.util import open_if_required
from hocr.fts import find_matches from hocr.fts import find_matches
def process_file(hocrfile, textfile, tablepath): def process_file(hocrfile, textfile, tablepath, es_workaround):
lookup_table = hocr_load_lookup_table(tablepath) lookup_table = hocr_load_lookup_table(tablepath)
hocrfp = open_if_required(hocrfile) hocrfp = open_if_required(hocrfile)
textfp = open_if_required(textfile) textfp = open_if_required(textfile)
text = textfp.read().decode('utf-8') text = textfp.read().decode('utf-8')
for word_results in find_matches(lookup_table, hocrfp, text): for word_results in find_matches(lookup_table, hocrfp, text,
es_whitespace_fixup_required=es_workaround):
json.dump(word_results, sys.stdout) json.dump(word_results, sys.stdout)
sys.stdout.write('\n') sys.stdout.write('\n')
...@@ -32,6 +33,9 @@ if __name__ == '__main__': ...@@ -32,6 +33,9 @@ if __name__ == '__main__':
default=None) default=None)
parser.add_argument('--table', help='Table to use', parser.add_argument('--table', help='Table to use',
type=str, default=None) type=str, default=None)
# Opt-in switch (off by default); its value is handed to process_file() below.
# The first help fragment ends with a space because adjacent string literals
# are joined verbatim; without it the help text reads "aroundES".
parser.add_argument('--es-workaround', help='Flag to enable working around '
                    'ES stripping leading whitespace',
                    default=False, action='store_true')
args = parser.parse_args() args = parser.parse_args()
process_file(args.hocr, args.annotated_text, args.table) process_file(args.hocr, args.annotated_text, args.table, args.es_workaround)
...@@ -2,7 +2,7 @@ from hocr.parse import hocr_page_to_word_data_fast, hocr_page_get_dimensions ...@@ -2,7 +2,7 @@ from hocr.parse import hocr_page_to_word_data_fast, hocr_page_get_dimensions
from hocr.searching import hocr_lookup_page_by_dat, \ from hocr.searching import hocr_lookup_page_by_dat, \
hocr_lookup_by_plaintext_offset hocr_lookup_by_plaintext_offset
from hocr.text import get_paragraph_hocr_words, hocr_paragraph_text, \ from hocr.text import get_paragraph_hocr_words, hocr_paragraph_text, \
get_paragraph_hocr_words get_paragraph_hocr_words, hocr_page_text_from_word_data
""" """
Highly experimental and unstable interface to retrieve page indexes and Highly experimental and unstable interface to retrieve page indexes and
...@@ -187,11 +187,32 @@ def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no): ...@@ -187,11 +187,32 @@ def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
return results return results
def find_matches(lookup_table, hocrfp, text): def find_matches(lookup_table, hocrfp, text, es_whitespace_fixup_required=False):
text_byte_count = 0 text_byte_count = 0
current_dat = None current_dat = None
page_number = 0 page_number = 0
if es_whitespace_fixup_required:
# There might be faster ways of doing this (e.g. read the _searchtext
# file and count the amount of 'whitespace' bytes)
done = False
for dat in lookup_table:
page = hocr_lookup_page_by_dat(hocrfp, dat)
word_data = hocr_page_to_word_data_fast(page)
page_text = hocr_page_text_from_word_data(word_data)
for line in page_text:
if line.strip() == '':
# Add counted bytes, one for newline
text_byte_count += len(line) + 1
continue
else:
done = True
break
if done:
break
# For every line in the highlighted text, let's find matches... # For every line in the highlighted text, let's find matches...
for line in text[:-1].split('\n'): for line in text[:-1].split('\n'):
# Line should contain both '{{{' and '}}}' # Line should contain both '{{{' and '}}}'
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment