Commit 37ea5e0d authored by Merlijn Wajer

hocr/fts: Cleaner and more robust matching

Matching code is now clearer, and also more robust against elastic
search highlighting matches in weird ways (across paragraphs, for
instance), which used to break the matching of lines (lines are entire
paragraphs in our case).

Matching should also match multiple words more accurately.
parent 4dc15827
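
For context, a minimal sketch (adapted from a comment in the new code below, not part of the diff itself) of the input that used to break matching: the FTS layer wraps each ElasticSearch hit in {{{ and }}} markers, and a highlight opened in a previous paragraph can make a line's first marker a closing one.

    # A line whose first marker is a closing '}}}': the highlight was opened
    # in a previous paragraph. The new code soft-fails on this ordering
    # instead of raising.
    line = 'THE}}} CAMP OF {{{THE BRITISH MISSION AT ADOWA.'
    assert line.find('}}}') < line.find('{{{')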
@@ -5,29 +5,81 @@ from hocr.text import get_paragraph_hocr_words, hocr_paragraph_text, \
     get_paragraph_hocr_words

 """
-Highly experimental and unstable interface to retrieve page indexes and bounding
-boxes for words matching a certain full-text-search query.
+Highly experimental and unstable interface to retrieve page indexes and
+bounding boxes for words matching a certain full-text-search query.

-Required an external system to perform the highlighting (bin/fts-text-annotate
-can serve this purpose)
+Requires an external system to perform the highlighting (bin/fts-text-annotate
+can serve this purpose).

 This API is highly unstable, the code is in need of cleanups, and there likely
-isn't a big use for this particular file outside of Internet Archive purposes.
+isn't a big use for this particular file outside of Internet Archive purposes,
+unless you want to match ElasticSearch highlighted plaintext to hOCR pages to
+get the text coordinates of the matches.
 """

-# XXX: remove this, we do not need this in production
-import re
-re_braces = re.compile(r'(\{\{\{|\}\}\})')
+# TODO:
+# * Add notes here about internal api, internal code, backwards compat, why
+#   the whole thing is so awkward, what search inside does, etc
+# * Add documentation to functions, make sure they are also added to sphinx
+# * Fix up the poor exception messages, and fix up failing edge cases
+
+
+def match_words(hocr_words, match_indexes):
+    def m(match, word_start, word_end):
+        # A word belongs to a match if either match endpoint falls inside
+        # the word, or if the word lies entirely inside the match.
+        # XXX: We might need to turn some of these <= into < or so, there
+        # could be an off-by-one error here, matching a next word when it
+        # should not.
+        return (match[0] >= word_start and match[0] <= word_end) or \
+               (match[1] >= word_start and match[1] <= word_end) or \
+               (word_start >= match[0] and word_end <= match[1])
+
+    matching_words = []
+
+    in_word = False
+    current_match_words = []
+    current_match_index = 0
+
+    str_idx = 0
+    for word in hocr_words:
+        if current_match_index >= len(match_indexes):
+            break
+
+        word_start = str_idx
+        word_end = str_idx + len(word['text'])
+
+        if in_word:
+            # Either the current match extends into this word, or the match
+            # is done and we close it off
+            if m(match_indexes[current_match_index], word_start, word_end):
+                current_match_words.append(word)
+            else:
+                matching_words.append(current_match_words)
+                current_match_words = []
+                current_match_index += 1
+                in_word = False
+
+        if not in_word and current_match_index < len(match_indexes):
+            # Match in the word, or word in match
+            if m(match_indexes[current_match_index], word_start, word_end):
+                current_match_words.append(word)
+                in_word = True
+
+        # Add + 1 for the space after a word
+        str_idx += len(word['text']) + 1
+
+    # If we ran out of words while still inside a match, flush the pending
+    # words so the final match is not lost
+    if current_match_words:
+        matching_words.append(current_match_words)
+
+    return matching_words
+
+
+# TODO: rename and note unstable api
 def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
     match_number = 0
     match_with = solr_line
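
To illustrate the new match_words helper above, a minimal sketch with toy data (real hocr_words come from get_paragraph_hocr_words and carry full hOCR word dicts; the bbox values here are invented):

    # Plaintext of the toy paragraph: 'the british mission'.
    # One match covering runes 4-18 ('british mission') should group both
    # words together.
    hocr_words = [
        {'text': 'the',     'bbox': (0, 0, 30, 10)},
        {'text': 'british', 'bbox': (35, 0, 100, 10)},
        {'text': 'mission', 'bbox': (105, 0, 170, 10)},
    ]
    match_indexes = [(4, 18)]  # normalised (start, end) runes, markers removed

    groups = match_words(hocr_words, match_indexes)
    assert [w['text'] for w in groups[0]] == ['british', 'mission']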
@@ -38,25 +90,10 @@ def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
     results = []

-    # TODO: Let's not use regex here, we might not even need this check at all
-    if re_braces.sub('', cur['text']) != hocr_text:
-        # XXX: Let's not accept mismatches at the moment.
-        print('solr_line', repr(solr_line))
-        print('hocr_text:', hocr_text)
-        print('FAIL2')
-        raise Exception('FAIL2')
-        #import sys; sys.exit(1)
-
-        cur['error'] = 'mismatch'
-        match_number += 1
-        results.append((match_number, cur))
-        return results
-
     # Contains a tuple for each match, with the starting and ending rune
     match_indexes = []

     # Match solr_line words to hocr_par words
-    # TODO: This needs could be improved some, see below on dealing with
-    # match_indexes being empty.
     sub_idx = 0
     while True:
         s = solr_line[sub_idx:].find('{{{')
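
To make the index bookkeeping concrete, a worked example of the normalisation introduced in the next hunk. This assumes the raw (rs, re) tuples point just past a match's opening '{{{' and at its closing '}}}' in the marked-up line, which is what the -3 offset below implies (the scanning loop itself is partially elided from this diff):

    solr_line = 'foo {{{bar}}} baz {{{qux}}}'
    raw = [(7, 10), (21, 24)]  # assumed raw rune positions in the marked line

    # Every earlier match contributed 6 marker runes ({{{ plus }}}), and the
    # current match's own {{{ contributes another 3.
    normalised = [(rs - 6 * i - 3, re - 6 * i - 3)
                  for i, (rs, re) in enumerate(raw)]
    # Positions now index into the plain text 'foo bar baz qux'
    assert normalised == [(4, 7), (12, 15)]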
@@ -71,138 +108,109 @@ def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
         re = e + sub_idx
         sub_idx += e

-        match_indexes.append((rs,
-                              re))
-
-    #print('MATCH_INDEXES:', match_indexes)
+        match_indexes.append((rs, re))
+
+    # Normalise the indices for the string without {{{ and }}}, this makes
+    # life easier later on
+    for idx in range(len(match_indexes)):
+        rs, re = match_indexes[idx]
+        match_indexes[idx] = (rs - 6 * idx - 3, re - 6 * idx - 3)

     if not len(match_indexes):
-        # XXX TODO FIXME: This needs to be a hard fail, but let's make it
-        # soft fail until I have more time to investigate. Might just need
-        # to add +3 to sub_idx for the start and ending case or something.
+        if solr_line.find('}}}') < solr_line.find('{{{'):
+            # XXX: Known bug, this happens because elastic currently matches
+            # across paragraph boundaries, which we do not support. We could
+            # extend our matching to support multi-paragraph matches, but
+            # we're probably going to change our elastic search to not match
+            # across paragraphs or even pages.
+            pass
+        else:
+            raise Exception('No match_indexes for %s' % repr(solr_line))
         return None

-    # Get words for this paragraph, so we can match the words against the
-    # rune indexes that we know we are interested in. We're going to count
-    # the amount of runes in a word (+1 for a space) and do that until we
-    # hit a match in match_indexes
+    # Now that we know where our matches are in the (bracket-free) text, we
+    # need to map that onto the hocr words: knowing just which text matched
+    # is not enough - we need the bounding boxes for the matching text.
     hocr_words = get_paragraph_hocr_words(hocr_par)
-    hocr_word_idx = 0
-    words = []
-
-    idx = 0
-    for (start, end) in match_indexes:
-        found = False
-        # Fast forward to start using hocr_words
-        for word in hocr_words[hocr_word_idx:]:
-            wl = len(word['text'] + ' ')
-            if idx + wl > start:
-                start_word_idx = hocr_word_idx
-                hocr_word_idx += 1
-                idx += wl
-                found = True
-                break
-            hocr_word_idx += 1
-            idx += wl
-
-        if not found:
-            # Hard fail if we fail to find the word
-            print('FAIL4')
-            print(solr_line)
-            print(hocr_text)
-            raise Exception('FAIL4')
-            #import sys; sys.exit(1)
-
-        # Add 3 for {{{. This is in the solr line, but not in our line.
-        idx += 3
-
-        found = False
-        for word in hocr_words[hocr_word_idx:]:
-            wl = len(word['text'] + ' ')
-            if idx + wl >= end:
-                end_word_idx = hocr_word_idx
-                hocr_word_idx += 1
-                idx += wl
-                found = True
-                break
-            hocr_word_idx += 1
-            idx += wl
-
-        # It is possible our match is the last word, so let's check for
-        # that.
-        if not found:
-            if idx + wl >= end:
-                found = True
-                end_word_idx = hocr_word_idx
-
-        if not found:
-            # Hard fail if we fail to find the word
-            print('FAIL4.1')
-            print(solr_line)
-            print(hocr_text)
-            raise Exception('FAIL4.1')
-            #import sys; sys.exit(1)
-
-        # Add 3 for }}}. This is in the solr line, but not in our line.
-        idx += 3
-
-        #words = hocr_words[start_word_idx:end_word_idx]
-        words.extend(hocr_words[start_word_idx:end_word_idx])
-
-    # TODO: sanity check: if search query occurs in the combined text of words
-
-    # boxes is a bounding box for each word, so just translate the hocr ones
-    # to what the receiving end expects.
+    word_matches = match_words(hocr_words, match_indexes)
+
+    # We have bounding boxes per word, but the current API doesn't permit us
+    # to pass multiple bounding boxes for a single match, so find the
+    # encompassing bounding box for each match.
     boxes = []
-    for word in words:
+    for words in word_matches:
         boxes.append({
-            'l': word['bbox'][0],
-            't': word['bbox'][1],
-            'r': word['bbox'][2],
-            'b': word['bbox'][3],
-            'page': page_no,
+            'l': min([x['bbox'][0] for x in words]),
+            't': min([x['bbox'][1] for x in words]),
+            'r': max([x['bbox'][2] for x in words]),
+            'b': max([x['bbox'][3] for x in words]),
         })

-    # I am not sure what this bounding box is in the original code, but
-    # let's assume it's the box that encompasses all words.
-    allword_bboxes = {
-        'l': min([x['bbox'][0] for x in words]),
-        't': min([x['bbox'][1] for x in words]),
-        'r': max([x['bbox'][2] for x in words]),
-        'b': max([x['bbox'][3] for x in words]),
-    }
-
-    r = allword_bboxes
+    # The box that encompasses all per-match boxes
+    left = None
+    top = None
+    right = None
+    bottom = None
+    for box in boxes:
+        left = box['l'] if left is None else min(left, box['l'])
+        top = box['t'] if top is None else min(top, box['t'])
+        right = box['r'] if right is None else max(right, box['r'])
+        bottom = box['b'] if bottom is None else max(bottom, box['b'])
+
+    all_boxes = {'l': left, 't': top, 'r': right, 'b': bottom}
+    r = all_boxes

     page_width, page_height = hocr_page_get_dimensions(page)

     r.update({'page': page_no,
               'boxes': boxes,
               'page_width': page_width,
-              'page_height': page_height,
-              })
+              'page_height': page_height})

     cur['par'].append(r)
     results.append(cur)

     return results


 def find_matches(lookup_table, hocrfp, text):
     text_byte_count = 0
     current_dat = None
     page_number = 0

+    # For every line in the highlighted text, find the matches...
     for line in text[:-1].split('\n'):
-        contains_match = '{{{' in line
-        if contains_match:
-            contains_match = '}}}' in line
+        # A line should contain both '{{{' and '}}}' for a complete match
+        contains_left_match = '{{{' in line
+        contains_right_match = '}}}' in line
+        contains_match = contains_left_match and contains_right_match
+
+        if (contains_left_match or contains_right_match) and not contains_match:
+            # Matches span multiple lines...
+            pass

         if contains_match:
-            page_number, new_dat = hocr_lookup_by_plaintext_offset(lookup_table,
-                                                                   text_byte_count)
+            page_number, new_dat = \
+                hocr_lookup_by_plaintext_offset(lookup_table,
+                                                text_byte_count)
+
+            # Check if we need to change/reload our page and paragraphs
+            # variables
             if new_dat != current_dat:
                 # Only do this if we're on a new page
                 current_dat = new_dat
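
The per-match reduction in find_word_boxes above collapses a group of word boxes into one encompassing box; a minimal standalone sketch (toy values):

    # Two hOCR words that form a single match; bbox is (l, t, r, b).
    words = [
        {'text': 'British', 'bbox': (10, 20, 55, 40)},
        {'text': 'Mission', 'bbox': (60, 18, 110, 42)},
    ]

    # Smallest box that encloses every word in the match, as in the new code
    match_box = {
        'l': min(w['bbox'][0] for w in words),  # 10
        't': min(w['bbox'][1] for w in words),  # 18
        'r': max(w['bbox'][2] for w in words),  # 110
        'b': max(w['bbox'][3] for w in words),  # 42
    }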
@@ -210,34 +218,54 @@ def find_matches(lookup_table, hocrfp, text):
             paragraphs = hocr_page_to_word_data_fast(page)

             # Figure out what paragraph we are at, based on text length?
+            # Find the paragraph that contains this line: we know where the
+            # line starts, so we walk the paragraph texts on the page,
+            # adding up their character counts, until we reach the line
+            # start.
             page_start_at = current_dat[0]
             match_at = text_byte_count

             cnt = 0
             match = None
             for idx, paragraph in enumerate(paragraphs):
                 txt = hocr_paragraph_text(paragraph)
-                cnt += len(txt) + 1  # '\n'
+                # Add + 1 for the newline
+                cnt += len(txt) + 1
                 if page_start_at + cnt > match_at:
                     match = idx
                     break

             if match is None:
+                # This should never happen
                 raise Exception('Could not find any match!')

-            paragraph = paragraphs[match]
-            txt = hocr_paragraph_text(paragraph)
+            paragraph_words = paragraphs[match]
+            paragraph_txt = hocr_paragraph_text(paragraph_words)

-            if txt != line.replace('{{{', '').replace('}}}', ''):
-                raise Exception('Reconstructed text does not match:', 'TEXT', txt, 'LINE', line)
+            # TODO: We might want to remove this in the future, it's
+            # wasteful to do the replace again.
+            if paragraph_txt != line.replace('{{{', '').replace('}}}', ''):
+                raise Exception('Reconstructed text does not match')

-            word_results = find_word_boxes(line, txt, paragraph, page,
-                                           page_number)
-            yield word_results
+            word_results = find_word_boxes(line, paragraph_txt,
+                                           paragraph_words, page,
+                                           page_number)
+
+            # We currently (rarely) allow word_results to be None. This
+            # happens for example in a paragraph like this:
+            # " THE}}} CAMP OF {{{THE BRITISH MISSION AT ADOWA. From a
+            # Drawing by F. VILLIERs. the}}}"
+            # where elastic finds very strange matches, also across
+            # paragraphs, which we do not support. This is also special
+            # cased in find_word_boxes.
+            if word_results is not None:
+                yield word_results

-        if contains_match:
-            text_byte_count -= line.count('{{{') * 3
-            text_byte_count -= line.count('}}}') * 3
-        text_byte_count += len(line) + 1  # '\n'
+        # Correct for any {{{ and }}}, which are in the FTS text but not in
+        # our plaintext
+        subtract = 0
+        if contains_left_match:
+            subtract += line.count('{{{') * 3
+        if contains_right_match:
+            subtract += line.count('}}}') * 3
+
+        # Add the counted bytes, plus one for the newline
+        text_byte_count += len(line) - subtract + 1
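
A worked example of the byte-count bookkeeping at the end of the loop (toy values): the markers exist only in the FTS text, so their three runes each must be subtracted to keep text_byte_count aligned with the hOCR plaintext offsets:

    line = 'foo {{{bar}}} baz'   # highlighted FTS line, len() == 17
    plain = 'foo bar baz'        # corresponding hOCR plaintext, len() == 11

    subtract = line.count('{{{') * 3 + line.count('}}}') * 3  # 6
    # Matches the plaintext length plus one for the newline
    assert len(line) - subtract + 1 == len(plain) + 1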