Commit 14896264 authored by Merlijn Wajer
Browse files

hocr/parse: deal with em, strong in word elements

parent 9e505a69
......@@ -269,8 +269,22 @@ def hocr_page_to_word_data_fast(hocr_page):
has_ocrx_cinfo = 2
if wordbased:
# Words may contain additional nodes like <em>
while True:
children = word.getchildren()
if len(children) == 0:
if len(children) > 1:
raise ValueError('Not character based but word has multiple children?')
word = children[0]
rawtext = word.text
if word.text is None:
raise ValueError('Word with no text value?')
word_data.append({'bbox': box, 'text': rawtext,
'confidence': conf})
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment