Commit 5c15ef26 authored by Merlijn Wajer
Browse files

hocr/parse: deal with em/strong in normal parse as well

parent 099aa0f2
...@@ -162,7 +162,22 @@ def hocr_page_to_word_data(hocr_page, scaler=1): ...@@ -162,7 +162,22 @@ def hocr_page_to_word_data(hocr_page, scaler=1):
wordbased = False wordbased = False
if wordbased: if wordbased:
rawtext = word.text wword = word
# Words may contain additional nodes like <em>
while True:
children = wword.getchildren()
if len(children) == 0:
if len(children) > 1:
raise ValueError('Not character based but word has multiple children?')
wword = children[0]
rawtext = wword.text
if wword.text is None:
raise ValueError('Word with no text value?')
box =['title']).group(1).split() box =['title']).group(1).split()
box = [float(i) for i in box] box = [float(i) for i in box]
