hocr/parse: deal with em, strong in word elements

......@@ -269,8 +269,22 @@ def hocr_page_to_word_data_fast(hocr_page):
has_ocrx_cinfo = 2
if wordbased:
# Words may contain additional nodes like <em>
while True:
children = word.getchildren()
if len(children) == 0:
if len(children) > 1:
raise ValueError('Not character based but word has multiple children?')
word = children[0]
rawtext = word.text
if word.text is None:
raise ValueError('Word with no text value?')
word_data.append({'bbox': box, 'text': rawtext,
'confidence': conf})
