Aram Verstegen / archive-hocr-tools · Commit ebc4cfd2

Authored Feb 03, 2021 by Merlijn Wajer

WIP: move fts code to hocr

Parent: cc67a41d
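The change below moves the Solr-highlight matching code out of bin/fts-text-match and into a new hocr.fts module so it can be reused as a library. A minimal sketch of how a caller would drive the new find_matches generator, mirroring the loop the script now runs; the lookup_table/hocrfp/text wiring is elided in the hunks below, so the file names and the hocr_load_lookup_table(tablepath) / open_if_required(...) calls here are assumptions based on the script's imports, not part of the commit:

    import json
    import sys

    from hocr.fts import find_matches
    from hocr.searching import hocr_load_lookup_table
    from hocr.util import open_if_required

    # Hypothetical inputs: a plaintext file with {{{...}}} highlight markers, the matching
    # hOCR file, and the plaintext-offset lookup table built for it.
    tablepath = 'book.lookup'
    hocrfile = 'book_hocr.html.gz'
    textfile = 'book_highlights.txt'

    lookup_table = hocr_load_lookup_table(tablepath)           # assumed signature, as imported by the script
    hocrfp = open_if_required(hocrfile)                        # returns a binary file object, as used in the script
    text = open_if_required(textfile).read().decode('utf-8')

    # find_matches is a generator: one result (or None on a soft failure) per highlighted line.
    for word_results in find_matches(lookup_table, hocrfp, text):
        json.dump(word_results, sys.stdout)
        sys.stdout.write('\n')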
3 changed files
bin/fts-text-match
@@ -5,168 +5,9 @@ import argparse
import json

from hocr.parse import hocr_page_to_word_data_fast, hocr_page_get_dimensions
from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat, \
        hocr_lookup_by_plaintext_offset
from hocr.searching import hocr_load_lookup_table
from hocr.util import open_if_required
from hocr.text import hocr_paragraph_text, get_paragraph_hocr_words

# XXX: remove this
import re
re_braces = re.compile(r'(\{\{\{|\}\}\})')


def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
    match_with = solr_line
    cur = {'text': solr_line, 'par': []}
    results = []

    if re_braces.sub('', cur['text']) != hocr_text:
        # XXX: Let's not accept mismatches at the moment.
        print('solr_line', repr(solr_line))
        print('hocr_text:', hocr_text)
        print('FAIL2')
        import sys; sys.exit(1)

        cur['error'] = 'mismatch'
        match_number += 1
        results.append((match_number, cur))
        return results

    # Contains a tuple for each match, with the starting and ending rune
    match_indexes = []

    # Match solr_line words to hocr_par words
    # TODO: This could be improved some, see below on dealing with
    # match_indexes being empty.
    sub_idx = 0
    while True:
        s = solr_line[sub_idx:].find('{{{')
        if s == -1:
            break
        rs = s + sub_idx + 3
        sub_idx += s

        e = solr_line[sub_idx:].find('}}}')
        if e == -1:
            break
        re = e + sub_idx
        sub_idx += e

        match_indexes.append((rs, re))

    #print('MATCH_INDEXES:', match_indexes)
    if not len(match_indexes):
        # XXX TODO FIXME: This needs to be a hard fail, but let's make it
        # soft fail until I have more time to investigate. Might just need
        # to add +3 to sub_idx for the start and ending case or something.
        return None

    # Get words for this paragraph, so we can match the words against the
    # rune indexes that we know we are interested in. We're going to count
    # the amount of runes in a word (+1 for a space) and do that until we
    # hit a match in match_indexes
    hocr_words = get_paragraph_hocr_words(hocr_par)
    hocr_word_idx = 0

    words = []
    idx = 0

    for (start, end) in match_indexes:
        found = False

        # Fast forward to start using hocr_words
        for word in hocr_words[hocr_word_idx:]:
            wl = len(word['text'] + ' ')
            if idx + wl > start:
                start_word_idx = hocr_word_idx
                hocr_word_idx += 1
                idx += wl
                found = True
                break

            hocr_word_idx += 1
            idx += wl

        if not found:
            # Hard fail if we fail to find the word
            print('FAIL4')
            print(solr_line)
            print(hocr_text)
            import sys; sys.exit(1)

        # Add 3 for {{{. This is in the solr line, but not in our line.
        idx += 3

        found = False
        for word in hocr_words[hocr_word_idx:]:
            wl = len(word['text'] + ' ')
            if idx + wl >= end:
                end_word_idx = hocr_word_idx
                hocr_word_idx += 1
                idx += wl
                found = True
                break

            hocr_word_idx += 1
            idx += wl

        # It is possible our match is the last word, so let's check for
        # that.
        if not found:
            if idx + wl >= end:
                found = True
                end_word_idx = hocr_word_idx

        if not found:
            # Hard fail if we fail to find the word
            print('FAIL4.1')
            print(solr_line)
            print(hocr_text)
            import sys; sys.exit(1)

        # Add 3 for }}}. This is in the solr line, but not in our line.
        idx += 3

        #words = hocr_words[start_word_idx:end_word_idx]
        words.extend(hocr_words[start_word_idx:end_word_idx])

    # TODO: sanity check: if search query occurs in the combined text of words

    # boxes is a bounding box for each word, so just translate the hocr ones
    # to what the receiving end expects.
    boxes = []
    for word in words:
        boxes.append({
            'l': word['bbox'][0],
            't': word['bbox'][1],
            'r': word['bbox'][2],
            'b': word['bbox'][3],
        })

    # I am not sure what this bounding box is in the original code, but
    # let's assume it's the box that encompasses all words.
    allword_bboxes = {
        'l': min([x['bbox'][0] for x in words]),
        't': min([x['bbox'][1] for x in words]),
        'r': max([x['bbox'][2] for x in words]),
        'b': max([x['bbox'][3] for x in words]),
    }

    r = allword_bboxes

    page_width, page_height = hocr_page_get_dimensions(page)

    r.update({
        'page': page_no,
        'boxes': boxes,
        'page_width': page_width,
        'page_height': page_height,
    })

    cur['par'].append(r)
    results.append(cur)

    return results


from hocr.fts import find_matches


def process_file(hocrfile, textfile, tablepath):
@@ -176,60 +17,9 @@ def process_file(hocrfile, textfile, tablepath):
    textfp = open_if_required(textfile)
    text = textfp.read().decode('utf-8')

    text_byte_count = 0
    current_dat = None
    page_number = 0

    for line in text[:-1].split('\n'):
        contains_match = '{{{' in line
        if contains_match:
            contains_match = '}}}' in line

        if contains_match:
            page_number, new_dat = hocr_lookup_by_plaintext_offset(lookup_table, text_byte_count)

            if new_dat != current_dat:
                # Only do this if we're on a new page
                current_dat = new_dat
                page = hocr_lookup_page_by_dat(hocrfp, current_dat)
                paragraphs = hocr_page_to_word_data_fast(page)

            # Figure out what paragraph we are at, based on text length?
            page_start_at = current_dat[0]
            match_at = text_byte_count

            cnt = 0
            match = None
            for idx, paragraph in enumerate(paragraphs):
                txt = hocr_paragraph_text(paragraph)
                cnt += len(txt) + 1  # '\n'
                if page_start_at + cnt > match_at:
                    match = idx
                    break

            if match is None:
                raise Exception('Could not find any match!')

            paragraph = paragraphs[match]
            txt = hocr_paragraph_text(paragraph)

            if txt != line.replace('{{{', '').replace('}}}', ''):
                raise Exception('No no no')

            word_results = find_word_boxes(line, txt, paragraph, page, page_number)

            json.dump(word_results, sys.stdout)
            sys.stdout.write('\n')

        if contains_match:
            text_byte_count -= line.count('{{{') * 3
            text_byte_count -= line.count('}}}') * 3

        text_byte_count += len(line) + 1  # '\n'

    for word_results in find_matches(lookup_table, hocrfp, text):
        json.dump(word_results, sys.stdout)
        sys.stdout.write('\n')


if __name__ == '__main__':
hocr/__init__.py
from . import parse, text, util, searching
from . import parse, text, util, searching, fts

from .version import __version__
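Adding fts to the package-level imports means the submodule is loaded together with the package, so after import hocr the new helpers are reachable as hocr.fts.find_matches, in addition to the direct import that bin/fts-text-match now uses:

    from hocr.fts import find_matches, find_word_boxes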
hocr/fts.py (new file, mode 100644)
from hocr.parse import hocr_page_to_word_data_fast, hocr_page_get_dimensions
from hocr.searching import hocr_lookup_page_by_dat, \
        hocr_lookup_by_plaintext_offset
from hocr.text import get_paragraph_hocr_words, hocr_paragraph_text, \
        get_paragraph_hocr_words

"""
Highly experimental and unstable interface to retrieve page indexes and bounding
boxes for words matching a certain full-text-search query.

Requires an external system to perform the highlighting (bin/fts-text-annotate
can serve this purpose)

This API is highly unstable, the code is in need of cleanups, and there likely
isn't a big use for this particular file outside of Internet Archive purposes.
"""

# TODO: notes here about internal api, internal code, backwards compat, why the
# whole thing is so awkward, what search inside does, etc

# XXX: remove this, we do not need this in production
import re
re_braces = re.compile(r'(\{\{\{|\}\}\})')


# TODO rename and note unstable api
def find_word_boxes(solr_line, hocr_text, hocr_par, page, page_no):
    match_with = solr_line
    cur = {'text': solr_line, 'par': []}
    results = []

    # TODO: Let's not use regex here, we might not even need this check at all
    if re_braces.sub('', cur['text']) != hocr_text:
        # XXX: Let's not accept mismatches at the moment.
        print('solr_line', repr(solr_line))
        print('hocr_text:', hocr_text)
        print('FAIL2')
        import sys; sys.exit(1)

        cur['error'] = 'mismatch'
        match_number += 1
        results.append((match_number, cur))
        return results

    # Contains a tuple for each match, with the starting and ending rune
    match_indexes = []

    # Match solr_line words to hocr_par words
    # TODO: This could be improved some, see below on dealing with
    # match_indexes being empty.
    sub_idx = 0
    while True:
        s = solr_line[sub_idx:].find('{{{')
        if s == -1:
            break
        rs = s + sub_idx + 3
        sub_idx += s

        e = solr_line[sub_idx:].find('}}}')
        if e == -1:
            break
        re = e + sub_idx
        sub_idx += e

        match_indexes.append((rs, re))

    #print('MATCH_INDEXES:', match_indexes)
    if not len(match_indexes):
        # XXX TODO FIXME: This needs to be a hard fail, but let's make it
        # soft fail until I have more time to investigate. Might just need
        # to add +3 to sub_idx for the start and ending case or something.
        return None

    # Get words for this paragraph, so we can match the words against the
    # rune indexes that we know we are interested in. We're going to count
    # the amount of runes in a word (+1 for a space) and do that until we
    # hit a match in match_indexes
    hocr_words = get_paragraph_hocr_words(hocr_par)
    hocr_word_idx = 0

    words = []
    idx = 0

    for (start, end) in match_indexes:
        found = False

        # Fast forward to start using hocr_words
        for word in hocr_words[hocr_word_idx:]:
            wl = len(word['text'] + ' ')
            if idx + wl > start:
                start_word_idx = hocr_word_idx
                hocr_word_idx += 1
                idx += wl
                found = True
                break

            hocr_word_idx += 1
            idx += wl

        if not found:
            # Hard fail if we fail to find the word
            print('FAIL4')
            print(solr_line)
            print(hocr_text)
            import sys; sys.exit(1)

        # Add 3 for {{{. This is in the solr line, but not in our line.
        idx += 3

        found = False
        for word in hocr_words[hocr_word_idx:]:
            wl = len(word['text'] + ' ')
            if idx + wl >= end:
                end_word_idx = hocr_word_idx
                hocr_word_idx += 1
                idx += wl
                found = True
                break

            hocr_word_idx += 1
            idx += wl

        # It is possible our match is the last word, so let's check for
        # that.
        if not found:
            if idx + wl >= end:
                found = True
                end_word_idx = hocr_word_idx

        if not found:
            # Hard fail if we fail to find the word
            print('FAIL4.1')
            print(solr_line)
            print(hocr_text)
            import sys; sys.exit(1)

        # Add 3 for }}}. This is in the solr line, but not in our line.
        idx += 3

        #words = hocr_words[start_word_idx:end_word_idx]
        words.extend(hocr_words[start_word_idx:end_word_idx])

    # TODO: sanity check: if search query occurs in the combined text of words

    # boxes is a bounding box for each word, so just translate the hocr ones
    # to what the receiving end expects.
    boxes = []
    for word in words:
        boxes.append({
            'l': word['bbox'][0],
            't': word['bbox'][1],
            'r': word['bbox'][2],
            'b': word['bbox'][3],
        })

    # I am not sure what this bounding box is in the original code, but
    # let's assume it's the box that encompasses all words.
    allword_bboxes = {
        'l': min([x['bbox'][0] for x in words]),
        't': min([x['bbox'][1] for x in words]),
        'r': max([x['bbox'][2] for x in words]),
        'b': max([x['bbox'][3] for x in words]),
    }

    r = allword_bboxes

    page_width, page_height = hocr_page_get_dimensions(page)

    r.update({
        'page': page_no,
        'boxes': boxes,
        'page_width': page_width,
        'page_height': page_height,
    })

    cur['par'].append(r)
    results.append(cur)

    return results


def find_matches(lookup_table, hocrfp, text):
    text_byte_count = 0
    current_dat = None
    page_number = 0

    for line in text[:-1].split('\n'):
        contains_match = '{{{' in line
        if contains_match:
            contains_match = '}}}' in line

        if contains_match:
            page_number, new_dat = hocr_lookup_by_plaintext_offset(lookup_table, text_byte_count)

            if new_dat != current_dat:
                # Only do this if we're on a new page
                current_dat = new_dat
                page = hocr_lookup_page_by_dat(hocrfp, current_dat)
                paragraphs = hocr_page_to_word_data_fast(page)

            # Figure out what paragraph we are at, based on text length?
            page_start_at = current_dat[0]
            match_at = text_byte_count

            cnt = 0
            match = None
            for idx, paragraph in enumerate(paragraphs):
                txt = hocr_paragraph_text(paragraph)
                cnt += len(txt) + 1  # '\n'
                if page_start_at + cnt > match_at:
                    match = idx
                    break

            if match is None:
                raise Exception('Could not find any match!')

            paragraph = paragraphs[match]
            txt = hocr_paragraph_text(paragraph)

            if txt != line.replace('{{{', '').replace('}}}', ''):
                raise Exception('No no no')

            word_results = find_word_boxes(line, txt, paragraph, page, page_number)

            yield word_results

        if contains_match:
            text_byte_count -= line.count('{{{') * 3
            text_byte_count -= line.count('}}}') * 3

        text_byte_count += len(line) + 1  # '\n'
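For reference, a sketch of the shape that find_word_boxes returns and find_matches yields for each highlighted line; the keys follow from the code above, while the concrete numbers are made up for illustration (the soft-failure path can also yield None):

    [
        {
            'text': 'a line with one {{{matched}}} word',   # the Solr line, markers included
            'par': [
                {
                    # bounding box around all matched words, in hOCR page coordinates
                    'l': 120, 't': 340, 'r': 310, 'b': 372,
                    'page': 17,                              # page number within the hOCR file
                    'boxes': [                               # one box per matched word
                        {'l': 120, 't': 340, 'r': 310, 'b': 372},
                    ],
                    'page_width': 2480,
                    'page_height': 3508,
                }
            ]
        }
    ]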