Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Merlijn Wajer
Python Derivermodule
Commits
8920a58e
Commit
8920a58e
authored
Jan 15, 2021
by
Merlijn Wajer
Browse files
documentation, misc functions
parent
9ce6ec09
Changes
7
Hide whitespace changes
Inline
Side-by-side
derivermodule/const.py
View file @
8920a58e
#: Directory of the item files within the container
PB_ITEM = '/item'

#: Directory of the task related JSON files within the container
PB_TASK = '/task'

#: Temporary (hard disk backed) temporary directory (auto cleaned upon exit)
PB_TMP = '/tmp'

#: Temporary fast (memory backed) temporary directory (auto cleaned upon exit)
#: Limited in size, currently 1.5G
PB_FAST = '/var/tmp/fast'
derivermodule/files.py
0 → 100644
View file @
8920a58e
import
pathlib
from
.const
import
PB_ITEM
def canonical_item_filename(filepath, root=None):
    """
    Normalises an absolute path to a file in the container to a canonical path.

    For example '/item/test_item/test_directory/test_file_hocr.html.gz' would be
    turned into 'test_item/test_directory/test_file_hocr.html.gz'.

    Args:

    * filepath (``str``): Absolute path to the file within the container
    * root (``str``, optional): Base directory to make the path relative to.
      Defaults to `PB_ITEM` ('/item').

    Returns:

    * canonical path (``str``)

    Raises:

    * ValueError: if `filepath` does not live under `root`
      (``pathlib.PurePath.relative_to`` semantics).
    """
    # Default resolved at call time so the constant is only required when
    # no explicit root is given (keeps the function testable in isolation).
    if root is None:
        root = PB_ITEM
    return str(pathlib.Path(filepath).relative_to(pathlib.Path(root)))
derivermodule/imagestack.py
View file @
8920a58e
...
...
@@ -7,12 +7,11 @@ from .logger import logger
def
get_imagestack_info
(
task_info
):
"""
Reads the source_format from the task information (parse with
task.get_task_info)
and returns a dictionary with the following information:
task.get_task_info)
.
{
'image_type': 'jp2' or 'jpg' or 'tif',
'archive_type': 'zip' or 'tar'.
}
Returns a dictionary with the following information:
>>> { 'image_type': 'jp2' or 'jpg' or 'tif', 'archive_type': 'zip' or 'tar'. }
"""
source_format
=
task_info
[
'sourceFormat'
]
if
source_format
==
'Single Page Processed JP2 ZIP'
:
...
...
@@ -29,20 +28,26 @@ def get_imagestack_info(task_info):
return
{
'image_type'
:
'tif'
,
'archive_type'
:
'tar'
}
raise
Exception
(
'Unhandled imagestack format: %s'
%
source_format
)
def
unpack_and_validate_imagestack
(
imagestack_path
,
imagestack_info
,
dst
):
"""
Unpack and validate an imagestack
"""Unpack and validate an imagestack
An imagestack is valid if it contains at least one directory that contains
at least one image of the expected image type.
Args:
imagestack_path (str): The imagestack archive path
imagestack_info (dict): {'archive_type': ..., 'image_type': ...}
dst (str): Destination directory for the unpacked imagestack
* imagestack_path (``str``): The imagestack archive path
* imagestack_info (``dict``)::
>>> {'archive_type': ..., 'image_type': ...}
* dst (``str``): Destination directory for the unpacked imagestack
Returns:
(str, int): Tuple containing the path to the unpacked image directory
and the image count.
* ``(str, int)``: Tuple containing the path to the unpacked image directory
and the image count.
"""
logger
.
info
(
'Unpacking image stack.'
)
start_time
=
time
()
...
...
derivermodule/logger.py
View file @
8920a58e
...
...
@@ -22,6 +22,10 @@ def set_level_critical():
ch
.
setLevel
(
logging
.
CRITICAL
)
def
get_logger
(
logger_name
):
"""
Returns an instance of ``logging.Logger`` with a specific log format and
logging level.
"""
DEFAULT_LOGGING_LEVEL
=
logging
.
INFO
LOG_FORMAT
=
'%(asctime)s %(levelname)-8s %(message)s'
...
...
@@ -35,4 +39,5 @@ def get_logger(logger_name):
return
logger
#: Default logger with name ``'derivermodule'``
#: Shared module-wide instance created at import time via `get_logger`.
logger = get_logger('derivermodule')
derivermodule/metadata.py
View file @
8920a58e
from
os
import
rename
from
os
import
rename
,
stat
from
os.path
import
join
from
xml.etree
import
ElementTree
as
ET
from
xml.dom
import
minidom
import
hashlib
,
zlib
from
subprocess
import
check_output
from
collections
import
OrderedDict
import
xmltodict
from
.const
import
PB_ITEM
def
parse_item_metadata
(
path
):
tree
=
ET
.
parse
(
path
)
root
=
tree
.
getroot
()
...
...
@@ -28,7 +34,28 @@ def parse_item_metadata(path):
return
md
def load_item_metadata(identifier):
    """
    Returns the metadata of an item as a (modifiable) dictionary.

    Args:

    * identifier (``str``): Identifier of the item

    Returns:

    * item metadata (``dict``)

    Example usage::

        from derivermodule.task import get_task_info
        from derivermodule.metadata import load_item_metadata
        info = get_task_info()
        identifier = info['identifier']
        metadata = load_item_metadata(identifier)
    """
    # Context manager closes the handle even if parsing raises; the original
    # leaked the open file object (and misleadingly named it 'path').
    with open(join(PB_ITEM, '%s_meta.xml' % identifier)) as fp:
        return parse_item_metadata(fp)
...
...
@@ -57,12 +84,166 @@ def metadata_to_metaxml(metadata):
xml
=
check_output
([
'xmllint'
,
'--format'
,
'-'
],
input
=
result
)
return
xml
.
decode
(
'
UTF
-8'
)
return
xml
.
decode
(
'
utf
-8'
)
def write_item_metadata(identifier, metadata):
    """
    Write the (changed) item metadata dictionary to disk.

    This is required if changes made to the `metadata` are to persist.
    The `metadata` should be loaded using `load_item_metadata`.

    Args:

    * identifier (``str``): Identifier of the item
    * metadata (``dict``): Metadata, loaded by `load_item_metadata` and
      potentially modified.

    Returns:

    * Nothing
    """
    metaxml = metadata_to_metaxml(metadata)
    # Compute both paths once instead of repeating the join expressions.
    tmp_path = join(PB_ITEM, '%s_meta.xml_tmp' % identifier)
    final_path = join(PB_ITEM, '%s_meta.xml' % identifier)
    # Write to a temporary file and rename into place, so readers never see a
    # partially written _meta.xml.  The context manager guarantees the handle
    # is flushed and closed even if write() raises.
    with open(tmp_path, 'w+') as fp:
        fp.write(metaxml)
    rename(tmp_path, final_path)
def load_files_metadata(identifier):
    """
    Returns the file-level metadata of an item as an opaque (not to be directly
    modified) object. Use `lookup_file_metadata` to get a reference to the
    metadata of a specific file.

    Args:

    * identifier (``str``): Identifier of the item

    Returns:

    * object containing the file-level metadata of an item.
    """
    # Context manager replaces the manual open/read/close (which leaked the
    # handle if read() raised); the file object was also misnamed 'path'.
    with open(join(PB_ITEM, '%s_files.xml' % identifier)) as fp:
        data = fp.read()
    return xmltodict.parse(data)
## TODO: Perform a *LOT* of testing on this
#def write_files_metadata(identifier, metadata):
# """
# Writes the (changed) file level metadata dictionary to disk.
# This is required if changes made to the `metadata` are to persist.
# The `metadata` should be loaded using `load_files_metadata` and not changed
# directly, only by calling `lookup_file_metadata` and changing the values in
# the result of that function.
#
# Args:
#
# * identifier (``str``): Identifier of the item
# * metadata (``dict``): Metadata, loaded by `load_files_metadata` and
# potentially modified.
#
# Returns:
#
# * Nothing
# """
# # TODO: Sort by @name entries, alphabetically?
# result = xmltodict.unparse(metadata, pretty=True).encode('utf-8')
# metafilesxml = check_output(['xmllint', '--format', '-'], input=result)
# fp = open(join(PB_ITEM, '%s_files.xml_tmp' % identifier), 'wb+')
# fp.write(metafilesxml)
# fp.close()
# rename(join(PB_ITEM, '%s_files.xml_tmp' % identifier), join(PB_ITEM, '%s_files.xml' % identifier))
def lookup_file_metadata(files_metadata, filename):
    """
    Fetch file-level metadata for a specific file, as a dictionary
    (``collections.OrderedDict``)

    Args:

    * files_metadata: files metadata as returned by `load_files_metadata`.
    * filename (``str``): filename/path canonical (relative) to the item

    Returns:

    File-level metadata if it exists (``dict``) or ``None``.
    """
    # XXX: in doc, mention that changing the properties here should reflect in
    # final written file
    file_list = files_metadata['files']['file']
    # xmltodict parses a single <file> element to a dict rather than a list;
    # normalise so iteration below is uniform (same guard as used in
    # scandata.scandata_get_page_count).
    if not isinstance(file_list, list):
        file_list = [file_list]
    for file_info in file_list:
        if file_info['@name'] == filename:
            return file_info
    return None
##: Indicates that a file is an original file (used with create_file_metadata)
#SOURCE_ORIGINAL = 'original'
#
##: Indicates that a file is a derivative file (used with create_file_metadata)
#SOURCE_DERIVATIVE = 'derivative'
#
#def create_file_metadata(files_metadata, filename, source=SOURCE_ORIGINAL, fileformat=None):
# if lookup_file_metadata(files_metadata, filename) is not None:
# raise ValueError('%s already exists in files_metadata' % filename)
#
# # XXX: in doc, mention that one should be really careful with this
# if source not in (SOURCE_ORIGINAL, SOURCE_DERIVATIVE):
# raise ValueError('Invalid source type.')
#
# if fileformat is None:
# raise ValueError('Please specify a valid fileformat')
#
# entry = OrderedDict([('@name', filename),
# ('@source', source),
# ('format', fileformat)])
#
# return entry
#
#
#def append_file_metadata(files_metadata, file_entry):
# # TODO: for doc, requires file to exist on disk in PB_ITEM
#
# #md = _calculate_file_metadata(join(PB_ITEM, file_entry['@name']))
# #file_entry.update(md)
# files_metadata['files']['file'].append(file_entry)
#
#
#def _calculate_file_metadata(filename):
# data = {}
# data.update(_calc_hashes(filename))
#
# stat_data = stat(filename)
# data['mtime'] = stat_data.st_mtime
# data['size'] = stat_data.st_size
#
# return data
#
#
#def _calc_hashes(filename):
# fd = open(filename, 'rb')
# crc32 = 0
# md5 = hashlib.md5()
# sha1 = hashlib.sha1()
#
# while True:
# s = fd.read(65536)
# if not s:
# break
# crc32 = zlib.crc32(s, crc32)
# md5.update(s)
# sha1.update(s)
#
# fd.close()
#
# data = {'crc32': ('%08X' % crc32).lower(),
# 'md5': md5.hexdigest(),
# 'sha1': sha1.hexdigest() }
#
# return data
derivermodule/scandata.py
View file @
8920a58e
...
...
@@ -8,16 +8,19 @@ import xmltodict
from
.const
import
PB_ITEM
,
PB_TMP
from
.logger
import
logger
def
get_scandata_xml
(
identifier
,
source_file
):
"""
Parses the scandata.xml file for a given identifier and source_file.
Args:
* identifier (str): Identifier of the item
* source_file (str): sourceFile to be operated on
* identifier (``str``): Identifier of the item
* source_file (``str``): sourceFile to be operated on
Returns:
Returns the p
ath to the scandata (str) or None
* P
ath to the scandata (
``
str
``
) or None
"""
item_dir
=
Path
(
PB_ITEM
)
...
...
@@ -63,6 +66,14 @@ def get_scandata_xml(identifier, source_file):
def scandata_parse(scandata_path):
    """
    Parse scandata.xml to native Python format

    Args:

    * scandata_path (``str``): Path to the scandata

    Returns:

    * Scandata as dictionary
    """
    # Context manager closes the file handle; the original relied on the
    # garbage collector to close the anonymous open() result.
    with open(scandata_path, 'rb') as fp:
        return xmltodict.parse(fp.read())
...
...
@@ -70,7 +81,15 @@ def scandata_parse(scandata_path):
def
scandata_get_page_count
(
scandata
):
"""
Return the number of page elements in a parsed scandata object
Get the number of page elements in a parsed scandata object
Args:
* scandata (``dict``): Scandata as returned by `scandata_parse`.
Returns:
* The number of page elements (``int``)
"""
pages
=
scandata
.
get
(
'book'
,
{}).
get
(
'pageData'
,
{}).
get
(
'page'
,
[])
if
not
isinstance
(
pages
,
list
):
...
...
@@ -87,6 +106,11 @@ def scandata_get_skip_pages(scandata):
Args:
* scandata: Parsed scandata as returned by scandata_parse
Returns:
* Indexes of pages that should not added to access formats
(``list of int``)
"""
skip
=
[]
...
...
derivermodule/task.py
View file @
8920a58e
from
json
import
load
,
dump
from
os.path
import
join
from
os
import
environ
...
...
def get_task_info():
    """
    Parses the /task/task.json file and returns the parsed information as native
    python dictionary.

    Returns:

    * Task arguments (``dict``)
    """
    # Context manager closes the file handle; the original left the open()
    # result unclosed.
    with open(join(PB_TASK, 'task.json')) as fp:
        return load(fp)
def get_petabox_info():
    """
    Parses the /task/petabox.json file and returns the parsed information as
    native python dictionary.

    Returns:

    * Petabox info (``dict``)
    """
    # Context manager closes the file handle; the original left the open()
    # result unclosed.
    with open(join(PB_TASK, 'petabox.json')) as fp:
        return load(fp)
def write_extra_targets(extra_targets):
    """
    Create /task/extra_targets.json file based on `extra_targets`.

    Args:

    * `extra_targets`: A list of dictionaries, containing the filename as str
      in 'name', and optionally if the file is to be parsed as original in
      'mark_original', as boolean.

    Returns:

    * Nothing

    Example:

    >>> from derivermodule.files import canonical_item_filename
    >>> write_extra_targets([
    >>>     {'name': canonical_item_filename(target_file.replace('.txt', '-other.txt')),
    >>>      'mark_original': False}
    >>> ])
    """
    extra_targets_file = join(PB_TASK, 'extra_targets.json')
    # Context manager guarantees the JSON is flushed and the handle closed;
    # the original open/dump/close leaked the handle if dump() raised.
    with open(extra_targets_file, 'w+') as fp:
        dump(extra_targets, fp)
def
get_task_args
(
task_info
):
"""
Args:
* task_info (dict): task_info as returned by get_task_info()
Returns:
dict of task arguments
* Tasks arguments (``dict``)
"""
if
'task'
in
task_info
and
'args'
in
task_info
[
'task'
]:
return
task_info
[
'task'
][
'args'
]
...
...
@@ -55,17 +83,21 @@ def get_task_arg_or_environ_arg(task_args, name):
enviroment variables are upper cased and use underscores. The functions
applies the task argument to environment variable to the `name` argument.
So if the task argument is 'ocr-perform-foo', pass that as `name`, and the
function will also check the environment variables for 'OCR_PERFORM_FOO'.
So if the task argument is ``'ocr-perform-foo'``, pass that as ``name``, and
the function will also check the environment variables for
``'OCR_PERFORM_FOO'``.
Environment arguments are preferred over task arguments.
Environment arguments have higher precedence over task arguments by this
function.
Args:
* task_args: As returned by get_task_args
* task_args: As returned by
`
get_task_args
`
* name: Custom task argument name, implementation defined.
Returns the value of the argument, if present, and None otherwise.
Returns:
* the value of the argument, if present, and None otherwise.
"""
environ_name
=
name
.
replace
(
'-'
,
'_'
).
upper
()
if
environ_name
in
environ
:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment