Commit 8920a58e authored by Merlijn Wajer's avatar Merlijn Wajer
Browse files

documentation, misc functions

parent 9ce6ec09
#: Directory of the item files within the container
PB_ITEM = '/item'
#: Directory of the task related JSON files within the container
PB_TASK = '/task'
#: Hard-disk backed temporary directory (auto cleaned upon exit)
PB_TMP = '/tmp'
#: Fast (memory backed) temporary directory (auto cleaned upon exit)
#: Limited in size, currently 1.5G
PB_FAST = '/var/tmp/fast'
import pathlib
from .const import PB_ITEM
def canonical_item_filename(filepath):
    """
    Normalises an absolute path to a file in the container to a canonical path.

    For example '/item/test_item/test_directory/test_file_hocr.html.gz' would be
    turned into 'test_item/test_directory/test_file_hocr.html.gz'.

    Args:

    * filepath (``str``): Absolute path to the file within the container

    Returns:

    * canonical path (``str``)
    """
    # Strip the container item root (PB_ITEM) prefix from the absolute path.
    # pathlib raises ValueError if filepath does not live under PB_ITEM.
    return str(pathlib.Path(filepath).relative_to(pathlib.Path(PB_ITEM)))
......@@ -7,12 +7,11 @@ from .logger import logger
def get_imagestack_info(task_info):
"""
Reads the source_format from the task information (parse with
task.get_task_info) and returns a dictionary with the following information:
task.get_task_info).
{
'image_type': 'jp2' or 'jpg' or 'tif',
'archive_type': 'zip' or 'tar'.
}
Returns a dictionary with the following information:
>>> { 'image_type': 'jp2' or 'jpg' or 'tif', 'archive_type': 'zip' or 'tar'. }
"""
source_format = task_info['sourceFormat']
if source_format == 'Single Page Processed JP2 ZIP':
......@@ -29,20 +28,26 @@ def get_imagestack_info(task_info):
return {'image_type': 'tif', 'archive_type': 'tar'}
raise Exception('Unhandled imagestack format: %s' % source_format)
def unpack_and_validate_imagestack(imagestack_path, imagestack_info, dst):
""" Unpack and validate an imagestack
"""Unpack and validate an imagestack
An imagestack is valid if it contains at least one directory that contains
at least one image of the expected image type.
Args:
imagestack_path (str): The imagestack archive path
imagestack_info (dict): {'archive_type': ..., 'image_type': ...}
dst (str): Destination directory for the unpacked imagestack
* imagestack_path (``str``): The imagestack archive path
* imagestack_info (``dict``)::
>>> {'archive_type': ..., 'image_type': ...}
* dst (``str``): Destination directory for the unpacked imagestack
Returns:
(str, int): Tuple containing the path to the unpacked image directory
and the image count.
* ``(str, int)``: Tuple containing the path to the unpacked image directory
and the image count.
"""
logger.info('Unpacking image stack.')
start_time = time()
......
......@@ -22,6 +22,10 @@ def set_level_critical():
ch.setLevel(logging.CRITICAL)
def get_logger(logger_name):
"""
Returns an instance of ``logging.Logger`` with a specific log format and
logging level.
"""
DEFAULT_LOGGING_LEVEL = logging.INFO
LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
......@@ -35,4 +39,5 @@ def get_logger(logger_name):
return logger
#: Default logger with name ``'derivermodule'``
logger = get_logger('derivermodule')
from os import rename
from os import rename, stat
from os.path import join
from xml.etree import ElementTree as ET
from xml.dom import minidom
import hashlib, zlib
from subprocess import check_output
from collections import OrderedDict
import xmltodict
from .const import PB_ITEM
def parse_item_metadata(path):
tree = ET.parse(path)
root = tree.getroot()
......@@ -28,7 +34,28 @@ def parse_item_metadata(path):
return md
def load_item_metadata(identifier):
    """
    Returns the metadata of an item as a (modifiable) dictionary.

    Args:

    * identifier (``str``): Identifier of the item

    Returns:

    * item metadata (``dict``)

    Example usage::

        from derivermodule.task import get_task_info
        from derivermodule.metadata import load_item_metadata
        info = get_task_info()
        identifier = info['identifier']
        metadata = load_item_metadata(identifier)
    """
    # Use a context manager so the file handle is always closed; the previous
    # implementation opened the file (misleadingly named 'path') and never
    # closed it.
    with open(join(PB_ITEM, '%s_meta.xml' % identifier)) as fp:
        return parse_item_metadata(fp)
......@@ -57,12 +84,166 @@ def metadata_to_metaxml(metadata):
xml = check_output(['xmllint', '--format', '-'], input=result)
return xml.decode('UTF-8')
return xml.decode('utf-8')
def write_item_metadata(identifier, metadata):
    """
    Write the (changed) item metadata dictionary to disk.
    This is required if changes made to the `metadata` are to persist.
    The `metadata` should be loaded using `load_item_metadata`.

    Args:

    * identifier (``str``): Identifier of the item
    * metadata (``dict``): Metadata, loaded by `load_item_metadata` and
      potentially modified.

    Returns:

    * Nothing
    """
    metaxml = metadata_to_metaxml(metadata)
    tmp_path = join(PB_ITEM, '%s_meta.xml_tmp' % identifier)
    # metadata_to_metaxml returns a str decoded from UTF-8, so write it back
    # out as UTF-8 explicitly rather than relying on the locale encoding.
    # The context manager guarantees the handle is closed even on error.
    with open(tmp_path, 'w+', encoding='utf-8') as fp:
        fp.write(metaxml)
    # Write-then-rename so readers never observe a partially written file.
    rename(tmp_path, join(PB_ITEM, '%s_meta.xml' % identifier))
def load_files_metadata(identifier):
    """
    Returns the file-level metadata of an item as an opaque (not to be directly
    modified) object. Use `lookup_file_metadata` to get a reference to the
    metadata of a specific file.

    Args:

    * identifier (``str``): Identifier of the item

    Returns:

    * object containing the file-level metadata of an item.
    """
    # Context manager guarantees the handle is closed even if read() or
    # parsing raises; the previous version leaked the handle in that case.
    with open(join(PB_ITEM, '%s_files.xml' % identifier)) as fp:
        return xmltodict.parse(fp.read())
## TODO: Perform a *LOT* of testing on this
#def write_files_metadata(identifier, metadata):
# """
# Writes the (changed) file level metadata dictionary to disk.
# This is required if changes made to the `metadata` are to persist.
# The `metadata` should be loaded using `load_files_metadata` and not changed
# directly, only by calling `lookup_file_metadata` and changing the values in
# the result of that function.
#
# Args:
#
# * identifier (``str``): Identifier of the item
# * metadata (``dict``): Metadata, loaded by `load_files_metadata` and
# potentially modified.
#
# Returns:
#
# * Nothing
# """
# # TODO: Sort by @name entries, alphabetically?
# result = xmltodict.unparse(metadata, pretty=True).encode('utf-8')
# metafilesxml = check_output(['xmllint', '--format', '-'], input=result)
# fp = open(join(PB_ITEM, '%s_files.xml_tmp' % identifier), 'wb+')
# fp.write(metafilesxml)
# fp.close()
# rename(join(PB_ITEM, '%s_files.xml_tmp' % identifier), join(PB_ITEM, '%s_files.xml' % identifier))
def lookup_file_metadata(files_metadata, filename):
    """
    Fetch file-level metadata for a specific file, as a dictionary
    (``collections.OrderedDict``)

    Changes made to the returned dictionary are reflected in `files_metadata`,
    since a reference (not a copy) is returned.

    Args:

    * files_metadata: files metadata as returned by `load_files_metadata`.
    * filename (``str``): filename/path canonical (relative) to the item

    Returns:

    File-level metadata if it exists (``dict``) or ``None``.
    """
    file_list = files_metadata['files']['file']
    # xmltodict returns a single dict (not a list) when the item contains
    # exactly one file entry; normalise to a list so iteration is uniform
    # (same pattern as scandata_get_page_count).
    if not isinstance(file_list, list):
        file_list = [file_list]
    for file_info in file_list:
        if file_info['@name'] == filename:
            return file_info
    return None
##: Indicates that a file is an original file (used with create_file_metadata)
#SOURCE_ORIGINAL = 'original'
#
##: Indicates that a file is a derivative file (used with create_file_metadata)
#SOURCE_DERIVATIVE = 'derivative'
#
#def create_file_metadata(files_metadata, filename, source=SOURCE_ORIGINAL, fileformat=None):
# if lookup_file_metadata(files_metadata, filename) is not None:
# raise ValueError('%s already exists in files_metadata' % filename)
#
# # XXX: in doc, mention that one should be really careful with this
# if source not in (SOURCE_ORIGINAL, SOURCE_DERIVATIVE):
# raise ValueError('Invalid source type.')
#
# if fileformat is None:
# raise ValueError('Please specify a valid fileformat')
#
# entry = OrderedDict([('@name', filename),
# ('@source', source),
# ('format', fileformat)])
#
# return entry
#
#
#def append_file_metadata(files_metadata, file_entry):
# # TODO: for doc, requires file to exist on disk in PB_ITEM
#
# #md = _calculate_file_metadata(join(PB_ITEM, file_entry['@name']))
# #file_entry.update(md)
# files_metadata['files']['file'].append(file_entry)
#
#
#def _calculate_file_metadata(filename):
# data = {}
# data.update(_calc_hashes(filename))
#
# stat_data = stat(filename)
# data['mtime'] = stat_data.st_mtime
# data['size'] = stat_data.st_size
#
# return data
#
#
#def _calc_hashes(filename):
# fd = open(filename, 'rb')
# crc32 = 0
# md5 = hashlib.md5()
# sha1 = hashlib.sha1()
#
# while True:
# s = fd.read(65536)
# if not s:
# break
# crc32 = zlib.crc32(s, crc32)
# md5.update(s)
# sha1.update(s)
#
# fd.close()
#
# data = {'crc32': ('%08X' % crc32).lower(),
# 'md5': md5.hexdigest(),
# 'sha1': sha1.hexdigest() }
#
# return data
......@@ -8,16 +8,19 @@ import xmltodict
from .const import PB_ITEM, PB_TMP
from .logger import logger
def get_scandata_xml(identifier, source_file):
"""
Parses the scandata.xml file for a given identifier and source_file.
Args:
* identifier (str): Identifier of the item
* source_file (str): sourceFile to be operated on
* identifier (``str``): Identifier of the item
* source_file (``str``): sourceFile to be operated on
Returns:
Returns the path to the scandata (str) or None
* Path to the scandata (``str``) or None
"""
item_dir = Path(PB_ITEM)
......@@ -63,6 +66,14 @@ def get_scandata_xml(identifier, source_file):
def scandata_parse(scandata_path):
    """
    Parse scandata.xml to native Python format

    Args:

    * scandata_path (``str``): Path to the scandata

    Returns:

    * Scandata as dictionary
    """
    # Context manager closes the file; the previous open(...).read() relied
    # on garbage collection to release the handle.
    with open(scandata_path, 'rb') as fp:
        return xmltodict.parse(fp.read())
......@@ -70,7 +81,15 @@ def scandata_parse(scandata_path):
def scandata_get_page_count(scandata):
"""
Return the number of page elements in a parsed scandata object
Get the number of page elements in a parsed scandata object
Args:
* scandata (``dict``): Scandata as returned by `scandata_parse`.
Returns:
* The number of page elements (``int``)
"""
pages = scandata.get('book', {}).get('pageData', {}).get('page', [])
if not isinstance(pages, list):
......@@ -87,6 +106,11 @@ def scandata_get_skip_pages(scandata):
Args:
* scandata: Parsed scandata as returned by scandata_parse
Returns:
* Indexes of pages that should not added to access formats
(``list of int``)
"""
skip = []
......
from json import load, dump
from os.path import join
from os import environ
......@@ -8,16 +9,26 @@ def get_task_info():
"""
Parses the /task/task.json file and returns the parsed information as native
python dictionary.
Returns:
* Task arguments (``dict``)
"""
return load(open(join(PB_TASK, 'task.json')))
def get_petabox_info():
    """
    Parses the /task/petabox.json file and returns the parsed information as
    native python dictionary.

    Returns:

    * Petabox info (``dict``)
    """
    # Context manager closes the file; load(open(...)) leaked the handle.
    with open(join(PB_TASK, 'petabox.json')) as fp:
        return load(fp)
def write_extra_targets(extra_targets):
"""
Create /task/extras_targets.json file based on `extra_targets`.
......@@ -27,19 +38,36 @@ def write_extra_targets(extra_targets):
* `extra_targets`: A list of dictionaries, containing the filename as str
in 'name', and optionally if the file is to be parsed as original in
'mark_original', as boolean.
Returns:
* Nothing
Example:
>>> from derivermodule.files import canonical_item_filename
>>> write_extra_targets([
>>> {'name': canonical_item_filename(target_file.replace('.txt', '-other.txt')),
>>> 'mark_original': False}
>>> ])
"""
extra_targets_file = join(PB_TASK, 'extra_targets.json')
fp = open(extra_targets_file, 'w+')
dump(extra_targets, fp)
fp.close()
def get_task_args(task_info):
"""
Args:
* task_info (dict): task_info as returned by get_task_info()
Returns:
dict of task arguments
* Tasks arguments (``dict``)
"""
if 'task' in task_info and 'args' in task_info['task']:
return task_info['task']['args']
......@@ -55,17 +83,21 @@ def get_task_arg_or_environ_arg(task_args, name):
enviroment variables are upper cased and use underscores. The functions
applies the task argument to environment variable to the `name` argument.
So if the task argument is 'ocr-perform-foo', pass that as `name`, and the
function will also check the environment variables for 'OCR_PERFORM_FOO'.
So if the task argument is ``'ocr-perform-foo'``, pass that as ``name``, and
the function will also check the environment variables for
``'OCR_PERFORM_FOO'``.
Environment arguments are preferred over task arguments.
Environment arguments have higher precedence over task arguments by this
function.
Args:
* task_args: As returned by get_task_args
* task_args: As returned by `get_task_args`
* name: Custom task argument name, implementation defined.
Returns the value of the argument, if present, and None otherwise.
Returns:
* the value of the argument, if present, and None otherwise.
"""
environ_name = name.replace('-', '_').upper()
if environ_name in environ:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment