Commit 9ce6ec09 authored by Merlijn Wajer's avatar Merlijn Wajer
Browse files

doc: initial version

parent 6e5c23ef
# Minimal makefile for Sphinx documentation
#
# Every target is handed to sphinx-build's "make mode" (-M); this file only
# defines where the sources live and where the output goes.

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is declared phony so that the catch-all pattern rule below is
# never used to try to rebuild this file itself.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
# Recipe lines must start with a hard TAB, otherwise make stops with
# "missing separator".
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Sphinx build configuration for the Internet Archive Deriver Module docs.
#
# Only a handful of common options are set here. The complete list of
# available options is documented at:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# Nothing is added to sys.path at the moment. Should autodoc need modules
# that live in another directory, insert that directory (as an absolute
# path, e.g. via os.path.abspath) into sys.path here:
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'Internet Archive Deriver Module'
author = 'Merlijn Wajer <merlijn@archive.org>, Derek Fukumori'
copyright = '2020 - 2021, Merlijn Wajer <merlijn@archive.org>, Derek Fukumori'

# -- General configuration ---------------------------------------------------

# Sphinx extensions to load, given as module names. Both entries ship with
# Sphinx itself ('sphinx.ext.*'); custom extensions can be listed as well.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']

# Directories (relative to this file) that are searched for templates.
templates_path = ['_templates']

# Patterns (relative to the source directory) for files and directories that
# are skipped when collecting sources. These patterns also apply to
# html_static_path and html_extra_path.
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# Theme for the HTML and HTML Help output; see the Sphinx documentation for
# the list of builtin themes. 'alabaster' was the previously configured value:
#
#html_theme = 'alabaster'
html_theme = 'nature'

# Directories (relative to this file) holding custom static files such as
# style sheets. They are copied after the builtin static files, so a file
# named "default.css" here overwrites the builtin "default.css".
html_static_path = ['_static']
.. _const:
Constants
=========
Contains various constants related to the environment of the container.
Example usage::
from os.path import join
from derivermodule.const import PB_ITEM
test_txt_in_item = join(PB_ITEM, 'test.txt')
.. automodule:: derivermodule.const
:members:
.. _files:
File related functions
======================
.. automodule:: derivermodule.files
:members:
.. _helloworld:
Example deriver module
======================
This module calls ``hocr-fold-chars`` from the ``archive-hocr-tools`` package,
and then gzips the content.
.. TODO: Annotate code some more
.. code-block:: python
#!/usr/bin/env python3
"""Example deriver module: run ``hocr-fold-chars`` on the source file and gzip
the result into the target file.

The task description (identifier, source/target file, target format) comes
from ``get_task_info()``. After the conversion, the target is reported via
``write_extra_targets`` and an entry for it is appended to the files metadata.
Exits with status 1 if either external command fails.
"""
import sys

from os.path import join, basename
from subprocess import check_call, CalledProcessError

from derivermodule.logger import get_logger
from derivermodule.files import canonical_item_filename
from derivermodule.task import get_task_info, write_extra_targets
from derivermodule.metadata import load_item_metadata, write_item_metadata, \
    load_files_metadata, write_files_metadata, create_file_metadata, \
    append_file_metadata, SOURCE_DERIVATIVE
from derivermodule.const import PB_TMP, PB_ITEM, PB_FAST

from version import VERSION

logger = get_logger('hocr-char-to-word')


if __name__ == '__main__':
    logger.info('hocr-char-to-word module version %s', VERSION)

    info = get_task_info()

    identifier = info['identifier']
    metadata = load_item_metadata(identifier)
    files_metadata = load_files_metadata(identifier)

    source_file = info['sourceFile']
    target_file = info['targetFile']
    target_format = info['targetFormat']

    logger.info('sourceFile: \'%s\' -> targetFile \'%s\'',
                source_file, target_file)

    # Strip '.gz', create in /tmp
    # NOTE(review): the slice assumes target_file ends in '.gz' -- confirm.
    target_file_plain = join(PB_TMP, basename(target_file[:-3]))

    # Call hocr-fold-chars from our hocr package. The child process writes
    # straight to the file descriptor; the `with` block closes the file even
    # when we bail out through sys.exit().
    with open(target_file_plain, 'wb') as plain_fd:
        try:
            check_call(['hocr-fold-chars', '-f', source_file], stdout=plain_fd)
        except CalledProcessError:
            print('FATAL: hocr-fold-chars failed in conversion', file=sys.stderr)
            sys.exit(1)

    # Compress with gzip. The output is binary data, so open in binary mode.
    with open(target_file, 'wb') as gzip_fd:
        try:
            check_call(['gzip', '-c', target_file_plain], stdout=gzip_fd)
        except CalledProcessError:
            print('FATAL: hocr-fold-chars failed in compression', file=sys.stderr)
            sys.exit(1)

    # Mark _hocr.html.gz as derivative (mark_original=False)
    write_extra_targets([
        {'name': canonical_item_filename(target_file), 'mark_original': False}
    ])

    # Create _files.xml entry for our file
    # The file metadata should not exist, since we would not be running in this
    # module otherwise (target_file is being made, so it cannot exist)
    file_md = create_file_metadata(files_metadata,
                                   canonical_item_filename(target_file),
                                   source=SOURCE_DERIVATIVE,
                                   fileformat=target_format)
    file_md['chocr_to_word_module_version'] = VERSION

    # Append the entry to the files_metadata
    append_file_metadata(files_metadata, file_md)

    # Write changes, if any.
    write_item_metadata(identifier, metadata)
    write_files_metadata(identifier, files_metadata)
.. _imagestack:
Archive.org image stacks
========================
.. automodule:: derivermodule.imagestack
:members:
.. Internet Archive Deriver Module documentation master file, created by
sphinx-quickstart on Fri Jan 15 17:06:03 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to Internet Archive Deriver Module's documentation!
===========================================================
Introduction
------------
You have landed on the documentation for the Internet Archive's
``derivermodule``. This is a module used internally to ease the creation of new
so-called "deriver modules" in Python. Deriver modules are pieces of code that
operate on uploaded files, creating "derivative" formats (for example, if one
uploads a ``flac`` audio file, a derivative ``opus`` file could be created to
allow for quicker downloads of a lossy version of the audio).
Concept: the container environment
----------------------------------
All deriver modules based on the ``derivermodule`` (this) library currently run
in a Docker container.
The container environment has the following paths and files set up::
/item/
<identifier>_meta.xml
<identifier>_files.xml
<other item files>
/task/
task.json
petabox.json
/tmp
/var/tmp/fast
TODO: General description of various files in docker derive process, what
metadata is stored where, etc.
task.json
~~~~~~~~~
Overview of keys:
* ``identifier``: identifier of the item (``str``)
* ``sourceFile``: absolute path to the source file for the derive (``str``)
* ``sourceFormat``: format of source file (``str``)
* ``targetFile``: absolute path to the target file for the derive (``str``)
* ``targetFormat``: format of the target file (``str``)
* ``?``
petabox.json
~~~~~~~~~~~~
Useful keys:
* ``statsd-server``: address of the statsd server (``str``)
* ``statsd-port``: port of the statsd server (``str``?)
* ``?``
Quickstart
----------
Check out the "Example deriver module" for a simple deriver module that uses
most of the functionality exposed by this library.
Additionally, the (internal) ``www/tesseract`` and ``www/pdf`` git repositories
might also be a good reference.
There is also this example repository, which is not python-specific:
https://git.archive.org/www/serverless
Including the module in your container
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TBD
Components
----------
.. toctree::
:maxdepth: 2
helloworld.rst
const.rst
logger.rst
metadata.rst
task.rst
files.rst
imagestack.rst
scandata.rst
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
.. _logger:
Logging
=======
.. automodule:: derivermodule.logger
:members:
.. _metadata:
Metadata
========
.. automodule:: derivermodule.metadata
:members:
.. _scandata:
Archive.org Scandata
====================
.. automodule:: derivermodule.scandata
:members:
.. _task:
Task related functions
======================
.. automodule:: derivermodule.task
:members:
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment