Commit daf4a88a authored by Merlijn Wajer's avatar Merlijn Wajer
Browse files

metadata: flush, sync and validate XML

It looks like we were seeing disks corrupting _meta.xml files on rare
occasions, so let's try extra hard to make sure that what we write to
the disk is what we wanted to write to the disk.
parent 9cc8b45b
from os import rename, stat
from os import rename, stat, remove, fsync, posix_fadvise, \
POSIX_FADV_DONTNEED, chmod, close
from os.path import join
from tempfile import mkstemp
from xml.etree import ElementTree as ET
from xml.dom import minidom
......@@ -8,12 +10,13 @@ import hashlib, zlib
from json import dump
from subprocess import check_output
from subprocess import check_output, check_call, CalledProcessError, \
DEVNULL, STDOUT
from collections import OrderedDict
import xmltodict
from .const import PB_ITEM, PB_TASK
from .const import PB_ITEM, PB_TASK, PB_TMP
def parse_item_metadata(path):
......@@ -106,10 +109,29 @@ def write_item_metadata(identifier, metadata):
* Nothing
"""
metaxml = metadata_to_metaxml(metadata)
fp = open(join(PB_ITEM, '%s_meta.xml_tmp' % identifier), 'w+')
fp, tmp_path = mkstemp(suffix='.xml', dir=PB_ITEM, text=True)
close(fp)
# We write the XML to a temporary file, flush it, sync it to disk, and
# finally tell Linux to drop it from the cache, to prevent the disk from
# silently corrupting the file before xmllint performs the final validation.
# Alternatively, we could write it to disk and read it from disk with
# O_DIRECT, and pass that to xmllint's stdin.
fp = open(tmp_path, 'w+')
fp.write(metaxml)
fp.flush()
fsync(fp.fileno())
posix_fadvise(fp.fileno(), 0, 0, POSIX_FADV_DONTNEED)
fp.close()
rename(join(PB_ITEM, '%s_meta.xml_tmp' % identifier), join(PB_ITEM, '%s_meta.xml' % identifier))
try:
check_call(['xmllint', '--format', tmp_path], stdout=DEVNULL, stderr=STDOUT)
except CalledProcessError:
remove(tmp_path)
raise
chmod(tmp_path, 0o644)
rename(tmp_path, join(PB_ITEM, '%s_meta.xml' % identifier))
def load_files_metadata(identifier):
......
......@@ -14,7 +14,10 @@ def get_task_info():
* Task arguments (``dict``)
"""
return load(open(join(PB_TASK, 'task.json')))
fp = open(join(PB_TASK, 'task.json'))
data = load(fp)
fp.close()
return data
def get_petabox_info():
......
__version__ = '1.0.0'
__version__ = '1.0.1'
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment