JsonIO atomic write improvements (#2132)

Use `async with Lock` instead of deprecated `with await lock` usage.

Forces a file fsync prior and a directory fsync (where available) after rename to prevent issues with left behind temp files.

Also should clarify: this is not threadsafe. Comments were clarified, function names remain misleading.
This commit is contained in:
Michael H 2018-09-24 04:29:14 -04:00 committed by Toby Harradine
parent 404800c556
commit 84ac5f3952

View File

@ -5,8 +5,8 @@ import asyncio
import logging
from uuid import uuid4
# This is basically our old DataIO, except that it's now threadsafe
# and just a base for much more elaborate classes
# This is basically our old DataIO and just a base for much more elaborate classes
# This still isn't completely threadsafe, (do not use config in threads)
from pathlib import Path
log = logging.getLogger("red")
@ -27,18 +27,50 @@ class JsonIO:
# noinspection PyUnresolvedReferences
def _save_json(self, data, settings=PRETTY):
"""
This fsync stuff here is entirely neccessary.
On windows, it is not available in entirety.
If a windows user ends up with tons of temp files, they should consider hosting on
something POSIX compatible, or using the mongo backend instead.
Most users wont encounter this issue, but with high write volumes,
without the fsync on both the temp file, and after the replace on the directory,
There's no real durability or atomicity guarantee from the filesystem.
In depth overview of underlying reasons why this is needed:
https://lwn.net/Articles/457667/
Also see:
http://man7.org/linux/man-pages/man2/open.2.html#NOTES (synchronous I/O section)
And:
https://www.mjmwired.net/kernel/Documentation/filesystems/ext4.txt#310
"""
log.debug("Saving file {}".format(self.path))
filename = self.path.stem
tmp_file = "{}-{}.tmp".format(filename, uuid4().fields[0])
tmp_path = self.path.parent / tmp_file
with tmp_path.open(encoding="utf-8", mode="w") as f:
json.dump(data, f, **settings)
f.flush() # This does get closed on context exit, ...
os.fsync(f.fileno()) # but that needs to happen prior to this line
tmp_path.replace(self.path)
# pylint: disable=E1101
try:
fd = os.open(self.path.parent, os.O_DIRECTORY)
os.fsync(fd)
except AttributeError:
fd = None
finally:
if fd is not None:
os.close(fd)
async def _threadsafe_save_json(self, data, settings=PRETTY):
loop = asyncio.get_event_loop()
func = functools.partial(self._save_json, data, settings)
with await self._lock:
async with self._lock:
await loop.run_in_executor(None, func)
# noinspection PyUnresolvedReferences
@ -51,6 +83,5 @@ class JsonIO:
async def _threadsafe_load_json(self, path):
loop = asyncio.get_event_loop()
func = functools.partial(self._load_json, path)
task = loop.run_in_executor(None, func)
with await self._lock:
return await asyncio.wait_for(task)
async with self._lock:
return await loop.run_in_executor(None, func)