Kill JsonIO and fix JSON driver caching issues (#2796)

* Kill JsonIO and fix JSON driver caching issues

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>

* Ensure lock covers critical region in set()

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>

* Make tests pass

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>

* Use pickle over deepcopy in Config

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>

* Fix temp instance creation

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>

* Serialise value before doing anything in set()

Signed-off-by: Toby Harradine <tobyharradine@gmail.com>
Authored by Toby Harradine on 2019-06-28 04:49:45 +10:00; committed by Michael H
parent f3bbfdc64d
commit bff7e214ab
9 changed files with 133 additions and 157 deletions

.github/CODEOWNERS (vendored)

@@ -16,7 +16,6 @@ redbot/core/drivers/* @tekulvw
 redbot/core/events.py @tekulvw
 redbot/core/global_checks.py @tekulvw
 redbot/core/i18n.py @tekulvw
-redbot/core/json_io.py @tekulvw
 redbot/core/modlog.py @palmtree5
 redbot/core/rpc.py @tekulvw
 redbot/core/utils/chat_formatting.py @tekulvw

redbot/__main__.py

@@ -3,6 +3,7 @@
 # Discord Version check
 import asyncio
+import json
 import logging
 import os
 import sys
@@ -12,7 +13,6 @@ import discord
 import redbot.logging
 from redbot.core.bot import Red, ExitCodes
 from redbot.core.cog_manager import CogManagerUI
-from redbot.core.json_io import JsonIO
 from redbot.core.global_checks import init_global_checks
 from redbot.core.events import init_events
 from redbot.core.cli import interactive_config, confirm, parse_cli_flags
@@ -61,7 +61,8 @@ def list_instances():
         )
         sys.exit(1)
     else:
-        data = JsonIO(data_manager.config_file)._load_json()
+        with data_manager.config_file.open(encoding="utf-8") as fs:
+            data = json.load(fs)
         text = "Configured Instances:\n\n"
         for instance_name in sorted(data.keys()):
             text += "{}\n".format(instance_name)

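Taken out of the diff, the replacement above is just the standard-library idiom for reading a small JSON registry; a minimal standalone sketch (the hard-coded path is an illustrative assumption — the real one comes from appdirs via data_manager):

    import json
    import sys
    from pathlib import Path

    # Hypothetical stand-in for data_manager.config_file.
    config_file = Path.home() / ".config" / "Red-DiscordBot" / "config.json"

    def list_instances() -> None:
        if not config_file.exists():
            print("No instances have been configured!")
            sys.exit(1)
        with config_file.open(encoding="utf-8") as fs:
            data = json.load(fs)
        print("Configured Instances:\n")
        for instance_name in sorted(data):
            print(instance_name)

The context manager guarantees the handle is closed even if json.load raises, which JsonIO._load_json also did internally; the win here is dropping the extra abstraction, not a behaviour change.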
redbot/core/config.py

@@ -1,8 +1,8 @@
-import logging
 import collections
-from copy import deepcopy
-from typing import Any, Union, Tuple, Dict, Awaitable, AsyncContextManager, TypeVar
+import logging
+import pickle
+import weakref
+from typing import Any, Union, Tuple, Dict, Awaitable, AsyncContextManager, TypeVar

 import discord
@@ -57,7 +57,7 @@ class _ValueCtxManager(Awaitable[_T], AsyncContextManager[_T]):  # pylint: disab
                 "list or dict) in order to use a config value as "
                 "a context manager."
             )
-        self.__original_value = deepcopy(self.raw_value)
+        self.__original_value = pickle.loads(pickle.dumps(self.raw_value, -1))
         return self.raw_value

     async def __aexit__(self, exc_type, exc, tb):
@@ -204,7 +204,7 @@ class Group(Value):
     @property
     def defaults(self):
-        return deepcopy(self._defaults)
+        return pickle.loads(pickle.dumps(self._defaults, -1))

     async def _get(self, default: Dict[str, Any] = ...) -> Dict[str, Any]:
         default = default if default is not ... else self.defaults
@@ -446,7 +446,7 @@
                 result = self.nested_update(value, defaults.get(key, {}))
                 defaults[key] = result
             else:
-                defaults[key] = deepcopy(current[key])
+                defaults[key] = pickle.loads(pickle.dumps(current[key], -1))
         return defaults

     async def set(self, value):
@@ -558,7 +558,7 @@ class Config:
     @property
     def defaults(self):
-        return deepcopy(self._defaults)
+        return pickle.loads(pickle.dumps(self._defaults, -1))

     @staticmethod
     def _create_uuid(identifier: int):
@@ -727,7 +727,7 @@
         if key not in self._defaults:
             self._defaults[key] = {}

-        data = deepcopy(kwargs)
+        data = pickle.loads(pickle.dumps(kwargs, -1))

         for k, v in data.items():
             to_add = self._get_defaults_dict(k, v)
@@ -988,7 +988,7 @@
             pass
         else:
             for k, v in dict_.items():
-                data = deepcopy(defaults)
+                data = pickle.loads(pickle.dumps(defaults, -1))
                 data.update(v)
                 ret[int(k)] = data
@@ -1066,7 +1066,7 @@
         ret = {}
         defaults = self.defaults.get(self.MEMBER, {})
         for member_id, member_data in guild_data.items():
-            new_member_data = deepcopy(defaults)
+            new_member_data = pickle.loads(pickle.dumps(defaults, -1))
             new_member_data.update(member_data)
             ret[int(member_id)] = new_member_data
         return ret

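For context on the deepcopy-to-pickle swap running through this file: Config's defaults are plain JSON-shaped data (nested dicts, lists, strings, numbers), and for such objects a pickle round-trip at the highest protocol is usually considerably faster than copy.deepcopy, which recurses in pure Python and memoizes every sub-object. A minimal sketch of the trade-off (the helper name and timings are ours, not the commit's):

    import copy
    import pickle
    import timeit

    def pickle_copy(obj):
        # -1 selects pickle.HIGHEST_PROTOCOL, the fastest binary format.
        return pickle.loads(pickle.dumps(obj, -1))

    sample = {str(i): {"prefix": ["!"], "mod_roles": [], "admin_roles": []} for i in range(1000)}

    print("deepcopy:", timeit.timeit(lambda: copy.deepcopy(sample), number=100))
    print("pickle:  ", timeit.timeit(lambda: pickle_copy(sample), number=100))

The caveat is that pickle_copy requires every value to be picklable and offers no __deepcopy__ hook for custom types; for JSON-storable config data both approaches are safe, and pickle is simply cheaper.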
redbot/core/core_commands.py

@@ -1359,9 +1359,9 @@ class Core(commands.Cog, CoreLogic):
                 docs = await db[c_name].find().to_list(None)
                 for item in docs:
                     item_id = str(item.pop("_id"))
-                    output = item
                     target = JSON(c_name, item_id, data_path_override=c_data_path)
-                    await target.jsonIO._threadsafe_save_json(output)
+                    target.data = item
+                    await target._save()
         backup_filename = "redv3-{}-{}.tar.gz".format(
             instance_name, ctx.message.created_at.strftime("%Y-%m-%d %H-%M-%S")
         )

redbot/core/data_manager.py

@@ -1,16 +1,17 @@
 import inspect
+import json
 import logging
 import os
 import sys
 import tempfile
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, Dict

 import appdirs
 from discord.utils import deprecated

 from . import commands
-from .json_io import JsonIO

 __all__ = [
     "create_temp_config",
@@ -25,12 +26,15 @@ __all__ = [
 log = logging.getLogger("red.data_manager")

-jsonio = None
 basic_config = None

 instance_name = None

-basic_config_default = {"DATA_PATH": None, "COG_PATH_APPEND": "cogs", "CORE_PATH_APPEND": "core"}
+basic_config_default: Dict[str, Any] = {
+    "DATA_PATH": None,
+    "COG_PATH_APPEND": "cogs",
+    "CORE_PATH_APPEND": "core",
+}

 config_dir = None
 appdir = appdirs.AppDirs("Red-DiscordBot")
@@ -57,9 +61,13 @@ def create_temp_config():
     default_dirs["STORAGE_TYPE"] = "JSON"
     default_dirs["STORAGE_DETAILS"] = {}

-    config = JsonIO(config_file)._load_json()
+    with config_file.open("r", encoding="utf-8") as fs:
+        config = json.load(fs)
     config[name] = default_dirs
-    JsonIO(config_file)._save_json(config)
+    with config_file.open("w", encoding="utf-8") as fs:
+        json.dump(config, fs, indent=4)


 def load_basic_configuration(instance_name_: str):
@@ -76,23 +84,21 @@ def load_basic_configuration(instance_name_: str):
         The instance name given by CLI argument and created during
         redbot setup.
     """
-    global jsonio
     global basic_config
     global instance_name
-    jsonio = JsonIO(config_file)
     instance_name = instance_name_

     try:
-        config = jsonio._load_json()
-        basic_config = config[instance_name]
+        with config_file.open(encoding="utf-8") as fs:
+            config = json.load(fs)
     except (FileNotFoundError, KeyError):
         print(
             "You need to configure the bot instance using `redbot-setup`"
             " prior to running the bot."
         )
         sys.exit(1)
+    else:
+        basic_config = config[instance_name]


 def _base_data_path() -> Path:

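The restructured load_basic_configuration leans on try/except/else semantics: the else block runs only when the try body raised nothing, and, importantly, an exception raised inside else is not caught by the preceding except clause. A tiny illustration of the shape (names are ours):

    import json

    def read_instance(config_file, name):
        try:
            with config_file.open(encoding="utf-8") as fs:
                config = json.load(fs)
        except FileNotFoundError:
            raise SystemExit("Run `redbot-setup` first.")
        else:
            # Only reached when the file was read successfully. A KeyError
            # raised here for an unknown instance name propagates; it is NOT
            # handled by the except clause above.
            return config[name]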
redbot/core/drivers/red_json.py

@@ -1,9 +1,12 @@
-from pathlib import Path
-import copy
-import weakref
 import asyncio
 import json
 import logging
-from ..json_io import JsonIO
+import os
+import pickle
+import weakref
+from pathlib import Path
+from typing import Any, Dict
+from uuid import uuid4

 from .red_base import BaseDriver, IdentifierData
@@ -59,13 +62,10 @@ class JSON(BaseDriver):
             self.data_path = data_path_override
         else:
             self.data_path = Path.cwd() / "cogs" / ".data" / self.cog_name

         self.data_path.mkdir(parents=True, exist_ok=True)

         self.data_path = self.data_path / self.file_name

-        self.jsonIO = JsonIO(self.data_path)
-
+        self._lock = asyncio.Lock()
         self._load_data()

     async def has_valid_connection(self) -> bool:
@@ -90,10 +90,12 @@
             return

         try:
-            self.data = self.jsonIO._load_json()
+            with self.data_path.open("r", encoding="utf-8") as fs:
+                self.data = json.load(fs)
         except FileNotFoundError:
             self.data = {}
-            self.jsonIO._save_json(self.data)
+            with self.data_path.open("w", encoding="utf-8") as fs:
+                json.dump(self.data, fs, indent=4)

     def migrate_identifier(self, raw_identifier: int):
         if self.unique_cog_identifier in self.data:
@@ -104,7 +106,7 @@
             if ident in self.data:
                 self.data[self.unique_cog_identifier] = self.data[ident]
                 del self.data[ident]
-                self.jsonIO._save_json(self.data)
+                _save_json(self.data_path, self.data)
                 break

     async def get(self, identifier_data: IdentifierData):
@@ -112,18 +114,23 @@
         full_identifiers = identifier_data.to_tuple()
         for i in full_identifiers:
             partial = partial[i]
-        return copy.deepcopy(partial)
+        return pickle.loads(pickle.dumps(partial, -1))

     async def set(self, identifier_data: IdentifierData, value=None):
         partial = self.data
         full_identifiers = identifier_data.to_tuple()
+        # This is both our deepcopy() and our way of making sure this value is actually JSON
+        # serializable.
+        value_copy = json.loads(json.dumps(value))
+        async with self._lock:
             for i in full_identifiers[:-1]:
                 if i not in partial:
                     partial[i] = {}
                 partial = partial[i]
-        partial[full_identifiers[-1]] = copy.deepcopy(value)
-        await self.jsonIO._threadsafe_save_json(self.data)
+            partial[full_identifiers[-1]] = value_copy
+            await self._save()

     async def clear(self, identifier_data: IdentifierData):
         partial = self.data
@@ -131,22 +138,28 @@
         try:
             for i in full_identifiers[:-1]:
                 partial = partial[i]
         except KeyError:
             pass
         else:
+            async with self._lock:
                 try:
                     del partial[full_identifiers[-1]]
                 except KeyError:
                     pass
                 else:
-                await self.jsonIO._threadsafe_save_json(self.data)
+                    await self._save()

     async def import_data(self, cog_data, custom_group_data):
-        def update_write_data(identifier_data: IdentifierData, data):
+        def update_write_data(identifier_data: IdentifierData, _data):
             partial = self.data
             idents = identifier_data.to_tuple()
             for ident in idents[:-1]:
                 if ident not in partial:
                     partial[ident] = {}
                 partial = partial[ident]
-            partial[idents[-1]] = data
+            partial[idents[-1]] = _data

+        async with self._lock:
             for category, all_data in cog_data:
                 splitted_pkey = self._split_primary_key(category, custom_group_data, all_data)
                 for pkey, data in splitted_pkey:
@@ -159,7 +172,53 @@
                         is_custom=category in custom_group_data,
                     )
                     update_write_data(ident_data, data)
-        await self.jsonIO._threadsafe_save_json(self.data)
+            await self._save()
+
+    async def _save(self) -> None:
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, _save_json, self.data_path, self.data)

     def get_config_details(self):
         return
+
+
+def _save_json(path: Path, data: Dict[str, Any]) -> None:
+    """
+    This fsync stuff here is entirely necessary.
+
+    On Windows, it is not available in its entirety.
+    If a Windows user ends up with tons of temp files, they should consider hosting on
+    something POSIX compatible, or using the mongo backend instead.
+
+    Most users won't encounter this issue, but with high write volumes, without the
+    fsync on both the temp file and, after the replace, on the directory, there is no
+    real durability or atomicity guarantee from the filesystem.
+
+    An in-depth overview of the underlying reasons why this is needed:
+        https://lwn.net/Articles/457667/
+
+    Also see:
+        http://man7.org/linux/man-pages/man2/open.2.html#NOTES (synchronous I/O section)
+
+    And:
+        https://www.mjmwired.net/kernel/Documentation/filesystems/ext4.txt#310
+    """
+    filename = path.stem
+    tmp_file = "{}-{}.tmp".format(filename, uuid4().fields[0])
+    tmp_path = path.parent / tmp_file
+    with tmp_path.open(encoding="utf-8", mode="w") as fs:
+        json.dump(data, fs, indent=4)
+        fs.flush()  # This does get closed on context exit, ...
+        os.fsync(fs.fileno())  # but that needs to happen prior to this line
+    tmp_path.replace(path)
+    try:
+        flag = os.O_DIRECTORY  # pylint: disable=no-member
+    except AttributeError:
+        pass
+    else:
+        fd = os.open(path.parent, flag)
+        try:
+            os.fsync(fd)
+        finally:
+            os.close(fd)
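Two properties of the rewritten driver can be checked in isolation: the json.loads(json.dumps(value)) round-trip both deep-copies the value and rejects anything that could never be persisted, and the per-driver asyncio.Lock keeps the cached dict from mutating mid-save. A small sketch of that discipline (TinyStore is our stand-in, not Red's API):

    import asyncio
    import json

    class TinyStore:
        def __init__(self):
            self.data = {}
            self._lock = asyncio.Lock()

        async def set(self, key, value):
            # Deep copy + JSON-serializability check in one step, as in set()
            # above: json.dumps raises TypeError for sets, custom objects,
            # etc. *before* we touch self.data, so a bad value can never
            # corrupt the in-memory cache.
            value_copy = json.loads(json.dumps(value))
            async with self._lock:
                self.data[key] = value_copy
                await asyncio.sleep(0)  # stands in for the executor disk write

    async def main():
        store = TinyStore()
        await store.set("prefix", ["!"])
        try:
            await store.set("bad", {1, 2, 3})  # sets are not JSON serializable
        except TypeError as exc:
            print("rejected:", exc)
        print(store.data)  # {'prefix': ['!']} -- unchanged by the failed set

    asyncio.run(main())

One side effect worth knowing: the round-trip stringifies integer dict keys (json.loads(json.dumps({1: "a"})) == {"1": "a"}), which matches what JSON storage would do to them anyway.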

redbot/core/json_io.py (deleted)

@@ -1,90 +0,0 @@
-import functools
-import json
-import os
-import asyncio
-import logging
-from copy import deepcopy
-from uuid import uuid4
-
-# This is basically our old DataIO and just a base for much more elaborate classes
-# This still isn't completely threadsafe, (do not use config in threads)
-from pathlib import Path
-
-log = logging.getLogger("red")
-
-PRETTY = {"indent": 4, "sort_keys": False, "separators": (",", " : ")}
-MINIFIED = {"sort_keys": False, "separators": (",", ":")}
-
-
-class JsonIO:
-    """Basic functions for atomic saving / loading of json files"""
-
-    def __init__(self, path: Path = Path.cwd()):
-        """
-        :param path: Full path to file.
-        """
-        self._lock = asyncio.Lock()
-        self.path = path
-
-    # noinspection PyUnresolvedReferences
-    def _save_json(self, data, settings=PRETTY):
-        """
-        This fsync stuff here is entirely neccessary.
-
-        On windows, it is not available in entirety.
-        If a windows user ends up with tons of temp files, they should consider hosting on
-        something POSIX compatible, or using the mongo backend instead.
-
-        Most users wont encounter this issue, but with high write volumes,
-        without the fsync on both the temp file, and after the replace on the directory,
-        There's no real durability or atomicity guarantee from the filesystem.
-
-        In depth overview of underlying reasons why this is needed:
-            https://lwn.net/Articles/457667/
-
-        Also see:
-            http://man7.org/linux/man-pages/man2/open.2.html#NOTES (synchronous I/O section)
-
-        And:
-            https://www.mjmwired.net/kernel/Documentation/filesystems/ext4.txt#310
-        """
-        filename = self.path.stem
-        tmp_file = "{}-{}.tmp".format(filename, uuid4().fields[0])
-        tmp_path = self.path.parent / tmp_file
-        with tmp_path.open(encoding="utf-8", mode="w") as f:
-            json.dump(data, f, **settings)
-            f.flush()  # This does get closed on context exit, ...
-            os.fsync(f.fileno())  # but that needs to happen prior to this line
-        tmp_path.replace(self.path)
-        # pylint: disable=no-member
-        try:
-            fd = os.open(self.path.parent, os.O_DIRECTORY)
-            os.fsync(fd)
-        except AttributeError:
-            fd = None
-        finally:
-            if fd is not None:
-                os.close(fd)
-
-    async def _threadsafe_save_json(self, data, settings=PRETTY):
-        loop = asyncio.get_event_loop()
-        # the deepcopy is needed here. otherwise,
-        # the dict can change during serialization
-        # and this will break the encoder.
-        data_copy = deepcopy(data)
-        func = functools.partial(self._save_json, data_copy, settings)
-        async with self._lock:
-            await loop.run_in_executor(None, func)
-
-    # noinspection PyUnresolvedReferences
-    def _load_json(self):
-        with self.path.open(encoding="utf-8", mode="r") as f:
-            data = json.load(f)
-        return data
-
-    async def _threadsafe_load_json(self, path):
-        loop = asyncio.get_event_loop()
-        func = functools.partial(self._load_json, path)
-        async with self._lock:
-            return await loop.run_in_executor(None, func)
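The deleted class also shows why the driver now owns exactly one lock per file: every JsonIO instance created its own asyncio.Lock in __init__, so two JsonIO objects pointed at the same path provided no mutual exclusion against each other, and "threadsafe" saves could interleave. A sketch of that hazard (illustrative names only, not Red's API):

    import asyncio

    class OwnLock:
        # Each instance gets a private lock, as JsonIO did.
        def __init__(self):
            self._lock = asyncio.Lock()

        async def save(self, label):
            async with self._lock:
                print(label, "entered critical section")
                await asyncio.sleep(0.1)  # simulated disk write
                print(label, "left critical section")

    async def main():
        a, b = OwnLock(), OwnLock()  # two handles to the "same file"
        # Both coroutines sit in their critical sections at once: distinct
        # lock objects give no cross-instance exclusion.
        await asyncio.gather(a.save("A"), b.save("B"))

    asyncio.run(main())

Moving the lock (and the cached dict) onto the single long-lived JSON driver instance closes that window.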

redbot/pytest/core.py

@@ -8,7 +8,6 @@ __all__ = ["cleanup_datamanager", "data_mgr_config", "cog_instance"]
 @pytest.fixture(autouse=True)
 def cleanup_datamanager():
     data_manager.basic_config = None
-    data_manager.jsonio = None


 @pytest.fixture()

redbot/setup.py

@@ -23,7 +23,6 @@ from redbot.core.data_manager import (
     core_data_path,
     storage_details,
 )
-from redbot.core.json_io import JsonIO
 from redbot.core.utils import safe_delete
 from redbot.core import Config
 from redbot.core.drivers import BackendType, IdentifierData
@@ -50,7 +49,8 @@ def load_existing_config():
     if not config_file.exists():
         return {}

-    return JsonIO(config_file)._load_json()
+    with config_file.open(encoding="utf-8") as fs:
+        return json.load(fs)


 instance_data = load_existing_config()
@@ -74,7 +74,9 @@ def save_config(name, data, remove=False):
             print("Not continuing")
             sys.exit(0)

     config[name] = data
-    JsonIO(config_file)._save_json(config)
+    with config_file.open("w", encoding="utf-8") as fs:
+        json.dump(config, fs, indent=4)


 def get_data_dir():
@@ -220,7 +222,7 @@ async def json_to_mongov2(instance):
         if "." in cog_name:
             # Garbage handler
             continue

-        with p.open(mode="r") as f:
+        with p.open(mode="r", encoding="utf-8") as f:
             cog_data = json.load(f)

         for identifier, all_data in cog_data.items():
             try:
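The encoding="utf-8" added to p.open() in the last hunk is easy to overlook but matters for portability: opening a text file without an explicit encoding uses locale.getpreferredencoding(False), which is frequently cp1252 on Windows, so cog data containing non-ASCII text could fail to round-trip. A quick demonstration (output varies by platform):

    import locale
    import tempfile
    from pathlib import Path

    print(locale.getpreferredencoding(False))  # e.g. 'cp1252' on many Windows hosts

    path = Path(tempfile.gettempdir()) / "enc_demo.json"
    path.write_text('{"name": "Pokémon"}', encoding="utf-8")
    try:
        # Reading with the locale default may raise (or silently mojibake)
        # wherever that default is not UTF-8.
        print(path.read_text())
    except UnicodeDecodeError as exc:
        print("decode failed:", exc)

Passing encoding="utf-8" on both the read and write sides, as every open() call touched by this commit now does, removes the platform dependence.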