Source code for biothings.hub.autoupdate.dumper

import json
import os
import re
from functools import partial
from urllib.parse import urljoin, urlparse

import boto3
import requests
from requests_aws4auth import AWS4Auth

from biothings import config as btconfig
from biothings.hub.dataload.dumper import DumperException, HTTPDumper
from biothings.utils.common import md5sum, uncompressall


class BiothingsDumper(HTTPDumper):
    """
    This dumper is used to maintain a BioThings API up-to-date. BioThings data
    is available either as an ElasticSearch snapshot (full update) or as a
    collection of diff files (incremental update).
    It will either download incremental updates and apply diffs, or trigger an
    ElasticSearch restore if the latest version is a full update.
    This dumper can also be configured with precedence rules: when both a full
    and an incremental update are available, rules can be set so the full update
    is preferred over the incremental one (size can also be considered when
    selecting the preferred way).
    """

    # URL pointing to the versions.json file, this is the main entry point
    VERSION_URL = None

    # set during autohub init
    SRC_NAME = None
    SRC_ROOT_FOLDER = None

    # Auto-deploy data update ?
    AUTO_UPLOAD = False

    # Optionally, a schedule can be used to automatically check for new versions
    # SCHEDULE = "0 9 * * *"

    # what backend the dumper should work with. Must be defined before instantiation
    # (can be an instance or a partial() returning an instance)
    TARGET_BACKEND = None

    # TODO: should we ensure ARCHIVE is always true ?
    # (ie. we have to keep all versions to apply them in order)

    # must be set before use if accessing a restricted bucket
    AWS_ACCESS_KEY_ID = None
    AWS_SECRET_ACCESS_KEY = None

    def __init__(self, *args, **kwargs):
        super(BiothingsDumper, self).__init__(*args, **kwargs)
        # list of build_version to download/apply, in order
        self._target_backend = None
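    # A minimal configuration sketch (hypothetical): "MyApiDumper", the URL and the
    # backend factory below are placeholders, not real endpoints; in practice these
    # attributes are normally filled in during autohub init.
    #
    #     class MyApiDumper(BiothingsDumper):
    #         SRC_NAME = "myapi"
    #         SRC_ROOT_FOLDER = "/data/myapi"
    #         VERSION_URL = "https://example-releases.s3.amazonaws.com/myapi/versions.json"
    #         AWS_ACCESS_KEY_ID = None        # set both keys to reach a restricted bucket
    #         AWS_SECRET_ACCESS_KEY = None
    #         TARGET_BACKEND = partial(create_target_backend)  # hypothetical factory returning the backend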
    def prepare_client(self):
        """
        Depending on the presence of credentials, inject authentication in client.get()
        """
        super().prepare_client()
        if self.__class__.AWS_ACCESS_KEY_ID and self.__class__.AWS_SECRET_ACCESS_KEY:
            self._client = requests.Session()
            self._client.verify = self.__class__.VERIFY_CERT

            def auth_get(url, *args, **kwargs):
                if ".s3-website-" in url:
                    raise DumperException("Can't access s3 static website using authentication")
                # extract region from URL (reliable ?)
                pat = re.compile(r"https?://(.*)\.(.*)\.amazonaws.com.*")
                m = pat.match(url)
                if m:
                    bucket_name, frag = m.groups()
                    # frag looks like "s3-us-west-2", whether static website is activated or not
                    region = frag.replace("s3-", "")
                    if region == "s3":
                        # url doesn't contain a region, we need to query the bucket
                        s3client = boto3.client(
                            "s3",
                            aws_access_key_id=self.__class__.AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=self.__class__.AWS_SECRET_ACCESS_KEY,
                        )
                        bucket_info = s3client.get_bucket_location(Bucket=bucket_name)
                        region = bucket_info["LocationConstraint"]
                    auth = AWS4Auth(
                        self.__class__.AWS_ACCESS_KEY_ID,
                        self.__class__.AWS_SECRET_ACCESS_KEY,
                        region,
                        "s3",
                    )
                    return self._client.get(url, auth=auth, *args, **kwargs)
                else:
                    raise DumperException(f"Couldn't determine s3 region from url '{url}'")

            self.client.get = auth_get
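    # A worked example of the region extraction above (URL is illustrative): for
    # "https://example-bucket.s3-us-west-2.amazonaws.com/myapi/versions.json" the
    # pattern captures bucket_name="example-bucket" and frag="s3-us-west-2", so the
    # region becomes "us-west-2"; for "https://example-bucket.s3.amazonaws.com/..."
    # frag is just "s3" and the region is resolved via get_bucket_location() instead.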
    @property
    def base_url(self):
        # add trailing / so urljoin won't remove folder from path
        return os.path.dirname(self.__class__.VERSION_URL) + "/"

    @property
    def target_backend(self):
        if not self._target_backend:
            if isinstance(self.__class__.TARGET_BACKEND, partial):
                self._target_backend = self.__class__.TARGET_BACKEND()
            else:
                self._target_backend = self.__class__.TARGET_BACKEND
        return self._target_backend
    async def get_target_backend(self):
        """
        Example:
        [{
            'host': 'es6.mygene.info:9200',
            'index': 'mygene_allspecies_20200823_ufkwdv79',
            'index_alias': 'mygene_allspecies',
            'version': '20200906',
            'count': 38729977
        }]
        """

        async def do():
            cnt = self.target_backend.count()
            return {
                "host": self.target_backend.target_esidxer.es_host,
                "index": self.target_backend.target_name,
                "index_alias": self.target_backend.target_alias,
                "version": self.target_backend.version,
                "count": cnt,
            }

        result = await do()
        return result
    async def reset_target_backend(self):
        async def do():
            if self.target_backend.target_esidxer.exists_index():
                self.target_backend.target_esidxer.delete_index()

        await do()
    def download(self, remoteurl, localfile, headers=None):
        headers = headers or {}
        self.prepare_local_folders(localfile)
        parsed = urlparse(remoteurl)
        if self.__class__.AWS_ACCESS_KEY_ID and self.__class__.AWS_SECRET_ACCESS_KEY:
            # accessing diffs controlled by auth
            key = parsed.path.strip("/")  # s3 keys are relative, no leading "/"
            # extract bucket name from URL (reliable?)
            pat = re.compile(r"^(.*?)\..*\.amazonaws.com")
            m = pat.match(parsed.netloc)
            if m:
                bucket_name = m.groups()[0]
            else:
                raise DumperException(f"Can't extract bucket name from URL '{remoteurl}'")
            return self.auth_download(bucket_name, key, localfile, headers)
        else:
            return self.anonymous_download(remoteurl, localfile, headers)
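    # A worked example of the parsing above (URL is illustrative): for
    # "https://example-bucket.s3-us-west-2.amazonaws.com/myapi/diff/1.pyobj",
    # parsed.netloc is "example-bucket.s3-us-west-2.amazonaws.com", so the
    # non-greedy group captures bucket_name="example-bucket", and the S3 key
    # becomes "myapi/diff/1.pyobj" once the leading "/" is stripped from the path.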
    def anonymous_download(self, remoteurl, localfile, headers=None):
        headers = headers or {}
        res = super(BiothingsDumper, self).download(remoteurl, localfile, headers=headers)
        # use S3 metadata to set local mtime
        # we add 1 second to make sure we wouldn't download remoteurl again
        # because remote is older by just a few milliseconds
        lastmodified = int(res.headers["x-amz-meta-lastmodified"]) + 1
        os.utime(localfile, (lastmodified, lastmodified))
        return res
    def auth_download(self, bucket_name, key, localfile, headers=None):
        headers = headers or {}
        session = boto3.Session(
            aws_access_key_id=self.__class__.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=self.__class__.AWS_SECRET_ACCESS_KEY,
        )
        bucket = session.resource("s3").Bucket(bucket_name)
        res = bucket.download_file(key, localfile)
        return res
    def check_compat(self, build_meta):
        if hasattr(btconfig, "SKIP_CHECK_COMPAT") and btconfig.SKIP_CHECK_COMPAT:
            return

        msg = []
        for version_field in ["app_version", "standalone_version", "biothings_version"]:
            VERSION_FIELD = version_field.upper()
            version = build_meta.get(version_field)
            assert version is not None, "Version field '%s' is None" % VERSION_FIELD
            # some releases use a dict (most recent), some use a string
            if isinstance(version, dict):
                version = version["branch"]
            if not isinstance(version, list):
                version = [version]
            # remove hash from versions (only useful when version is a string,
            # not a dict, see above)
            version = [re.sub(r"( \[.*\])", "", v) for v in version]
            version = set(version)
            if version == set([None]):
                raise DumperException(
                    f"Remote data is too old and can't be handled with current app ({version_field} not defined)"
                )
            versionfromconf = re.sub(r"( \[.*\])", "", getattr(btconfig, VERSION_FIELD).get("branch"))
            VERSION = set()
            VERSION.add(versionfromconf)
            found_compat_version = VERSION.intersection(version)
            assert found_compat_version, "Remote data requires %s to be %s, but current app is %s" % (
                version_field,
                version,
                VERSION,
            )
            msg.append("%s=%s:OK" % (version_field, version))
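    # A worked example of the check above (values are hypothetical): with
    # build_meta = {"biothings_version": {"branch": "master", ...}, ...} and
    # btconfig.BIOTHINGS_VERSION = {"branch": "master", ...}, the remote and
    # local sets intersect on {"master"} and the check passes; a mismatch
    # (e.g. remote "0.11.x" vs. local "master") raises an AssertionError.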
    def load_remote_json(self, url):
        res = self.client.get(url, allow_redirects=True)
        redirect = res.headers.get("x-amz-website-redirect-location")
        if redirect:
            parsed = urlparse(url)
            newurl = parsed._replace(path=redirect)
            res = self.client.get(newurl.geturl())
        if res.status_code != 200:
            self.logger.error(res)
            return None
        try:
            jsondat = json.loads(res.text)
            return jsondat
        except json.JSONDecodeError:
            self.logger.error(res.headers)
            self.logger.error(res)
            return None
    def compare_remote_local(self, remote_version, local_version, orig_remote_version, orig_local_version):
        # we need to have some data locally. do we already have it?
        if remote_version > local_version:
            self.logger.info(
                "Remote version '%s' is more recent than local version '%s', download needed"
                % (orig_remote_version, orig_local_version)
            )
            return True
        else:
            self.logger.info(
                "Remote version '%s' is the same as local version '%s'. Dump is waiting to be applied"
                % (orig_remote_version, orig_local_version)
            )
            return False
    def remote_is_better(self, remotefile, localfile):
        remote_dat = self.load_remote_json(remotefile)
        if not remote_dat:
            self.logger.info("Couldn't find any build metadata at url '%s'", remotefile)
            return False
        orig_remote_version = remote_dat["build_version"]

        local_dat = json.load(open(localfile))
        orig_local_version = local_dat["build_version"]

        # if diff version, we want to compare the right part (destination version)
        # local: "3.4", backend: "4". It's actually the same (4==4)
        local_version = orig_local_version.split(".")[-1]
        remote_version = orig_remote_version.split(".")[-1]
        if remote_version != orig_remote_version:
            self.logger.debug(
                "Remote version '%s' converted to '%s' "
                "(version that will be reached once incremental update has been applied)",
                orig_remote_version,
                remote_version,
            )
        if local_version != orig_local_version:
            self.logger.debug(
                "Local version '%s' converted to '%s' "
                "(version that had been reached using incremental update files)",
                orig_local_version,
                local_version,
            )

        backend_version = self.target_backend.version
        if backend_version is None:
            self.logger.info("No backend version found")
            return self.compare_remote_local(remote_version, local_version, orig_remote_version, orig_local_version)
        elif remote_version > backend_version:
            self.logger.info(
                "Remote version '%s' is more recent than backend version '%s'", orig_remote_version, backend_version
            )
            return self.compare_remote_local(remote_version, local_version, orig_remote_version, orig_local_version)
        else:
            self.logger.info("Backend version '%s' is up-to-date", backend_version)
            return False
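    # A worked example of the normalization above (version strings are hypothetical):
    # an incremental build_version "20201108.20201205" is reduced to its destination
    # part "20201205" by split(".")[-1], so it can be compared directly against a
    # full version such as "20201108" held locally or in the backend.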
    def choose_best_version(self, versions):
        """
        Out of all compatible versions, choose the best:
        1. choose incremental vs. full according to preferences
        2. version must be the highest (most up-to-date)
        """
        # 1st pass
        # TODO: implement inc/full preferences, for now prefer incremental
        if not versions:
            raise DumperException("No compatible version found")
        preferreds = [v for v in versions if "." in v["build_version"]]
        if preferreds:
            self.logger.info("Preferred versions (according to preferences): %s", preferreds)
            versions = preferreds
        # we can directly take the max because:
        # - version is a string
        # - format is YYYYMMDD
        # - when incremental, it's always old_version.new_version
        return max(versions, key=lambda e: e["build_version"])
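    # A worked example (metadata is hypothetical): given
    #   versions = [{"build_version": "20201205"}, {"build_version": "20201108.20201205"}]
    # the incremental entry (its build_version contains ".") is preferred, and
    # max() over the "YYYYMMDD[.YYYYMMDD]" strings then returns the most recent one.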
    def find_update_path(self, version, backend_version=None):
        """
        Explore available versions and find the path to update the hub up to "version",
        starting from given backend_version (typically the current version found in the ES index).
        If backend_version is None (typically no index yet), a complete path will be returned,
        from the last compatible "full" release up to the latest "diff" update.
        Returned is a list of dict, where each dict is a build metadata element containing
        information about each update (see versions.json); the order of the list describes
        the order in which the updates should be performed.
        """
        avail_versions = self.load_remote_json(self.__class__.VERSION_URL)
        assert avail_versions["format"] == "1.0", "versions.json format has changed: %s" % avail_versions["format"]
        if version == "latest":
            version = avail_versions["versions"][-1]["build_version"]
            self.logger.info("Asking for latest version, ie. '%s'" % version)
        self.logger.info(
            "Find update path to bring data from version '%s' up to version '%s'", backend_version, version
        )
        file_url = urljoin(self.base_url, "%s.json" % version)
        build_meta = self.load_remote_json(file_url)
        if not build_meta:
            raise Exception(f"Can't get remote build information about version '{version}' (url was '{file_url}')")
        self.check_compat(build_meta)
        if build_meta["target_version"] == backend_version:
            self.logger.info("Backend is up-to-date, version '%s'", backend_version)
            return []
        # explored from newer to older, each require_version being compatible with the next target_version,
        # except when a version is a full update, ie. no previous version required (end of the path)
        path = [build_meta]
        # latest points to a new version, 2 options there:
        # - update is a full snapshot: nothing to download if type=s3, one archive to download if type=fs,
        #   we then need to trigger a restore
        # - update is an incremental, we need to check if the incremental is compatible with current version
        if build_meta["type"] == "incremental":
            # require_version contains the compatible version for which we can apply the diff
            # let's compare...
            if backend_version == build_meta["require_version"]:
                self.logger.info(
                    "Diff update '%s' requires version '%s', which is compatible with current backend version, "
                    "download update",
                    build_meta["build_version"],
                    build_meta["require_version"],
                )
            else:
                self.logger.info(
                    "Diff update '%s' requires version '%s'",
                    build_meta["build_version"],
                    build_meta["require_version"],
                )
                self.logger.info("Now looking for a compatible version")
                # by default we'll check the required version directly
                compatibles = [v for v in avail_versions["versions"] if v["target_version"] == version.split(".")[0]]
                self.logger.info("Compatible versions from which we can apply this update: %s", compatibles)
                best_version = self.choose_best_version(compatibles)
                self.logger.info("Best version found: '%s'" % best_version)
                # keep this version as part of the update path
                # fill the path from older to newer (no extend or append)
                path = self.find_update_path(best_version["build_version"], backend_version) + path
        else:
            # full, just keep it as-is (it's already part of path during init, see above)
            pass

        return path
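    # A sketch of a possible return value (versions are hypothetical), assuming the
    # backend currently holds "20201108" and "latest" resolves to "20201205.20210103":
    #   [
    #       {"type": "incremental", "build_version": "20201108.20201205",
    #        "require_version": "20201108", "target_version": "20201205", ...},
    #       {"type": "incremental", "build_version": "20201205.20210103",
    #        "require_version": "20201205", "target_version": "20210103", ...},
    #   ]
    # i.e. the updates are listed from older to newer, each diff applying on top of
    # the previous target_version.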
[docs] def create_todump_list(self, force=False, version="latest", url=None): assert self.__class__.VERSION_URL, "VERSION_URL class attribute is not set" self.logger.info("Dumping version '%s'" % version) file_url = url or urljoin(self.base_url, "%s.json" % version) # if "latest", we compare current json file we have (because we don't know what's behind latest) # otherwise json file should match version explicitely in current folder. version = version == "latest" and self.current_release or version try: current_localfile = os.path.join(self.current_data_folder, f"{version}.json") # check it actually exists (if data folder was deleted by src_dump still refers to # this folder, this file won't exist) if not os.path.exists(current_localfile): self.logger.error("Local file '%s' doesn't exist", current_localfile) raise FileNotFoundError dump_status = self.src_doc.get("download", {}).get("status") if dump_status != "success": self.logger.error("Found dump information but status is '%s', will ignore current dump", dump_status) raise TypeError except (TypeError, FileNotFoundError): # current data folder doesn't even exist current_localfile = None self.logger.info("Local file: %s" % current_localfile) self.logger.info("Remote url : %s" % file_url) self.logger.info("Force: %s" % force) if force or current_localfile is None or self.remote_is_better(file_url, current_localfile): # manually get the diff meta file (ie. not using download() because we don't know the version yet, # it's in the diff meta build_meta = self.load_remote_json(file_url) self.check_compat(build_meta) if not build_meta: raise Exception(f"Can't get remote build information about version '{version}' (url was '{file_url}')") if build_meta["type"] == "incremental": self.release = build_meta["build_version"] # ok, now we can use download() # we will download it again during the normal process so we can then compare # when we have new data release new_localfile = os.path.join(self.new_data_folder, f"{self.release}.json") self.to_dump.append({"remote": file_url, "local": new_localfile}) # get base url (used later to get diff files) metadata_url = build_meta["metadata"]["url"] base_url = os.path.dirname(metadata_url) + "/" # "/" or urljoin will remove previous fragment... 
new_localfile = os.path.join(self.new_data_folder, os.path.basename(metadata_url)) self.download(metadata_url, new_localfile) metadata = json.load(open(new_localfile)) remote_files = metadata["diff"]["files"] if metadata["diff"]["mapping_file"]: remote_files.append(metadata["diff"]["mapping_file"]) for md5_fname in remote_files: fname = md5_fname["name"] p = urlparse(fname) if not p.scheme: # this is a relative path furl = urljoin(base_url, fname) else: # this is a true URL furl = fname new_localfile = os.path.join(self.new_data_folder, os.path.basename(fname)) self.to_dump.append({"remote": furl, "local": new_localfile}) else: # it's a full snapshot release, it always can be applied self.release = build_meta["build_version"] new_localfile = os.path.join(self.new_data_folder, "%s.json" % self.release) self.to_dump.append({"remote": file_url, "local": new_localfile}) # ------------------------------- # TODO review # ------------------------------- # if repo type is fs, we assume metadata contains url to archive # repo_name = list( # build_meta["metadata"]["repository"].keys())[0] # if build_meta["metadata"]["repository"][repo_name]["type"] == "fs": if build_meta["metadata"]["repository"]["type"] == "fs": archive_url = build_meta["metadata"]["archive_url"] archive = os.path.basename(archive_url) new_localfile = os.path.join(self.new_data_folder, archive) self.to_dump.append({"remote": archive_url, "local": new_localfile}) # unset this one, as it may not be pickelable (next step is "download", which # uses different processes and need workers to be pickled) self._target_backend = None return self.release
    def post_dump(self, *args, **kwargs):
        if not self.release:
            # wasn't set before, means no need to post-process (ie. up-to-date, already done)
            return
        build_meta = json.load(open(os.path.join(self.new_data_folder, f"{self.release}.json")))
        if build_meta["type"] == "incremental":
            self.logger.info("Checking md5sum for files in '%s'" % self.new_data_folder)
            metadata = json.load(open(os.path.join(self.new_data_folder, "metadata.json")))
            for md5_fname in metadata["diff"]["files"]:
                spec_md5 = md5_fname["md5sum"]
                fname = md5_fname["name"]
                compute_md5 = md5sum(os.path.join(self.new_data_folder, fname))
                if compute_md5 != spec_md5:
                    self.logger.error("md5 check failed for file '%s', it may be corrupted", fname)
                    e = DumperException(f"Bad md5sum for file '{fname}'")
                    self.register_status("failed", download={"err": repr(e)})
                    raise e
                else:
                    self.logger.debug(f"md5 check success for file '{fname}'")
        elif build_meta["type"] == "full":
            # if type=fs, check if archive must be uncompressed
            # TODO
            # repo_name = list(build_meta["metadata"]["repository"].keys())[0]
            if build_meta["metadata"]["repository"]["type"] == "fs":
                uncompressall(self.new_data_folder)
[docs] async def info(self, version="latest"): """ Display version information (release note, etc...) for given version { "info": ... "release_note": ... } """ file_url = urljoin(self.base_url, "%s.json" % version) result = {} build_meta = self.load_remote_json(file_url) if not build_meta: raise DumperException("Can't find version '%s'" % version) result["info"] = build_meta if build_meta.get("changes"): result["release_note"] = {} for filtyp in build_meta["changes"]: relnote_url = build_meta["changes"][filtyp]["url"] res = self.client.get(relnote_url) if res.status_code == 200: if filtyp == "json": result["release_note"][filtyp] = res.json() else: result["release_note"][filtyp] = res.text else: raise DumperException(f"Error while downloading release note '{version} ({res})': {res.text}") return result
    async def versions(self):
        """
        Display all available versions.

        Example:
        [{
            'build_version': '20171003',
            'url': 'https://biothings-releases.s3.amazonaws.com:443/mygene.info/20171003.json',
            'release_date': '2017-10-06T11:58:39.749357',
            'require_version': None,
            'target_version': '20171003',
            'type': 'full'
        }, ...]
        """
        avail_versions = self.load_remote_json(self.__class__.VERSION_URL)
        if not avail_versions:
            raise DumperException("Can't find any versions available...")
        assert avail_versions["format"] == "1.0", "versions.json format has changed: %s" % avail_versions["format"]
        return avail_versions["versions"]