Source code for biothings.hub.autoupdate.uploader

import asyncio
import datetime
import json
import os
from functools import partial
from typing import Optional

from elasticsearch import Elasticsearch, NotFoundError, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

import biothings.hub.dataload.uploader as uploader
from biothings import config as btconfig
from biothings.utils.backend import DocESBackend
from biothings.utils.common import get_random_string
from biothings.utils.es import IndexerException


class BiothingsUploader(uploader.BaseSourceUploader):
    name = None

    # Specify the backend this uploader should work with. Must be defined before
    # instantiation (can be an instance or a partial() returning an instance)
    TARGET_BACKEND = None

    # Specify the syncer function this uploader will use to apply diff
    # (can be an instance or a partial() returning an instance)
    SYNCER_FUNC = None

    # should we delete the index before restoring the snapshot if the index already exists?
    AUTO_PURGE_INDEX = False

    def __init__(self, *args, **kwargs):
        super(BiothingsUploader, self).__init__(*args, **kwargs)
        self._target_backend = None
        self._syncer_func = None

    @property
    def target_backend(self):
        if not self._target_backend:
            if type(self.__class__.TARGET_BACKEND) == partial:
                self._target_backend = self.__class__.TARGET_BACKEND()
            else:
                self._target_backend = self.__class__.TARGET_BACKEND
            assert (
                type(self._target_backend) == DocESBackend
            ), "Only ElasticSearch backend is supported (got %s)" % type(self._target_backend)
        return self._target_backend

    @property
    def syncer_func(self):
        if not self._syncer_func:
            self._syncer_func = self.__class__.SYNCER_FUNC
        return self._syncer_func
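    # A minimal, hypothetical sketch of how a concrete subclass might wire up the
    # class attributes above. None of the names below (MyBiothingsUploader,
    # my_es_backend_factory, my_syncer) exist in this module; they are assumptions
    # for illustration only:
    #
    #     class MyBiothingsUploader(BiothingsUploader):
    #         name = "mysource"
    #         # partial() returning a DocESBackend instance, resolved lazily by
    #         # the target_backend property
    #         TARGET_BACKEND = partial(my_es_backend_factory)
    #         # coroutine awaited by apply_diff() to apply an incremental release
    #         SYNCER_FUNC = partial(my_syncer)
    #         # delete an existing index before restoring a snapshot over it
    #         AUTO_PURGE_INDEX = True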
    async def load(self, *args, **kwargs):
        return await super().load(steps=["data"], *args, **kwargs)
    async def update_data(self, batch_size, job_manager, **kwargs):
        """
        Look in data_folder and either restore a snapshot to ES
        or apply a diff to the current ES index
        """
        # determine whether this is a snapshot/full or a diff/incremental release:
        # we should have a JSON metadata file matching the release
        self.prepare_src_dump()  # load info from src_dump
        release = self.src_doc.get("download", {}).get("release")
        assert release, "Can't find release information in src_dump document"
        build_meta = json.load(open(os.path.join(self.data_folder, "%s.json" % release)))
        if build_meta["type"] == "full":
            res = await self.restore_snapshot(build_meta, job_manager=job_manager, **kwargs)
        elif build_meta["type"] == "incremental":
            res = await self.apply_diff(build_meta, job_manager=job_manager, **kwargs)
        return res
    def get_snapshot_repository_config(self, build_meta):
        """Return a (name, config) tuple from build_meta, where name is the repo name
        and config is the repo config"""
        # repo_name, repo_settings = list(
        #     build_meta["metadata"]["repository"].items())[0]  # TODO
        repo_name = build_meta["metadata"]["repository"]["name"]
        repo_settings = build_meta["metadata"]["repository"]
        return (repo_name, repo_settings)
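    # Illustrative sketch of the build_meta shape this method expects. Only the keys
    # actually read in this module ("type", "metadata" -> "repository"/"name"/"settings",
    # "snapshot_name") come from the code; the concrete values are made up:
    #
    #     build_meta = {
    #         "type": "full",
    #         "metadata": {
    #             "snapshot_name": "mysource_20240101",
    #             "repository": {
    #                 "name": "mysource_repository",
    #                 "type": "s3",
    #                 "settings": {"bucket": "my-snapshot-bucket"},
    #             },
    #         },
    #     }
    #     repo_name, repo_settings = self.get_snapshot_repository_config(build_meta)
    #     # repo_name == "mysource_repository"
    #     # repo_settings is the whole "repository" dict, including its "settings"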
    def _get_es_client(self, es_host: str, auth: Optional[dict]):
        """
        Get Elasticsearch Client

        Used by self._get_repository, self._create_repository
        """
        es_conf = {
            "timeout": 120,
            "max_retries": 3,
            "retry_on_timeout": False,
        }
        if auth:
            # see https://git.io/JoAE4 on the BioThings.API Wiki
            if auth["type"] == "aws":
                auth_args = (
                    auth["properties"]["access_id"],
                    auth["properties"]["secret_key"],
                    auth["properties"]["region"],
                    "es",
                )
                es_conf["http_auth"] = AWS4Auth(*auth_args)
                es_conf["connection_class"] = RequestsHttpConnection
            elif auth["type"] == "http":
                auth_args = (
                    auth["properties"]["username"],
                    auth["properties"]["password"],
                )
                es_conf["http_auth"] = auth_args
            else:
                raise RuntimeError("Auth settings not recognized")
        es = Elasticsearch(es_host, **es_conf)
        return es

    def _get_repository(self, es_host: str, repo_name: str, auth: Optional[dict]):
        es = self._get_es_client(es_host, auth)
        try:
            repo = es.snapshot.get_repository(repository=repo_name)
        except NotFoundError:
            repo = None
        return repo

    def _create_repository(self, es_host: str, repo_name: str, repo_settings: dict, auth: Optional[dict]):
        """
        Create Elasticsearch Snapshot repository
        """
        es = self._get_es_client(es_host, auth)
        es.snapshot.create_repository(repository=repo_name, body=repo_settings)
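    # Illustrative sketch of the two auth dict shapes accepted by _get_es_client(),
    # derived from the "aws"/"http" branches above; the values are placeholders:
    #
    #     aws_auth = {
    #         "type": "aws",
    #         "properties": {
    #             "access_id": "AKIAEXAMPLE",
    #             "secret_key": "<secret>",
    #             "region": "us-west-2",
    #         },
    #     }
    #     http_auth = {
    #         "type": "http",
    #         "properties": {"username": "elastic", "password": "<password>"},
    #     }
    #     es = self._get_es_client("http://localhost:9200", aws_auth)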
    async def restore_snapshot(self, build_meta, job_manager, **kwargs):
        self.logger.debug("Restoring snapshot...")
        idxr = self.target_backend.target_esidxer
        es_host = idxr.es_host
        self.logger.debug("Got ES Host: %s", es_host)
        repo_name, repo_settings = self.get_snapshot_repository_config(build_meta)
        self.logger.debug("Got repo name: %s", repo_name)
        self.logger.debug("With settings: %s", repo_settings)
        # pull authentication settings from config
        auth = btconfig.STANDALONE_CONFIG.get(self.name, {}).get(
            "auth", btconfig.STANDALONE_CONFIG["_default"].get("auth")
        )
        if auth:
            self.logger.debug("Obtained Auth settings, using them.")
        else:
            self.logger.debug("No Auth settings found")
        # all restore repos should be r/o
        repo_settings["settings"]["readonly"] = True
        # populate additional settings
        additional_settings = btconfig.STANDALONE_CONFIG.get(self.name, {}).get(
            "repo_settings", btconfig.STANDALONE_CONFIG["_default"].get("repo_settings")
        )
        if additional_settings:
            self.logger.debug("Adding additional settings: %s", additional_settings)
            repo_settings["settings"].update(additional_settings)
        if "client" not in repo_settings["settings"]:
            self.logger.warning("\"client\" not set in repository settings. The 'default' " "client will be used.")
            self.logger.warning(
                "Make sure keys are in the Elasticsearch keystore. "
                "If you are trying to work with EOL versions of "
                "Elasticsearch, or if you intentionally enabled "
                'allow_insecure_settings, set "access_key", "secret_key",'
                " and potentially \"region\" in additional 'repo_settings'."
            )
        # first check if the snapshot repo exists
        self.logger.info("Getting current repository settings")
        existing_repo_settings = self._get_repository(es_host, repo_name, auth)
        if existing_repo_settings:
            if existing_repo_settings[repo_name] != repo_settings:  # TODO update comparison logic
                self.logger.info(
                    f"Repository '{repo_name}' was found but settings are different, "
                    "it may need to be created again"
                )
                self.logger.debug("Existing setting: %s", existing_repo_settings[repo_name])
                self.logger.debug("Required (new) setting: %s" % repo_settings)
            else:
                self.logger.info("Repo exists with correct settings")
        else:
            # ok, it doesn't exist, let's try to create it
            self.logger.info("Repo does not exist")
            try:
                self.logger.info("Creating repo...")
                self._create_repository(es_host, repo_name, repo_settings, auth)
            except Exception as e:
                self.logger.info("Creation failed: %s", e)
                if "url" in repo_settings["settings"]:
                    raise uploader.ResourceError(
                        "Could not create snapshot repository. Check elasticsearch.yml configuration "
                        + "file, you should have a line like this: "
                        + 'repositories.url.allowed_urls: "%s*" ' % repo_settings["settings"]["url"]
                        + "allowing snapshot to be restored from this URL. Error was: %s" % e
                    )
                else:
                    raise uploader.ResourceError("Could not create snapshot repository: %s" % e)

        # repository is now ready, let's trigger the restore
        snapshot_name = build_meta["metadata"]["snapshot_name"]
        # back up the indexer's original number of replicas
        original_number_of_replicas = idxr.get_internal_number_of_replicas() or 0
        alias_name = idxr.canonical_index_name
        use_no_downtime_method = kwargs.get("use_no_downtime_method", True)
        append_ts = kwargs.get("append_ts", True)
        if use_no_downtime_method:
            base_index_name = snapshot_name
            if append_ts:
                ts = datetime.datetime.now().strftime("%Y%m%d%H%M")
                base_index_name = f"{snapshot_name}_{ts}"
            if len(base_index_name) >= 255:
                raise RuntimeError("Deterministic part of index name already too long")
            index_name = base_index_name
            append_random_str = False
            while True:
                if append_random_str:
                    index_name += "_" + get_random_string()
                index_name = index_name[:255]  # elasticsearch restriction
                if not idxr.exists_index(index=index_name):
                    break
        else:
            index_name = alias_name

        pinfo = self.get_pinfo()
        pinfo["step"] = "restore"
        pinfo["description"] = snapshot_name

        def get_status_info():
            try:
                res = idxr.get_restore_status(index_name)
                return res
            except Exception as e:
                # something went wrong, report as failure
                return {"status": "FAILED %s" % e}

        def done_callback(f, step: str):
            try:
                self.logger.info("%s launched: %s" % (step, f.result()))
            except Exception as e:
                self.logger.error("Error while launching %s: %s" % (step, e))
                raise e

        self.logger.info(
            "Restoring snapshot '%s' to index '%s' on host '%s'" % (snapshot_name, index_name, idxr.es_host)
        )
        # ESIndexer.restore is synchronous but should return relatively quickly
        job = await job_manager.defer_to_thread(
            pinfo, partial(idxr.restore, repo_name, snapshot_name, index_name, purge=self.__class__.AUTO_PURGE_INDEX)
        )
        job.add_done_callback(partial(done_callback, step="restore"))
        await job

        def update_alias_and_delete_old_indices():
            # Find indices whose names start with the alias name, sorted by creation date in ascending order
            old_indices = []
            try:
                old_indices.extend(
                    idxr.get_indice_names_by_settings(index=alias_name + "*", sort_by_creation_date=True, reverse=False)
                )
            except Exception:
                pass
            self.logger.debug("Alias '%s' points to '%s'" % (alias_name, old_indices))
            if index_name in old_indices:
                self.logger.warning("new index name in old alias, something is not right")
                self.logger.warning("continuing alias swap despite potential problem")
                old_indices.remove(index_name)
            try:
                idxr.update_alias(alias_name, index_name)
                self.logger.info(f"Alias '{alias_name}' updated to " f"associate with index '{index_name}'")
            except IndexerException as e:
                self.logger.warning(f"Alias index swap ran into a problem {e}")
                self.logger.warning(f"Deleting new index '{index_name}'")
                idxr.delete_index(index_name)
                raise
            # have ESIndexer look at the correct index after snapshot restore
            idxr.check_index()
            # after a successful swap, delete old indices (only issue messages on errors).
            # we only keep the n most recent indices, depending on config.RELEASE_KEEP_N_RECENT_INDICES:
            #   n < 0: keep all
            #   n == 0: only keep the newly created index
            #   n > 0: only keep at most the n latest indices
            number_indexes_to_keep = btconfig.RELEASE_KEEP_N_RECENT_INDICES
            if number_indexes_to_keep > 0:
                old_indices = old_indices[:-number_indexes_to_keep]
            elif number_indexes_to_keep < 0:
                old_indices = []
            try:
                for rm_idx_name in old_indices:
                    idxr.delete_index(rm_idx_name)
                    self.logger.info("Deleted old index '%s'" % rm_idx_name)
            except Exception:  # nosec
                # just inform the user that deletion failed; not that harmful
                self.logger.error("Failed to delete old indices, try deleting " f"{old_indices} manually")
            # restore the indexer's replica count to its original value
            idxr.set_internal_number_of_replicas(original_number_of_replicas)

        while True:
            status_info = get_status_info()
            status = status_info["status"]
            self.logger.info("Recovery status for index '%s': %s" % (index_name, status_info))
            if status in ["INIT", "IN_PROGRESS"]:
                await asyncio.sleep(getattr(btconfig, "MONITOR_SNAPSHOT_DELAY", 60))
            else:
                if status == "DONE":
                    self.logger.info(
                        "Snapshot '%s' successfully restored to index '%s' (host: '%s')"
                        % (snapshot_name, index_name, idxr.es_host),
                        extra={"notify": True},
                    )
                    if use_no_downtime_method:
                        job = await job_manager.defer_to_thread(pinfo={}, func=update_alias_and_delete_old_indices)
                        job.add_done_callback(partial(done_callback, step="alias"))
                        await job
                else:
                    e = uploader.ResourceError(
                        "Failed to restore snapshot '%s' on index '%s', status: %s"
                        % (snapshot_name, idxr._index, status)
                    )
                    self.logger.error(e)
                    raise e
                break

        # return current number of docs in index
        return self.target_backend.count()
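    # Illustrative sketch of the retention slicing performed in
    # update_alias_and_delete_old_indices() above, assuming old_indices is sorted
    # oldest first (the index names are hypothetical):
    #
    #     old_indices = ["mysource_v1", "mysource_v2", "mysource_v3"]
    #     n = btconfig.RELEASE_KEEP_N_RECENT_INDICES   # e.g. n = 2
    #     if n > 0:
    #         old_indices = old_indices[:-n]   # -> ["mysource_v1"]: keep the 2 latest
    #     elif n < 0:
    #         old_indices = []                 # keep everything
    #     # n == 0: old_indices is unchanged, so every old index is deleted and only
    #     # the freshly restored index remains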
    async def apply_diff(self, build_meta, job_manager, **kwargs):
        self.logger.info("Applying incremental update from diff folder: %s" % self.data_folder)
        meta = json.load(open(os.path.join(self.data_folder, "metadata.json")))
        # old: the index we want to update
        old = (
            self.target_backend.target_esidxer.es_host,
            meta["old"]["backend"],
            # TODO
            # target name can be release index name,
            # maybe should refer to old backend name
            # ----------------------------------------
            # self.target_backend.target_name,
            # ----------------------------------------
            self.target_backend.target_esidxer._doc_type,
        )
        # new: the index's data we will reach once updated (just informative)
        new = (
            self.target_backend.target_esidxer.es_host,
            meta["new"]["backend"],
            self.target_backend.target_esidxer._doc_type,
        )
        await self.syncer_func(old_db_col_names=old, new_db_col_names=new, diff_folder=self.data_folder)
        # return current number of docs in index (even after a diff update)
        return self.target_backend.count()
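    # Illustrative sketch of the metadata.json fields read above; only the
    # "old"/"new" -> "backend" keys come from the code, the values are placeholders:
    #
    #     meta = {
    #         "old": {"backend": "mysource_20231201"},   # index currently being served
    #         "new": {"backend": "mysource_20240101"},   # state reached after the diff
    #     }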
    def clean_archived_collections(self):
        pass