Source code for biothings.utils.parallel_mp

from glob import glob
from multiprocessing import Pool
from os import cpu_count
from os.path import abspath, exists, isdir, join
from traceback import format_exc

from biothings.utils.backend import DocBackendOptions
from biothings.utils.common import iter_n

# How many cpus on this machine?
DEFAULT_THREADS = cpu_count()


# simple aggregation functions
def agg_by_sum(prev, curr):
    return prev + curr


def agg_by_append(prev, curr):
    if isinstance(curr, list):
        return prev + curr
    return prev + [curr]
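
# Example (illustrative sketch, not part of the original module): these aggregators fold
# per-document results into a single value as chunks complete.
#
#     agg_by_sum(0, 5)             # -> 5           (running total)
#     agg_by_append([], "doc1")    # -> ["doc1"]    (scalars are appended)
#     agg_by_append(["a"], ["b"])  # -> ["a", "b"]  (lists are extended)
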
# avoid the global variable and the callback function this way
class ParallelResult(object):
    def __init__(self, agg_function, agg_function_init):
        self.res = agg_function_init
        self.agg_function = agg_function

    def aggregate(self, curr):
        self.res = self.agg_function(self.res, curr)
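
# Example (illustrative, not part of the original module): ParallelResult.aggregate is used as
# the apply_async callback, so chunk results are folded into one value without a module-level
# global.  A minimal sketch, assuming agg_by_append as the aggregator:
#
#     res = ParallelResult(agg_by_append, [])
#     res.aggregate(1)       # res.res == [1]
#     res.aggregate([2, 3])  # res.res == [1, 2, 3]
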
# Handles errors in async apply
class ErrorHandler(object):
    def __init__(self, errpath, chunk_num):
        if errpath:
            self.error_file_path = errpath + "_{}".format(chunk_num)
        else:
            self.error_file_path = None

    def handle(self, exception):
        if self.error_file_path:
            f = open(self.error_file_path, "w")
            f.write(format_exc())
            f.write("\n{}\n".format(str(exception)))
            f.close()
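
# Example (illustrative, not part of the original module): ErrorHandler.handle is wired in as
# the error_callback of apply_async below, so an exception raised while processing chunk 3 is
# recorded in "<error_path>_3" (assuming error_path was given).
#
#     handler = ErrorHandler("/tmp/errors", chunk_num=3)   # hypothetical path
#     # p.apply_async(..., error_callback=handler.handle)
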
def run_parallel_on_iterable(
    fun,
    iterable,
    backend_options=None,
    agg_function=agg_by_append,
    agg_function_init=None,
    chunk_size=1000000,
    num_workers=DEFAULT_THREADS,
    outpath=None,
    mget_chunk_size=10000,
    ignore_None=True,
    error_path=None,
    **query_kwargs,
):
    """This function will run a user function on all documents in a backend database in parallel
    using multiprocessing.Pool.  The overview of the process looks like this:

    Chunk (into chunks of size "chunk_size") the items in iterable, and run the following on each
    chunk using a multiprocessing.Pool object with "num_workers" processes:

        For each document in the list of ids in this chunk (documents retrieved in chunks of
        "mget_chunk_size"): run function "fun" with parameters (doc, chunk_num, f <file handle,
        only passed if "outpath" is not None>), and aggregate the result with the current results
        using function "agg_function".

    :param fun: The function to run on all documents.  If outpath is NOT specified, fun must accept
                two parameters: (doc, chunk_num), where doc is the backend document and chunk_num is
                essentially a unique process id.  If outpath IS specified, an additional open file
                handle (correctly tagged with the current chunk's chunk_num) will also be passed to
                fun, and thus it must accept three parameters: (doc, chunk_num, f)
    :param iterable: Iterable of ids.
    :param backend_options: An instance of biothings.utils.backend.DocBackendOptions.  This contains
                            the options necessary to instantiate the correct backend class (ES,
                            mongo, etc).
    :param agg_function: This function aggregates the return value of each run of function fun.  It
                         should take 2 parameters: (prev, curr), where prev is the previous
                         aggregated result and curr is the output of the current function run.  It
                         should return some value that represents the aggregation of the previous
                         aggregated results with the output of the current function.
    :param agg_function_init: Initialization value for the aggregated result.
    :param chunk_size: Length of the ids list sent to each chunk.
    :param num_workers: Number of processes that consume chunks in parallel.
                        https://docs.python.org/2/library/multiprocessing.html#multiprocessing.pool.multiprocessing.Pool
    :param outpath: Base path for output files.  Because function fun can be run many times in
                    parallel, each chunk is sequentially numbered, and the output file name for any
                    chunk is outpath_{chunk_num}, e.g., if outpath is "out", all output files will
                    be of the form: /path/to/cwd/out_1, /path/to/cwd/out_2, etc.
    :param error_path: Base path for error files.  If included, exceptions inside each chunk thread
                       will be printed to these files.
    :param mget_chunk_size: The size of each mget chunk inside each chunk thread.  In each thread,
                            the ids list is consumed by passing chunks to a mget_by_ids function.
                            This parameter controls the size of each mget.
    :param ignore_None: If set, then falsy values (0, [], None, etc) will not be aggregated in the
                        aggregation step.  Default True.

    All other parameters are fed to the backend query.
    """
    agg_function_init = agg_function_init or []
    ret = ParallelResult(agg_function, agg_function_init)

    # assert backend_options is correct
    if not backend_options or not isinstance(backend_options, DocBackendOptions):
        raise Exception("backend_options must be a biothings.utils.backend.DocBackendOptions instance")

    # build backend from options
    backend = backend_options.cls.create_from_options(backend_options)

    # normalize paths for out/error files
    if outpath:
        outpath = abspath(outpath)
    if error_path:
        error_path = abspath(error_path)

    with Pool(processes=num_workers) as p:
        for chunk_num, chunk in enumerate(iter_n(iterable, chunk_size)):
            # apply function to chunk
            p.apply_async(
                _run_one_chunk,
                args=(
                    chunk_num,
                    chunk,
                    fun,
                    backend_options,
                    agg_function,
                    agg_function_init,
                    outpath,
                    mget_chunk_size,
                    ignore_None,
                ),
                callback=ret.aggregate,
                error_callback=ErrorHandler(error_path, chunk_num).handle,
            )
        # close pool and wait for completion of all workers
        p.close()
        p.join()
    return ret.res
def run_parallel_on_ids_file(
    fun,
    ids_file,
    backend_options=None,
    agg_function=agg_by_append,
    agg_function_init=[],
    chunk_size=1000000,
    num_workers=DEFAULT_THREADS,
    outpath=None,
    mget_chunk_size=10000,
    ignore_None=True,
    error_path=None,
    **query_kwargs,
):
    """Implementation of run_parallel_on_iterable, where the iterable comes from the lines of a file.

    All parameters are fed to run_parallel_on_iterable, except:

    :param ids_file: Path to file with ids, one per line."""

    def _file_iterator(fh):
        for line in fh:
            yield line.strip("\n")

    ids_file = abspath(ids_file)
    with open(ids_file, "r") as ids_handle:
        return run_parallel_on_iterable(
            fun=fun,
            iterable=_file_iterator(ids_handle),
            backend_options=backend_options,
            agg_function=agg_function,
            agg_function_init=agg_function_init,
            chunk_size=chunk_size,
            num_workers=num_workers,
            outpath=outpath,
            mget_chunk_size=mget_chunk_size,
            ignore_None=ignore_None,
            error_path=error_path,
            **query_kwargs,
        )
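
# Example usage (hypothetical sketch): same contract as run_parallel_on_iterable, but the ids
# come from a file with one id per line.  "write_id" and the paths are made-up names; because
# outpath is given, the user function receives a per-chunk file handle as its third argument
# and output lands in files named "<outpath>_<chunk_num>".
#
#     def write_id(doc, chunk_num, f):
#         f.write(doc["_id"] + "\n")
#
#     run_parallel_on_ids_file(
#         fun=write_id,
#         ids_file="/path/to/ids.txt",
#         backend_options=backend_options,
#         outpath="/path/to/out",
#     )
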
# TODO: allow mget args to be passed
def run_parallel_on_query(
    fun,
    backend_options=None,
    query=None,
    agg_function=agg_by_append,
    agg_function_init=[],
    chunk_size=1000000,
    num_workers=DEFAULT_THREADS,
    outpath=None,
    mget_chunk_size=10000,
    ignore_None=True,
    error_path=None,
    full_doc=False,
    **query_kwargs,
):
    """Implementation of run_parallel_on_iterable, where the ids iterable comes from the result of
    a query on the specified backend.

    All parameters are fed to run_parallel_on_iterable, except:

    :param query: ids come from the results of this query run on the backend, default: "match_all"
    :param full_doc: If True, a list of documents is passed to each subprocess, rather than ids
                     that are looked up later.  Should be faster?  Unknown how this works with very
                     large query sets...
    """

    def _query_iterator(q):
        for doc in q:
            if full_doc:
                yield doc
            else:
                yield doc["_id"]

    # assert backend_options is correct
    if not backend_options or not isinstance(backend_options, DocBackendOptions):
        raise Exception("backend_options must be a biothings.utils.backend.DocBackendOptions instance")

    # build backend from options
    backend = backend_options.cls.create_from_options(backend_options)

    return run_parallel_on_iterable(
        fun=fun,
        iterable=_query_iterator(backend.query(query, _source=full_doc, only_source=False, **query_kwargs)),
        backend_options=backend_options,
        agg_function=agg_function,
        agg_function_init=agg_function_init,
        chunk_size=chunk_size,
        outpath=outpath,
        num_workers=num_workers,
        mget_chunk_size=mget_chunk_size,
        ignore_None=ignore_None,
        error_path=error_path,
        **query_kwargs,
    )
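
# Example usage (hypothetical sketch): ids (or full documents, with full_doc=True) come from a
# backend query instead of an explicit iterable.  "my_fun" and the query value are placeholders;
# the query syntax is backend-specific and defaults to a match_all query.
#
#     results = run_parallel_on_query(
#         fun=my_fun,                      # hypothetical (doc, chunk_num) function
#         backend_options=backend_options,
#         query={"match_all": {}},         # placeholder query
#         full_doc=True,                   # ship whole documents to workers, skipping the mget step
#     )
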
def run_parallel_on_ids_dir(
    fun,
    ids_dir,
    backend_options=None,
    agg_function=agg_by_append,
    agg_function_init=[],
    outpath=None,
    num_workers=DEFAULT_THREADS,
    mget_chunk_size=10000,
    ignore_None=True,
    error_path=None,
    **query_kwargs,
):
    """This function will run function fun on chunks defined by the files in ids_dir.

    All parameters are fed to run_parallel_on_iterable, except:

    :param ids_dir: Directory containing only files with ids, one per line.  The number of files
                    defines the number of chunks.
    """
    ids_dir = abspath(ids_dir)
    assert isdir(ids_dir)
    return run_parallel_on_iterable(
        fun=fun,
        iterable=glob(join(ids_dir, "*")),
        backend_options=backend_options,
        agg_function=agg_function,
        agg_function_init=agg_function_init,
        chunk_size=1,
        outpath=outpath,
        num_workers=num_workers,
        mget_chunk_size=mget_chunk_size,
        ignore_None=ignore_None,
        error_path=error_path,
        **query_kwargs,
    )
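
# Example usage (hypothetical sketch): each file in ids_dir becomes exactly one chunk
# (chunk_size=1; each item is a file path that _run_one_chunk expands into ids).
#
#     results = run_parallel_on_ids_dir(
#         fun=my_fun,                      # hypothetical (doc, chunk_num) function
#         ids_dir="/path/to/id_files/",    # one ids file per desired chunk
#         backend_options=backend_options,
#     )
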
def _run_one_chunk(
    chunk_num, chunk, fun, backend_options, agg_function, agg_function_init, outpath, mget_chunk_size, ignore_None
):
    # iterator if chunk is a file path
    def _file_path_iterator(fp):
        with open(fp, "r") as fh:
            for line in fh:
                yield line.strip("\n")

    # recreate backend object
    backend = backend_options.cls.create_from_options(backend_options)

    # check to see if chunk is a list of ids, a list of objects, or a tuple with a path, and make
    # the correct iterator for the type
    if isinstance(chunk, (list, tuple)):
        if isinstance(chunk[0], str) and exists(chunk[0]):
            iterator = backend.mget_from_ids(_file_path_iterator(chunk[0]), step=mget_chunk_size, only_source=False)
        elif isinstance(chunk[0], str):
            iterator = backend.mget_from_ids(chunk, step=mget_chunk_size, only_source=False)
        elif isinstance(chunk[0], dict):
            iterator = iter(chunk)
        else:
            raise Exception(
                "lists and tuples can contain either strings (containing ids or a file path), "
                "or dicts (containing full documents)"
            )
    else:
        raise Exception(
            "chunk can only be a list/tuple of ids, a list/tuple of objects, or a list/tuple with a "
            "file path to a list of ids."
        )

    # initialize return for this chunk
    ret = ParallelResult(agg_function, agg_function_init)

    # make file handle for this chunk
    if outpath:
        _file = open(outpath + "_{}".format(chunk_num), "w")

    # iterate through this chunk and run function on every doc
    for doc in iterator:
        # actually call the function
        if outpath:
            r = fun(doc, chunk_num, _file)
        else:
            r = fun(doc, chunk_num)
        # aggregate the results
        if r or not ignore_None:
            ret.aggregate(r)

    # close handle
    if outpath and _file:
        _file.close()
    return ret.res