Source code for biothings.hub.databuild.mapper

import copy

from biothings.utils.dataload import alwayslist


class BaseMapper(object):
    """
    Minimal mapper interface used to convert documents.

    A mapper is identified by its 'name'. Per the original notes, a mapper
    is picked when that name matches the "mapper" declared in a source's
    metadata; the conversion entry point defined by this class is
    process(docs).

    Both load() and process() are placeholders here and raise
    NotImplementedError until a sub-class overrides them.
    """

    def __init__(self, name=None, *args, **kwargs):
        """
        Store the mapper's 'name'.

        Additional positional/keyword arguments are accepted but are not
        used by this base class.
        """
        self.name = name

    def load(self):
        """
        Do whatever is required to fill the mapper with mapping data.

        Intended to be callable several times, with only the first call
        actually loading data (sub-classes are responsible for that).
        """
        raise NotImplementedError("sub-class and implement me")

    def process(self, docs):
        """
        Convert the given docs into other docs.
        """
        raise NotImplementedError("sub-class and implement me")
class IDBaseMapper(BaseMapper):
    """
    Provide mapping between different sources.

    The lookup table is kept in 'self.map'. It starts as None and is filled
    lazily: every accessor calls load() first whenever need_load() reports
    that the table is still None. load() itself is not defined in this
    class, so a sub-class is expected to implement it and set 'self.map'.
    """

    def __init__(self, name=None, convert_func=None, *args, **kwargs):
        """
        'name' may match a "mapper" metadata field (see uploaders). If None,
        mapper will be applied to any document from a resource without
        "mapper" argument.

        'convert_func', when given, is a callable applied to an _id before
        it is looked up in the map by translate().

        Extra positional/keyword arguments are accepted but not used.
        """
        super(IDBaseMapper, self).__init__(name=name)
        self.map = None
        self.convert_func = convert_func

    def translate(self, _id, transparent=False):
        """
        Return _id translated through the mapper.

        If there is no match, return the original _id when 'transparent'
        is set, otherwise None.
        """
        if self.need_load():
            self.load()
        # Explicit conditional rather than `transparent and _id or None`:
        # the and/or form turned falsy ids (0, "") into None even when
        # 'transparent' was requested.
        default = _id if transparent else None
        key = self.convert_func(_id) if self.convert_func else _id
        return self.map.get(key, default)

    def __contains__(self, _id):
        """Tell whether _id is a key of the (lazily loaded) map."""
        if self.need_load():
            self.load()
        # NOTE(review): unlike translate(), 'convert_func' is not applied
        # to _id here -- confirm whether that asymmetry is intended.
        return _id in self.map

    def __len__(self):
        """Return the number of entries in the (lazily loaded) map."""
        if self.need_load():
            self.load()
        return len(self.map)

    def process(self, docs, key_to_convert="_id", transparent=True):
        """
        Process 'key_to_convert' document key using mapping.

        If 'transparent' (the default) and no match, the original key value
        is kept. If not 'transparent' and no match, the document is
        discarded. In every case the value written back is converted with
        str().

        When the translation gives several ids, one document is yielded per
        id: the first one is the incoming document itself (updated in
        place), the following ones are shallow copies of it, so that
        consumers buffering the output do not end up with several
        references to one single dict carrying only the last id.

        Warning: key to be translated must not be None (it's considered
        a non-match)
        """
        for doc in docs:
            new_id = self.translate(doc.get(key_to_convert), transparent)
            if new_id is None and not transparent:
                continue
            target = doc
            for position, one_id in enumerate(alwayslist(new_id)):
                if position:
                    # extra ids get their own document object
                    target = copy.copy(doc)
                target[key_to_convert] = str(one_id)
                yield target

    def need_load(self):
        """Return True while the map has not been loaded (is still None)."""
        return self.map is None
class TransparentMapper(BaseMapper):
    """
    Mapper that needs no mapping data and hands documents back untouched.
    """

    def load(self, *args, **kwargs):
        """Nothing to load: accept any arguments and do nothing."""
        return None

    def process(self, docs, *args, **kwargs):
        """Return 'docs' exactly as received; extra arguments are ignored."""
        return docs