Source code for worms.database

"""TODO: Summary
"""
import os
import json
import random
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import itertools as it
import logging
from logging import info, warning, error
from random import shuffle
from worms.util import hash_str_to_int as myhash
import numpy as np
from tqdm import tqdm

from worms import util
from worms.bblock import BBlock, _BBlock

logging.basicConfig(level=logging.INFO)
import _pickle as pickle

try:
    # god, I'm so tired of this crap....
    from pyrosetta import pose_from_file
    from pyrosetta.rosetta.core.scoring.dssp import Dssp
    HAVE_PYROSETTA = True
except ImportError:
    HAVE_PYROSETTA = False


def flatten_path(pdbfile):
    if isinstance(pdbfile, bytes):
        pdbfile = str(pdbfile, 'utf-8')
    return pdbfile.replace(os.sep, '__') + '.pickle'
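
As an illustration (the path below is hypothetical and assumes a POSIX separator):

# flatten_path('/data/pdbs/foo.pdb') == '__data__pdbs__foo.pdb.pickle'
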
class SpliceDB:
    """Stores valid NC splices for bblock pairs"""

    def __init__(self, cachedir=None):
        if cachedir is None:
            if 'HOME' in os.environ:
                cachedir = os.environ['HOME'] + os.sep + '.worms/cache'
            else:
                cachedir = '.worms/cache'
        self.cachedir = os.path.join(cachedir, 'splices')
        self._cache = dict()
        self._dirty = set()

    def cachepath(self, params, pdbkey):
        # stock hash ok for tuples of numbers (?)
        prm = '%016x' % abs(hash(params))
        key = '%016x.pickle' % pdbkey
        return os.path.join(self.cachedir, prm, key)

    def partial(self, params, pdbkey):
        assert isinstance(pdbkey, int)
        if (params, pdbkey) not in self._cache:
            cachefile = self.cachepath(params, pdbkey)
            if os.path.exists(cachefile):
                with open(cachefile, 'rb') as f:
                    self._cache[params, pdbkey] = pickle.load(f)
            else:
                self._cache[params, pdbkey] = dict()
        return self._cache[params, pdbkey]

    def add(self, params, pdbkey0, pdbkey1, val):
        # note: partial() must have been called for (params, pdbkey0) so the
        # inner dict already exists
        assert isinstance(pdbkey0, int)
        assert isinstance(pdbkey1, int)
        self._dirty.add((params, pdbkey0))
        self._cache[(params, pdbkey0)][pdbkey1] = val

    def sync_to_disk(self):
        for i in range(10):
            for params, pdbkey in list(self._dirty):
                cachefile = self.cachepath(params, pdbkey)
                if os.path.exists(cachefile + '.lock'):
                    continue
                if not os.path.exists(os.path.dirname(cachefile)):
                    os.makedirs(os.path.dirname(cachefile))
                with open(cachefile + '.lock', 'w'):
                    with open(cachefile, 'wb') as out:
                        data = self._cache[params, pdbkey]
                        pickle.dump(data, out)
                    os.remove(cachefile + '.lock')
                self._dirty.remove((params, pdbkey))
        if len(self._dirty):
            print('warning: some caches unsaved', len(self._dirty))

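
A minimal usage sketch of SpliceDB (the params tuple, keys, and value below are made-up placeholders): partial() loads or creates the per-bblock dict, add() records a splice and marks the entry dirty, and sync_to_disk() writes dirty entries back, skipping files that another process holds a .lock on.

# illustrative only -- params, keys, and the stored value are placeholders
splices = SpliceDB()                      # cache dir defaults to ~/.worms/cache/splices
params = ('splice_params', 9, 0.5)        # any hashable tuple identifying the settings
key0, key1 = 0x1234, 0x5678               # integer pdb keys (hash_str_to_int of pdb paths)
cached = splices.partial(params, key0)    # dict {pdbkey1: val}, read from disk if cached
if key1 not in cached:
    splices.add(params, key0, key1, 'splice-result-placeholder')
splices.sync_to_disk()                    # persist dirty entries under cachedir/splices
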
class BBlockDB:
    """stores Poses and BBlocks in a disk cache"""

    def __init__(
            self,
            cachedir=None,
            bakerdb_files=[],
            load_poses=False,
            nprocs=1,
            lazy=True,
            read_new_pdbs=False,
            verbosity=0,
    ):
        """Build a BBlockDB from worms json database files.

        Args:
            cachedir (str, optional): root of the disk cache; defaults to
                $HOME/.worms/cache, or ./.worms/cache if $HOME is unset
            bakerdb_files (list, optional): json files listing database entries
            load_poses (bool, optional): also keep Rosetta Poses in memory
            nprocs (int, optional): threads used when reading new pdbs
            lazy (bool, optional): if False, load all entries immediately
            read_new_pdbs (bool, optional): allow building cache entries from
                raw pdb files (requires pyrosetta)
            verbosity (int, optional): if > 1, show a progress bar while loading
        """
        if cachedir is None:
            if 'HOME' in os.environ:
                cachedir = os.environ['HOME'] + os.sep + '.worms/cache'
            else:
                cachedir = '.worms/cache'
        self.cachedir = str(cachedir)
        self.load_poses = load_poses
        os.makedirs(self.cachedir + '/poses', exist_ok=True)
        os.makedirs(self.cachedir + '/bblock', exist_ok=True)
        self._bblock_cache, self._poses_cache = dict(), dict()
        self.nprocs = nprocs
        self.lazy = lazy
        self.read_new_pdbs = read_new_pdbs
        self.verbosity = verbosity
        self._alldb = []
        self._holding_lock = False
        for dbfile in bakerdb_files:
            with open(dbfile) as f:
                self._alldb.extend(json.load(f))
        for entry in self._alldb:
            if 'name' not in entry:
                entry['name'] = ''
            entry['file'] = entry['file'].replace(
                '__DATADIR__',
                os.path.relpath(os.path.dirname(__file__) + '/data')
            )
        self.dictdb = {e['file']: e for e in self._alldb}
        self.key_to_pdbfile = {
            myhash(e['file']): e['file']
            for e in self._alldb
        }
        if len(self._alldb) != len(self.dictdb):
            warning('!' * 100)
            warning(
                '!' * 23 + ' DIRE WARNING: %6i duplicate pdb files in database ' %
                (len(self._alldb) - len(self.dictdb)) + '!' * 23
            )
            warning('!' * 100)
        info('loading %i db entries' % len(self._alldb))
        self.n_new_entries = 0
        self.n_missing_entries = len(self._alldb)
        if not self.lazy:
            self.n_new_entries, self.n_missing_entries = self.load_from_pdbs()
            if self._holding_lock:
                self.unlock_cachedir()
            if nprocs != 1:
                # reload because processpool cache entries not serialized back
                self.nprocs = 1
                self.load_from_pdbs()
        for i, k in enumerate(sorted(self.dictdb)):
            self._alldb[i] = self.dictdb[k]

    def lock_cachedir(self):
        assert not os.path.exists(self.cachedir + '/lock'), (
            "database is locked! if you're sure no other jobs are editing it, remove "
            + self.cachedir + "/lock"
        )
        open(self.cachedir + '/lock', 'w').close()
        assert os.path.exists(self.cachedir + '/lock')
        self._holding_lock = True

    def unlock_cachedir(self):
        os.remove(self.cachedir + '/lock')
        self._holding_lock = False

    def islocked_cachedir(self):
        return os.path.exists(self.cachedir + '/lock')

    def check_lock_cachedir(self):
        if not self._holding_lock:
            self.lock_cachedir()

    def __getitem__(self, i):
        if isinstance(i, str):
            return self._bblock_cache[i]
        else:
            # dict views are not subscriptable in python 3
            return list(self._bblock_cache.values())[i]

    def __len__(self):
        return len(self._bblock_cache)

    def pose(self, pdbfile):
        """load pose from _poses_cache, read from file if not in memory"""
        if isinstance(pdbfile, bytes):
            pdbfile = str(pdbfile, 'utf-8')
        if pdbfile not in self._poses_cache:
            if not self.load_cached_pose_into_memory(pdbfile):
                self._poses_cache[pdbfile] = pose_from_file(pdbfile)
        return self._poses_cache[pdbfile]

    def bblock(self, pdbkey):
        if isinstance(pdbkey, (str, bytes)):
            pdbkey = myhash(pdbkey)
        if isinstance(pdbkey, int):
            if pdbkey not in self._bblock_cache:
                if not self.load_cached_bblock_into_memory(pdbkey):
                    pdbfile = self.key_to_pdbfile[pdbkey]
                    raise ValueError('no bblock data for key', pdbkey, pdbfile)
            return self._bblock_cache[pdbkey]
        elif isinstance(pdbkey, list):
            return [self.bblock(f) for f in pdbkey]
        else:
            raise ValueError('bad pdbkey' + str(type(pdbkey)))

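    # Illustrative key handling for bblock() (the paths below are hypothetical):
    #   db.bblock('some/dir/foo.pdb')          # str/bytes are hashed with hash_str_to_int
    #   db.bblock(myhash('some/dir/foo.pdb'))  # equivalent integer key
    #   db.bblock(['a.pdb', 'b.pdb'])          # a list returns a list of _BBlocks
    # Raises ValueError if no cached bblock data exists for the key.
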
    def query(self, query, *, useclass=True, max_bblocks=150, shuffle=True):
        """query for BBlocks

        match name, _type, _class
        if one match, use it
        if _type and _class match, check useclass option
        Het:NNCx/y require exact number or require extra

        Args:
            query (str): query string
            useclass (bool, optional): prefer class matches over type matches
            max_bblocks (int, optional): max number of bblocks returned
            shuffle (bool, optional): shuffle names before truncating to max_bblocks

        Returns:
            list: _BBlock instances matching the query
        """
        names = self.query_names(query, useclass=useclass)
        if len(names) > max_bblocks:
            if shuffle:
                random.shuffle(names)
            names = names[:max_bblocks]
        return [self.bblock(myhash(n)) for n in names]

    def query_names(self, query, *, useclass=True):
        """query for names only"""
        if query.lower() == "all":
            return [db['file'] for db in self._alldb]
        query, subq = query.split(':') if query.count(':') else (query, None)
        if subq is None:
            c_hits = [db['file'] for db in self._alldb if query in db['class']]
            n_hits = [db['file'] for db in self._alldb if query == db['name']]
            t_hits = [db['file'] for db in self._alldb if query == db['type']]
            if not c_hits and not n_hits:
                return t_hits
            if not c_hits and not t_hits:
                return n_hits
            if not t_hits and not n_hits:
                return c_hits
            if not n_hits:
                return c_hits if useclass else t_hits
            assert False, 'invalid database or query'
        else:
            excon = None
            if subq.endswith('X'):
                excon = True
            if subq.endswith('Y'):
                excon = False
            hits = list()
            assert query == 'Het'
            for db in self._alldb:
                if query not in db['class']:
                    continue
                nc = [_ for _ in db['connections'] if _['direction'] == 'C']
                nn = [_ for _ in db['connections'] if _['direction'] == 'N']
                nc, tc = len(nc), subq.count('C')
                nn, tn = len(nn), subq.count('N')
                if nc >= tc and nn >= tn:
                    if nc + nn == tc + tn and excon is not True:
                        hits.append(db['file'])
                    elif nc + nn > tc + tn and excon is not False:
                        hits.append(db['file'])
            return hits

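    # Query-string examples (illustrative, derived from the matching rules above):
    #   db.query_names('all')        # every entry in the database
    #   db.query_names('Het:CC')     # 'Het' entries with >= 2 C-direction connections
    #   db.query_names('Het:NNCY')   # exactly two N and one C connection (Y = exact)
    #   db.query_names('Het:NNCX')   # more connections than just N,N,C (X = require extra)
    #   db.query_names('C3')         # any other string is matched against 'class',
    #                                # 'name', and 'type'; useclass prefers class hits
    #                                # when both class and type match
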
    def load_cached_pose_into_memory(self, pdbfile):
        posefile = self.posefile(pdbfile)
        try:
            with open(posefile, 'rb') as f:
                try:
                    self._poses_cache[pdbfile] = pickle.load(f)
                    return True
                except EOFError:
                    warning('corrupt pickled pose will be replaced %s', posefile)
                    os.remove(posefile)
                    return False
        except FileNotFoundError:
            return False

    def bblockfile(self, pdbkey):
        assert not isinstance(pdbkey, str)
        return os.path.join(self.cachedir, 'bblock', '%016x.pickle' % pdbkey)

    def load_cached_bblock_into_memory(self, pdbkey):
        assert not isinstance(pdbkey, (str, bytes))
        if not isinstance(pdbkey, int):
            success = True
            for f in pdbkey:
                success &= self.load_cached_bblock_into_memory(f)
            return success
        bblockfile = self.bblockfile(pdbkey)
        try:
            with open(bblockfile, 'rb') as f:
                bbstate = list(pickle.load(f))
                self._bblock_cache[pdbkey] = _BBlock(*bbstate)
                return True
        except FileNotFoundError:
            return False

    def posefile(self, pdbfile):
        return os.path.join(self.cachedir, 'poses', flatten_path(pdbfile))

    def load_from_pdbs(self):
        shuffle(self._alldb)
        if self.nprocs == 1:
            with util.InProcessExecutor() as exe:
                result = self.load_from_pdbs_inner(exe)
        else:
            with ThreadPoolExecutor(max_workers=self.nprocs) as exe:
                result = self.load_from_pdbs_inner(exe)
        new = [_[0] for _ in result if _[0]]
        missing = [_[1] for _ in result if _[1]]
        for miss in missing:
            self._alldb.remove(self.dictdb[miss])
            del self.dictdb[miss]
        return len(new), len(missing)

    def load_from_pdbs_inner(self, exe):
        # return exe.map(self.build_pdb_data, self._alldb)
        shuffle(self._alldb)
        r = []
        kwargs = {
            'total': len(self._alldb),
            'unit': 'pdbs',
            # 'unit_scale': True,
            'leave': True
        }
        futures = [exe.submit(self.build_pdb_data, e) for e in self._alldb]
        work = as_completed(futures)
        if self.verbosity > 1:
            work = tqdm(work, 'building pdb data', **kwargs)
        for f in work:
            r.append(f.result())
        return r

    def build_pdb_data(self, entry):
        """return Nnew, Nmissing"""
        pdbfile = entry['file']
        pdbkey = myhash(pdbfile)
        cachefile = self.bblockfile(pdbkey)
        posefile = self.posefile(pdbfile)
        if os.path.exists(cachefile):
            assert self.load_cached_bblock_into_memory(pdbkey)
            if self.load_poses:
                assert self.load_cached_pose_into_memory(pdbfile)
            return None, None  # new, missing
        elif self.read_new_pdbs:
            self.check_lock_cachedir()
            read_pdb = False
            # info('BBlockDB.build_pdb_data reading %s' % pdbfile)
            pose = self.pose(pdbfile)
            ss = Dssp(pose).get_dssp_secstruct()
            bblock = BBlock(entry, pdbfile, pdbkey, pose, ss)
            self._bblock_cache[pdbfile] = bblock
            with open(cachefile, 'wb') as f:
                pickle.dump(bblock._state, f)
            if not os.path.exists(posefile):
                with open(posefile, 'wb') as f:
                    pickle.dump(pose, f)
                info('dumped _bblock_cache files for %s' % pdbfile)
            if self.load_poses:
                self._poses_cache[pdbfile] = pose
            return pdbfile, None  # new, missing
        else:
            warning('no cached data for: ' + pdbfile)
            return None, pdbfile  # new, missing
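
A rough end-to-end sketch of driving BBlockDB (the json filename and query strings are placeholders; building new cache entries requires pyrosetta):

# illustrative only -- 'my_database.json' and the queries below are placeholders
db = BBlockDB(
    bakerdb_files=['my_database.json'],
    read_new_pdbs=True,     # build bblock/pose cache entries for uncached pdbs
    lazy=False,             # load everything up front instead of on demand
)
names = db.query_names('Het:NNX')              # pdb files matching the query
bblocks = db.query('Het:NNX', max_bblocks=50)  # up to 50 _BBlock instances
if names:
    pose = db.pose(names[0])                   # Rosetta pose, from cache or pdb file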