Source code for metaseq.filetype_adapters

"""
This module provides classes that make a file format conform to a uniform API.
These are not generally needed by end-users, rather, they are used internally
by higher-level code like :mod:`metaseq.genomic_signal`.

File-type adapters accept a filename of the appropriate format (which is not
checked) as the only argument to their constructor.

Subclasses must define __getitem__ to accept a pybedtools.Interval and return
an iterator of pybedtools.Intervals

Subclasses must define make_fileobj(), which returns an object to be iterated
over in __getitem__
"""
from bx.bbi.bigbed_file import BigBedFile
from bx.bbi.bigwig_file import BigWigFile
from bx.intervals.io import StrandFormatError
import numpy as np
import subprocess
import pysam
import pybedtools
import os
import sys
from textwrap import dedent

strand_lookup = {16: '-', 0: '+'}


[docs]class BaseAdapter(object):
    """
    Base class for filetype adapters
    """
[docs]    def __init__(self, fn):
        self.fn = fn
        self.fileobj = None
        self.fileobj = self.make_fileobj()

    def __getitem__(self, key):
        raise ValueError('Subclasses must define __getitem__')

    def make_fileobj(self):
        raise ValueError('Subclasses must define make_fileobj')


[docs]class BamAdapter(BaseAdapter):
    """
    Adapter that provides random access to BAM objects using Pysam
    """
[docs]    def __init__(self, fn):
        super(BamAdapter, self).__init__(fn)

    def make_fileobj(self):
        return pysam.Samfile(self.fn, 'rb')

    def __getitem__(self, key):
        iterator = self.fileobj.fetch(
            str(key.chrom),
            key.start,
            key.stop)
        for r in iterator:
            start = r.pos
            curr_end = r.pos
            for op, bp in r.cigar:
                start = curr_end
                curr_end += bp
                if op == 0:
                    interval = pybedtools.Interval(
                        self.fileobj.references[r.rname],
                        start,
                        curr_end,
                        strand=strand_lookup[r.flag & 0x0010])
                    interval.file_type = 'bed'
                    yield interval


[docs]class BedAdapter(BaseAdapter):
    """
    Adapter that provides random access to BED files via Tabix
    """
[docs]    def __init__(self, fn):
        super(BedAdapter, self).__init__(fn)

    def make_fileobj(self):
        obj = pybedtools.BedTool(self.fn)
        if not obj._tabixed():
            obj = obj.sort().tabix(in_place=False, force=False, is_sorted=True)
            self.fn = obj.fn
        return obj

    def __getitem__(self, key):
        bt = self.fileobj.tabix_intervals(
            '%s:%s-%s' % (key.chrom, key.start, key.stop))
        for i in bt:
            yield i
        del bt


[docs]class BigBedAdapter(BaseAdapter):
    """
    Adapter that provides random access to bigBed files via bx-python
    """
[docs]    def __init__(self, fn):
        super(BigBedAdapter, self).__init__(fn)

    def make_fileobj(self):
        return BigBedFile(open(self.fn))

    def __getitem__(self, key):
        chrom = key.chrom
        start = key.start
        stop = key.end
        try:
            bx_intervals = self.fileobj.get(chrom, start, stop)
        except StrandFormatError:
            raise NotImplementedError(dedent(
                """
                It appears you have a version of bx-python where bigBed files
                are temporarily unsupported due to recent changes in the
                bx-python dependency. In the meantime, please convert bigBed to
                BAM like this:

                    bigBedToBed {0} tmp.bed
                    bedtools bedtobam -i tmp.bed > {0}.bam

                and create a genomic signal object using this {0}.bam file.
                """.format(self.fn)))
        if bx_intervals is None:
            raise StopIteration
        for i in bx_intervals:
            interval = pybedtools.create_interval_from_list(i.fields)
            interval.file_type = 'bed'
            yield interval


[docs]class BigWigAdapter(BaseAdapter):
    """
    Adapter that provides random access to bigWig files bia bx-python
    """
[docs]    def __init__(self, fn):
        super(BigWigAdapter, self).__init__(fn)

    def make_fileobj(self):
        return self.fn

    def __getitem__(self, key):
        raise NotImplementedError(
            "__getitem__ not implemented for %s" % self.__class__.__name__)

    def summarize(self, interval, bins=None, method='summarize',
                  function='mean'):

        # We may be dividing by zero in some cases, which raises a warning in
        # NumPy based on the IEEE 754 standard (see
        # http://docs.scipy.org/doc/numpy/reference/generated/
        #       numpy.seterr.html)
        #
        # That's OK -- we're expecting that to happen sometimes. So temporarily
        # disable this error reporting for the duration of this method.
        orig = np.geterr()['invalid']
        np.seterr(invalid='ignore')

        if (bins is None) or (method == 'get_as_array'):
            bw = BigWigFile(open(self.fn))
            s = bw.get_as_array(
                interval.chrom,
                interval.start,
                interval.stop,)
            if s is None:
                s = np.zeros((interval.stop - interval.start,))
            else:
                s[np.isnan(s)] = 0

        elif method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            else:
                raise ValueError('function "%s" not supported by UCSC\'s'
                                 'bigWigSummary')

        else:
            bw = BigWigFile(open(self.fn))
            s = bw.summarize(
                interval.chrom,
                interval.start,
                interval.stop, bins)
            if s is None:
                s = np.zeros((bins,))
            else:
                if function == 'sum':
                    s = s.sum_data
                if function == 'mean':
                    s = s.sum_data / s.valid_count
                    s[np.isnan(s)] = 0
                if function == 'min':
                    s = s.min_val
                    s[np.isinf(s)] = 0
                if function == 'max':
                    s = s.max_val
                    s[np.isinf(s)] = 0
                if function == 'std':
                    s = (s.sum_squares / s.valid_count)
                    s[np.isnan(s)] = 0

        # Reset NumPy error reporting
        np.seterr(divide=orig)
        return s

    def ucsc_summarize(self, interval, bins=None, function='mean'):
        if bins is None:
            bins = len(interval)
        y = np.zeros(bins)

        cmds = [
            'bigWigSummary',
            self.fn,
            interval.chrom,
            str(interval.start),
            str(interval.stop),
            str(bins),
            '-type=%s' % function]
        p = subprocess.Popen(
            cmds,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        def gen():
            try:
                for line in p.stdout:
                    yield line
            finally:
                if p.poll() is None:
                    return
                else:
                    p.wait()
                    err = p.stderr.read().strip()
                    if p.returncode not in (0, None):
                        if err.startswith('no data'):
                            return
                        raise ValueError(
                            "cmds: %s: %s" %
                            (' '.join(cmds), p.stderr.read()))
                    if len(err) != 0:
                        sys.stderr.write(err)

        for line in gen():
            for i, x in enumerate(line.split('\t')):
                try:
                    y[i] = float(x)
                except ValueError:
                    pass
        return np.array(y)