Source code for pybedtools.contrib.venn_maker

"""
Interface between pybedtools and the R package VennDiagram.

Rather than depend on the user to have rpy2 installed, this simply writes an
R script that can be edited and tweaked by the user before being run in R.
"""
import os
import string
import pybedtools
from pybedtools import helpers
import subprocess
from collections import OrderedDict

# really just fill in x and filename...leave the rest up to the user.
#
# Note that the closing parentheses is missing -- that's so the user can add
# kwargs from the calling function
template = string.Template(
    """
library(VennDiagram)
venn.diagram(
    x=$x,
    filename=$filename,
    category.names = $names
"""
)


def _list_to_R_syntax(x):
    """
    Convert items in `x` to a string, and replace tabs with pipes in Interval
    string representations.  Put everything into an R vector and return as one
    big string.
    """
    items = []
    for i in x:
        if isinstance(i, pybedtools.Interval):
            i = str(i).replace("\t", "|")
        items.append('"%s"' % i)
    return "c(%s)" % ",".join(items)


def _dict_to_R_named_list(d):
    """
    Calls _list_to_R_syntax for each item.  Returns one big string.
    """
    items = []
    for key, val in list(d.items()):
        items.append('"%s" = %s' % (key, _list_to_R_syntax(val)))
    return "list(%s)" % ", ".join(items)


def truncator(feature):
    """
    Convert a feature of any format into a BED3 format.
    """
    return pybedtools.create_interval_from_list(
        [feature.chrom, str(feature.start), str(feature.stop)]
    )


[docs] def cleaned_intersect(items): """ Perform interval intersections such that the end products have identical \ features for overlapping intervals. The VennDiagram package does *set* intersection, not *interval* intersection. So the goal here is to represent intersecting intervals as intersecting sets of strings. Doing a simple BEDTools intersectBed call doesn't do the trick (even with the -u argument). As a concrete example, what would the string be for an intersection of the feature "chr1:1-100" in file `x` and "chr1:50-200" in file `y`? The method used here is to substitute the intervals in `y` that overlap `x` with the corresponding elements in `x`. This means that in the resulting sets, the overlapping features are identical. To follow up with the example, both `x` and `y` would have an item "chr1:50-200" in their sets, simply indicating *that* one interval overlapped. Venn diagrams are not well suited for nested overlaps or multi-overlaps. To illustrate, try drawing the 2-way Venn diagram of the following two files. Specifically, what number goes in the middle -- the number of features in `x` that intersect `y` (1) or the number of features in `y` that intersect `x` (2)?:: x: chr1 1 100 chr1 500 6000 y: chr1 50 100 chr1 80 200 chr9 777 888 In this case, this function will return the following sets:: x: chr1:1-100 chr1:500-6000 y: chr1:1-100 chr9:777-888 This means that while `x` does not change in length, `y` can. For example, if there are 2 features in `x` that overlap one feature in `y`, then `y` will gain those two features in place of its single original feature. This strategy is extended for multiple intersections -- see the source for details. """ if len(items) == 2: x = items[0].each(truncator).saveas() y = items[1].each(truncator).saveas() # Combine the unique-to-y intervals with the shared-with-x intervals. # Since x is first in x+y, resulting features are from x. new_y = (y - x).cat(x + y) return x, new_y if len(items) == 3: x = items[0].each(truncator).saveas() y = items[1].each(truncator).saveas() z = items[2].each(truncator).saveas() # Same as above. Don't care about z yet; this means that y will not # change because of z. new_y = (y - x).cat(x + y) # Combine: # unique-to-z # shared-with-any-x # shared-with-unique-to-y new_z = (z - y - x).cat(x + z).cat((y - x) + z) return x, new_y, new_z if len(items) == 4: x = items[0].each(truncator).saveas() y = items[1].each(truncator).saveas() z = items[2].each(truncator).saveas() q = items[3].each(truncator).saveas() # Same as 2-way new_y = (y - x).cat(x + y) # Same as 3-way new_z = (z - y - x).cat(x + z).cat((y - x) + z) # Combine: # unique-to-q # shared-with-any-x # shared-with-unique-to-y # shared-with-unique-to-z new_q = (q - z - y - x).cat(x + q).cat((y - x) + q).cat((z - y - x) + q) return x, new_y, new_z, new_q
[docs] def venn_maker( beds, names=None, figure_filename=None, script_filename=None, additional_args=None, run=False, ): """ Given a list of interval files, write an R script to create a Venn \ diagram of overlaps (and optionally run it). The R script calls the venn.diagram function of the R package VennDiagram for extremely flexible Venn and Euler diagram creation. Uses `cleaned_intersect()` to create string representations of shared intervals. `beds` is a list of up to 4 filenames or BedTools. `names` is a list of names to use for the Venn diagram, in the same order as `beds`. Default is "abcd"[:len(beds)]. `figure_filename` is the TIFF file to save the figure as. `script_filename` is the optional filename to write the R script to `additional_args` is list that will be inserted into the R script, verbatim. For example, to use scaled Euler diagrams with different colors, use:: additional_args = ['euler.d=TRUE', 'scaled=TRUE', 'cat.col=c("red","blue")'] If `run` is True, then assume R is installed, is on the path, and has VennDiagram installed . . . and run the script. The resulting filename will be saved as `figure_filename`. """ if figure_filename is None: figure_filename = "NULL" else: figure_filename = '"%s"' % figure_filename if names is None: names = "abcd"[: len(beds)] _beds = [] for bed in beds: if not isinstance(bed, pybedtools.BedTool): bed = pybedtools.BedTool(bed) _beds.append(bed) cleaned = cleaned_intersect(_beds) results = OrderedDict(list(zip(names, cleaned))) s = template.substitute( x=_dict_to_R_named_list(results), filename=figure_filename, names=_list_to_R_syntax(names), ) if additional_args: s += "," + ", ".join(additional_args) s += ")" if not script_filename: fn = pybedtools.BedTool._tmp() else: fn = script_filename fout = open(fn, "w") fout.write(s) fout.close() out = fn + ".Rout" if run: if not pybedtools.settings._R_installed: helpers._check_for_R() cmds = [os.path.join(pybedtools.settings._R_path, "R"), "CMD", "BATCH", fn, out] p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() if stdout or stderr: print("stdout:", stdout) print("stderr:", stderr) if not script_filename: return s return None