Source code for gffutils.inspect

from gffutils import iterators
from gffutils import interface
from collections import Counter
import sys


def inspect(
    data,
    look_for=["featuretype", "chrom", "attribute_keys", "feature_count"],
    limit=None,
    verbose=True,
):
    """
    Inspect a GFF or GTF data source.

    This function is useful for figuring out the different featuretypes found
    in a file (for potential removal before creating a FeatureDB).

    Returns a dictionary with a key for each item in `look_for` and
    a corresponding value that is a dictionary of how many of each unique item
    were found.

    There will always be a `feature_count` key, indicating how many features
    were looked at (if `limit` is provided, then `feature_count` will be at
    most `limit`; it is smaller only when the data source runs out first).

    For example, if `look_for` is ['chrom', 'featuretype'], then the result
    will be a dictionary like::

        {
            'chrom': {
                'chr1': 500,
                'chr2': 435,
                'chr3': 200,
                ...
                ...
            },

            'featuretype': {
                'gene': 150,
                'exon': 324,
                ...
            },

            'feature_count': 5000

        }

    Parameters
    ----------
    data : str, FeatureDB instance, or iterator of Features
        If `data` is a string, assume it's a GFF or GTF filename.  If it's
        a FeatureDB instance, then its `all_features()` method will be
        automatically called. Otherwise, assume it's an iterable of Feature
        objects.

    look_for : list
        List of things to keep track of. Options are:

            - any attribute of a Feature object, such as chrom, source, start,
              stop, strand.

            - "attribute_keys", which will look at all the individual
              attribute keys of each feature

        Duplicate entries are counted only once.

    limit : int
        Number of features to look at.  Default is no limit (a falsy value
        such as None or 0 disables the limit).

    verbose : bool
        Report how many features have been processed.  Progress is written to
        stderr, rewriting the same line via a leading carriage return.

    Returns
    -------
    dict
    """
    # These two entries are handled specially rather than read off each
    # feature with getattr().
    special = ("attribute_keys", "feature_count")

    # Keyed by the requested item, so duplicates in `look_for` collapse into a
    # single counter and are not counted more than once per feature.  The
    # default `look_for` list is only read here, never mutated.
    results = {item: Counter() for item in look_for}
    obj_attrs = [item for item in results if item not in special]
    attr_keys = "attribute_keys" in results

    feature_count = 0
    for f in iterators.DataIterator(data):
        for obj_attr in obj_attrs:
            results[obj_attr][getattr(f, obj_attr)] += 1

        if attr_keys:
            results["attribute_keys"].update(f.attributes.keys())

        feature_count += 1

        # Report after counting so the message reflects the number of features
        # actually inspected (including the final one).
        if verbose:
            sys.stderr.write("\r%s features inspected" % feature_count)
            sys.stderr.flush()

        if limit and feature_count == limit:
            break

    # Convert the Counters to plain dicts for the caller.
    new_results = {k: dict(v) for k, v in results.items()}

    # Always report the total, overriding any placeholder counter created for
    # a "feature_count" entry in `look_for`.
    new_results["feature_count"] = feature_count
    return new_results