Source code for dandelion.external.scanpy

import re
import scipy.stats

import numpy as np
import pandas as pd
import scanpy as sc

from anndata import AnnData
from sklearn import mixture

from dandelion.utilities._utilities import bh


[docs] def recipe_scanpy_qc( adata: AnnData, layer: str | None = None, mito_startswith: str = "MT-", max_genes: int = 2500, min_genes: int = 200, mito_cutoff: int | None = 5, run_scrublet: bool = True, pval_cutoff: float = 0.1, min_counts: int | None = None, max_counts: int | None = None, blacklist: list[str] | None = None, vdj_pattern: str = "^TR[AB][VDJ]|^IG[HKL][VDJC]", ): """ Recipe for running a standard scanpy QC workflow. Parameters ---------- adata : AnnData annotated data matrix of shape n_obs × n_vars. Rows correspond to cells and columns to genes. layer : str | None, optional name of layer to run scrublet on if supplied. mito_startswith : str, optional string pattern used for searching mitochondrial genes. max_genes : int, optional maximum number of genes expressed required for a cell to pass filtering min_genes : int, optional minimum number of genes expressed required for a cell to pass filtering mito_cutoff : int | None, optional maximum percentage mitochondrial content allowed for a cell to pass filtering. run_scrublet : bool, optional whether or not to run scrublet for doublet detection. pval_cutoff : float, optional maximum Benjamini-Hochberg corrected p value from doublet detection protocol allowed for a cell to pass filtering. Default is 0.1. min_counts : int | None, optional minimum number of counts required for a cell to pass filtering. max_counts : int | None, optional maximum number of counts required for a cell to pass filtering. blacklist : list[str] | None, optional if provided, will exclude these genes from highly variable genes list. vdj_pattern : str, optional string pattern for search VDJ genes to exclude from highly variable genes. Raises ------ ImportError if scrublet not installed. """ _adata = adata.copy() # run basic scanpy pipeline sc.pp.filter_cells(_adata, min_genes=0) _adata.var["mt"] = _adata.var_names.str.startswith(mito_startswith) sc.pp.calculate_qc_metrics( _adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True ) if mito_cutoff is None: # use a model-based method to determine the cut off # for mitochondrial content gmm = mixture.GaussianMixture( n_components=2, max_iter=1000, covariance_type="full" ) X = _adata.obs[["pct_counts_mt", "n_genes_by_counts"]] _adata.obs["gmm_pct_count_clusters"] = gmm.fit(X).predict(X) # use a simple metric to workout which cluster # is the one that contains lower mito content? A1 = ( _adata[_adata.obs["gmm_pct_count_clusters"] == 0] .obs["pct_counts_mt"] .mean() ) B1 = ( _adata[_adata.obs["gmm_pct_count_clusters"] == 1] .obs["pct_counts_mt"] .mean() ) A2 = ( _adata[_adata.obs["gmm_pct_count_clusters"] == 0] .obs["n_genes_by_counts"] .mean() ) B2 = ( _adata[_adata.obs["gmm_pct_count_clusters"] == 1] .obs["n_genes_by_counts"] .mean() ) if (A1 > B1) and (A2 < B2): keepdict = {0: False, 1: True} else: keepdict = {1: False, 0: True} _adata.obs["gmm_pct_count_clusters_keep"] = [ keepdict[x] for x in _adata.obs["gmm_pct_count_clusters"] ] if run_scrublet: # run scrublet try: import scrublet as scr except ImportError: raise ImportError( "Please install scrublet with pip install scrublet." ) if layer is None: scrub = scr.Scrublet(_adata.X) else: scrub = scr.Scrublet(_adata.layers[layer]) doublet_scores, _ = scrub.scrub_doublets(verbose=False) _adata.obs["scrublet_score"] = doublet_scores # overcluster prep. sc.pp.normalize_total(_adata, target_sum=1e4) sc.pp.log1p(_adata) sc.pp.highly_variable_genes( _adata, min_mean=0.0125, max_mean=3, min_disp=0.5 ) for i in _adata.var.index: if vdj_pattern is not None: if re.search(vdj_pattern, i): _adata.var.at[i, "highly_variable"] = False if blacklist is not None: if i in blacklist: _adata.var.at[i, "highly_variable"] = False _adata = _adata[:, _adata.var["highly_variable"]].copy() sc.pp.scale(_adata, max_value=10) sc.tl.pca(_adata, svd_solver="arpack") sc.pp.neighbors(_adata, n_neighbors=10, n_pcs=50) # overclustering proper - do basic clustering first, # then cluster each cluster sc.tl.leiden(_adata) for clus in list(np.unique(_adata.obs["leiden"]))[0]: sc.tl.leiden( _adata, restrict_to=("leiden", [clus]), key_added="leiden_R" ) # weird how the new anndata/scanpy is forcing this for clus in list(np.unique(_adata.obs["leiden"]))[1:]: sc.tl.leiden( _adata, restrict_to=("leiden_R", [clus]), key_added="leiden_R" ) # compute the cluster scores - the median of Scrublet scores per # overclustered cluster for clus in np.unique(_adata.obs["leiden_R"]): _adata.obs.loc[ _adata.obs["leiden_R"] == clus, "scrublet_cluster_score" ] = np.median( _adata.obs.loc[_adata.obs["leiden_R"] == clus, "scrublet_score"] ) # now compute doublet p-values. figure out the median and mad # (from above-median values) for the distribution med = np.median(_adata.obs["scrublet_cluster_score"]) mask = _adata.obs["scrublet_cluster_score"] > med mad = np.median(_adata.obs["scrublet_cluster_score"][mask] - med) # 1 sided test for catching outliers pvals = 1 - scipy.stats.norm.cdf( _adata.obs["scrublet_cluster_score"], loc=med, scale=1.4826 * mad ) _adata.obs["scrublet_score_bh_pval"] = bh(pvals) # threshold the p-values to get doublet calls. _adata.obs["is_doublet"] = ( _adata.obs["scrublet_score_bh_pval"] < pval_cutoff ) else: _adata.obs["is_doublet"] = False if mito_cutoff is not None: if min_counts is None and max_counts is None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | (_adata.obs["pct_counts_mt"] >= mito_cutoff) | (_adata.obs.is_doublet) ) else: if min_counts is not None: if max_counts is not None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ min_counts < n > max_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | (_adata.obs["pct_counts_mt"] >= mito_cutoff) | (_adata.obs.is_doublet) ) else: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ n < min_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | (_adata.obs["pct_counts_mt"] >= mito_cutoff) | (_adata.obs.is_doublet) ) else: if max_counts is not None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ n > max_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | (_adata.obs["pct_counts_mt"] >= mito_cutoff) | (_adata.obs.is_doublet) ) else: if min_counts is None and max_counts is None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ~(_adata.obs.gmm_pct_count_clusters_keep) | (_adata.obs.is_doublet) ) else: if min_counts is not None: if max_counts is not None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ min_counts < n > max_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | ~(_adata.obs.gmm_pct_count_clusters_keep) | (_adata.obs.is_doublet) ) else: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ n < min_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | ~(_adata.obs.gmm_pct_count_clusters_keep) | (_adata.obs.is_doublet) ) else: if max_counts is not None: _adata.obs["filter_rna"] = ( ( pd.Series( [ ((n < min_genes) or (n > max_genes)) for n in _adata.obs["n_genes_by_counts"] ], index=_adata.obs.index, ) ) | ( pd.Series( [ n > max_counts for n in _adata.obs["total_counts"] ], index=_adata.obs.index, ) ) | ~(_adata.obs.gmm_pct_count_clusters_keep) | (_adata.obs.is_doublet) ) bool_dict = {True: "True", False: "False"} _adata.obs["is_doublet"] = [bool_dict[x] for x in _adata.obs["is_doublet"]] _adata.obs["filter_rna"] = [bool_dict[x] for x in _adata.obs["filter_rna"]] # removing columns that probably don't need anymore if mito_cutoff is not None: _adata.obs = _adata.obs.drop( [ "leiden", "leiden_R", "scrublet_cluster_score", "scrublet_score_bh_pval", ], axis=1, ) else: _adata.obs = _adata.obs.drop( [ "leiden", "leiden_R", "scrublet_cluster_score", "scrublet_score_bh_pval", "gmm_pct_count_clusters", ], axis=1, ) if not run_scrublet: _adata.obs = _adata.obs.drop(["is_doublet"], axis=1) adata.obs = _adata.obs.copy()