Source code for dandelion.external.immcantation.base.scoper

import warnings

import pandas as pd

from typing import Literal

from dandelion.base.core._core import Dandelion, load_data
from dandelion.utilities._utilities import (
    sanitize_data,
    sanitize_data_for_saving,
)


[docs] def identical_clones( vdj: Dandelion, method: Literal["nt", "aa"] = "nt", junction: str = "junction", v_call: str = "v_call", j_call: str = "j_call", clone_key: str = "clone_id", fields: list[str] | None = None, cell_id: str | None = "cell_id", locus: str = "locus", only_heavy: bool = True, split_light: bool = True, first: bool = False, cdr3: bool = False, mod3: bool = False, max_n: int | None = 0, nproc: int = 1, verbose: bool = False, summarize_clones: bool = True, remove_ambiguous: bool = True, remove_extra: bool = True, ) -> None: """ Clonal assignment using sequence identity partitioning. https://scoper.readthedocs.io/en/stable/topics/identicalClones/ This is a wrapper for one of scoper's method to perform clone clustering. From the original description: identicalClones provides a simple sequence identity based partitioning approach for inferring clonal relationships in high-throughput Adaptive Immune Receptor Repertoire sequencing (AIRR-seq) data. This approach partitions B or T cell receptor sequences into clonal groups based on junction region sequence identity within partitions that share the same V gene, J gene, and junction length, allowing for ambiguous V or J gene annotations. see also https://scoper.readthedocs.io/en/stable/vignettes/Scoper-Vignette/ Parameters ---------- vdj : Dandelion a dandelion object containing the airr data. method : Literal["nt", "aa"], optional one of the "nt" for nucleotide based clustering or "aa" for amino acid based clustering. junction : str, optional character name of the column containing junction sequences. Also used to determine sequence length for grouping. v_call : str, optional name of the column containing the V-segment allele calls. j_call : str, optional name of the column containing the J-segment allele calls. clone_key : str, optional output column name containing the clonal cluster identifiers. fields : list[str], optional character vector of additional columns to use for grouping. Sequences with disjoint values in the specified fields will be classified as separate clones. cell_id : str | None, optional name of the column containing cell identifiers or barcodes. If specified, grouping will be performed in single-cell mode with the behavior governed by the locus and only_heavy arguments. If set to None then the bulk sequencing data is assumed. locus : str, optional name of the column containing locus information. only_heavy : bool, optional use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping. split_light : bool, optional split clones by light chains. first : bool, optional specifies how to handle multiple V(D)J assignments for initial grouping. If True only the first call of the gene assignments is used. If False the union of ambiguous gene assignments is used to group all sequences with any overlapping gene calls. cdr3 : bool, optional if True removes 3 nucleotides from both ends of "junction" prior to clustering (converts IMGT junction to CDR3 region). If True this will also remove records with a junction length less than 7 nucleotides. mod3 : bool, optional if True removes records with a junction length that is not divisible by 3 in nucleotide space. max_n : int | None, optional The maximum number of degenerate characters to permit in the junction sequence before excluding the record from clonal assignment. Default is set to be zero. Set it as "None" for no action. nproc : int, optional number of cores to distribute the function over. verbose : bool, optional if True prints out a summary of each step cloning process. if False (default) process cloning silently. summarize_clones : bool, optional if True performs a series of analysis to assess the clonal landscape and returns a ScoperClones object. If False then a modified input db is returned. When grouping by fields, summarize_clones should be False. remove_ambiguous : bool, optional if True removes contigs with ambiguous V(D)J assignments flagged by `check_contigs`. remove_extra : bool, optional if True removes extra contigs flagged by `check_contigs`. """ try: from rpy2.robjects.packages import importr from rpy2.rinterface import NULL from rpy2.robjects import r except: raise ( ImportError( "Unable to initialise R instance. Please run this separately through R with scoper's tutorials." ) ) scp = importr("scoper") db = load_data(vdj._data) warnings.filterwarnings("ignore") # sanitize before passing to R db = sanitize_data(db) if remove_ambiguous: if "ambiguous" in db: db = db[db["ambiguous"] == "F"].copy() if remove_extra: if "extra" in db: db = db[db["extra"] == "F"].copy() # sanitize before passing to R db, _ = sanitize_data_for_saving(db) fields = NULL if fields is None else fields cell_id = NULL if cell_id is None else cell_id db_r = safe_py2rpy(db) results = scp.identicalClones( db=db_r, method=method, junction=junction, v_call=v_call, j_call=j_call, clone=clone_key, fields=fields, cell_id=cell_id, locus=locus, only_heavy=only_heavy, split_light=split_light, first=first, cdr3=cdr3, mod3=mod3, max_n=max_n, nproc=nproc, verbose=verbose, summarize_clones=summarize_clones, ) results_dataframe = r["as.data.frame"](results) df = safe_rpy2py(results_dataframe) vdj._data = df.copy() vdj.update_metadata( clone_key=clone_key, retrieve=clone_key, retrieve_mode="merge and unique only", )
[docs] def hierarchical_clones( vdj: Dandelion, threshold: float, method: Literal["nt", "aa"] = "nt", linkage: Literal["single", "average", "complete"] = "single", normalize: Literal["len", "none"] = "len", junction: str = "junction", v_call: str = "v_call", j_call: str = "j_call", clone_id: str = "clone_id", fields: list[str] | None = None, cell_id: str | None = "cell_id", locus: str = "locus", only_heavy: bool = True, split_light: bool = True, first: bool = False, cdr3: bool = False, mod3: bool = False, max_n: int | None = 0, nproc: int = 1, verbose: bool = False, summarize_clones: bool = True, remove_ambiguous: bool = True, remove_extra: bool = True, ) -> None: """ Hierarchical clustering approach to clonal assignment. https://scoper.readthedocs.io/en/stable/topics/hierarchicalClones/ This is a wrapper for one of scoper's method to perform clone clustering. From the original description: hierarchicalClones provides a hierarchical agglomerative clustering approach to infer clonal relationships in high-throughput Adaptive Immune Receptor Repertoire sequencing (AIRR-seq) data. This approach clusters B or T cell receptor sequences based on junction region sequence similarity within partitions that share the same V gene, J gene, and junction length, allowing for ambiguous V or J gene annotations. see also https://scoper.readthedocs.io/en/stable/vignettes/Scoper-Vignette/ Parameters ---------- vdj : Dandelion a dandelion object containing the airr data. threshold : float numeric scalar where the tree should be cut (the distance threshold for clonal grouping). method : Literal["nt", "aa"], optional one of the "nt" for nucleotide based clustering or "aa" for amino acid based clustering. linkage : Literal["single", "average", "complete"], optional one of the "single", "average" or "complete" for the hierarchical clustering method. normalize : Literal["len", "none"], optional method of normalization. The default is "len", which divides the distance by the length of the sequence group. If "none" then no normalization if performed. junction : str, optional character name of the column containing junction sequences. Also used to determine sequence length for grouping. v_call : str, optional name of the column containing the V-segment allele calls. j_call : str, optional name of the column containing the J-segment allele calls. clone : str, optional output column name containing the clonal cluster identifiers. fields : list[str], optional character vector of additional columns to use for grouping. Sequences with disjoint values in the specified fields will be classified as separate clones. cell_id : str | None, optional name of the column containing cell identifiers or barcodes. If specified, grouping will be performed in single-cell mode with the behavior governed by the locus and only_heavy arguments. If set to None then the bulk sequencing data is assumed. locus : str, optional name of the column containing locus information. only_heavy : bool, optional use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping. split_light : bool, optional split clones by light chains. first : bool, optional specifies how to handle multiple V(D)J assignments for initial grouping. If True only the first call of the gene assignments is used. If False the union of ambiguous gene assignments is used to group all sequences with any overlapping gene calls. cdr3 : bool, optional if True removes 3 nucleotides from both ends of "junction" prior to clustering (converts IMGT junction to CDR3 region). If True this will also remove records with a junction length less than 7 nucleotides. mod3 : bool, optional if True removes records with a junction length that is not divisible by 3 in nucleotide space. max_n : int | None, optional The maximum number of degenerate characters to permit in the junction sequence before excluding the record from clonal assignment. Default is set to be zero. Set it as "None" for no action. nproc : int, optional number of cores to distribute the function over. verbose : bool, optional if True prints out a summary of each step cloning process. if False (default) process cloning silently. summarize_clones : bool, optional if True performs a series of analysis to assess the clonal landscape and returns a ScoperClones object. If False then a modified input db is returned. When grouping by fields, summarize_clones should be False. remove_ambiguous : bool, optional if True removes contigs with ambiguous V(D)J assignments flagged by `check_contigs`. remove_extra : bool, optional if True removes extra contigs flagged by `check_contigs`. """ try: from rpy2.robjects.packages import importr from rpy2.rinterface import NULL from rpy2.robjects import r except: raise ( ImportError( "Unable to initialise R instance. Please run this separately through R with scoper's tutorials." ) ) scp = importr("scoper") db = load_data(vdj._data) warnings.filterwarnings("ignore") db = sanitize_data(db) if remove_ambiguous: if "ambiguous" in db: db = db[db["ambiguous"] == "F"].copy() if remove_extra: if "extra" in db: db = db[db["extra"] == "F"].copy() # sanitize before passing to R db, _ = sanitize_data_for_saving(db) fields = NULL if fields is None else fields cell_id = NULL if cell_id is None else cell_id db_r = safe_py2rpy(db) results = scp.hierarchicalClones( db=db_r, threshold=threshold, method=method, linkage=linkage, normalize=normalize, junction=junction, v_call=v_call, j_call=j_call, clone=clone_id, fields=fields, cell_id=cell_id, locus=locus, only_heavy=only_heavy, split_light=split_light, first=first, cdr3=cdr3, mod3=mod3, max_n=max_n, nproc=nproc, verbose=verbose, summarize_clones=summarize_clones, ) results_dataframe = r["as.data.frame"](results) df = safe_rpy2py(results_dataframe) vdj._data = df.copy() vdj.update_metadata( clone_key=clone_id, retrieve=clone_id, retrieve_mode="merge and unique only", )
[docs] def spectral_clones( vdj: Dandelion, method: Literal["novj", "vj"] = "novj", germline: str = "germline_alignment", sequence: str = "sequence_alignment", junction: str = "junction", v_call: str = "v_call", j_call: str = "j_call", clone_id: str = "clone_id", fields: list[str] | None = None, cell_id: str | None = "cell_id", locus: str = "locus", only_heavy: bool = True, split_light: bool = True, first: bool = False, cdr3: bool = False, mod3: bool = False, max_n: int | None = 0, threshold: float | None = None, base_sim: float = 0.95, iter_max: int = 1000, nstart: int = 1000, nproc: int = 1, verbose: bool = False, summarize_clones: bool = True, remove_ambiguous: bool = True, remove_extra: bool = True, ) -> None: """ Spectral clustering method for clonal partitioning. https://scoper.readthedocs.io/en/stable/topics/spectralClones/ This is a wrapper for one of scoper's method to perform clone clustering. spectralClones provides an unsupervised spectral clustering approach to infer clonal relationships in high-throughput Adaptive Immune Receptor Repertoire sequencing (AIRR-seq) data. This approach clusters B or T cell receptor sequences based on junction region sequence similarity and shared mutations within partitions that share the same V gene, J gene, and junction length, allowing for ambiguous V or J gene annotations. This is not a full implementation as additional arguments such as `targeting_model` and `len_limit` requires access to additional objects that needs to be separately created through other packages e.g. `shazam`. As such, we will only implement the default argument were both will be set to `None` (or `NULL` in R). If you want to use this method in its full functionality, please run it separately through R with scoper's tutorial. see also https://scoper.readthedocs.io/en/stable/vignettes/Scoper-Vignette/ Parameters ---------- vdj : Dandelion a dandelion object containing the airr data. threshold : float numeric scalar where the tree should be cut (the distance threshold for clonal grouping). method : Literal["novj", "vj"], optional one of the "novj" or "vj". If method="novj", then clonal relationships are inferred using an adaptive threshold that indicates the level of similarity among junction sequences in a local neighborhood. If method="vj", then clonal relationships are inferred not only on junction region homology, but also taking into account the mutation profiles in the V and J segments. Mutation counts are determined by comparing the input sequences (in the column specified by sequence) to the effective germline sequence (IUPAC representation of sequences in the column specified by germline). germline : str, optional character name of the column containing the germline or reference sequence. sequence : str, optional character name of the column containing input sequences. junction : str, optional character name of the column containing junction sequences. Also used to determine sequence length for grouping. v_call : str, optional name of the column containing the V-segment allele calls. j_call : str, optional name of the column containing the J-segment allele calls. clone : str, optional output column name containing the clonal cluster identifiers. fields : list[str], optional character vector of additional columns to use for grouping. Sequences with disjoint values in the specified fields will be classified as separate clones. cell_id : str | None, optional name of the column containing cell identifiers or barcodes. If specified, grouping will be performed in single-cell mode with the behavior governed by the locus and only_heavy arguments. If set to None then the bulk sequencing data is assumed. locus : str, optional name of the column containing locus information. only_heavy : bool, optional use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping. split_light : bool, optional split clones by light chains. first : bool, optional specifies how to handle multiple V(D)J assignments for initial grouping. If True only the first call of the gene assignments is used. If False the union of ambiguous gene assignments is used to group all sequences with any overlapping gene calls. cdr3 : bool, optional if True removes 3 nucleotides from both ends of "junction" prior to clustering (converts IMGT junction to CDR3 region). If True this will also remove records with a junction length less than 7 nucleotides. mod3 : bool, optional if True removes records with a junction length that is not divisible by 3 in nucleotide space. max_n : int | None, optional The maximum number of degenerate characters to permit in the junction sequence before excluding the record from clonal assignment. Default is set to be zero. Set it as "None" for no action. threshold : float | None, optional the supervising cut-off to enforce an upper-limit distance for clonal grouping. A numeric value between (0,1). base_sim : float, optional required similarity cut-off for sequences in equal distances from each other. iter_max : int, optional the maximum number of iterations allowed for kmean clustering step. nstart : int, optional the number of random sets chosen for kmean clustering initialization. nproc : int, optional number of cores to distribute the function over. verbose : bool, optional if True prints out a summary of each step cloning process. if False (default) process cloning silently. summarize_clones : bool, optional if True performs a series of analysis to assess the clonal landscape and returns a ScoperClones object. If False then a modified input db is returned. When grouping by fields, summarize_clones should be False. remove_ambiguous : bool, optional if True removes contigs with ambiguous V(D)J assignments flagged by `check_contigs`. remove_extra : bool, optional if True removes extra contigs flagged by `check_contigs`. """ try: from rpy2.robjects.packages import importr from rpy2.rinterface import NULL from rpy2.robjects import r except: raise ( ImportError( "Unable to initialise R instance. Please run this separately through R with scoper's tutorials." ) ) scp = importr("scoper") db = load_data(vdj._data) warnings.filterwarnings("ignore") db = sanitize_data(db) if remove_ambiguous: if "ambiguous" in db: db = db[db["ambiguous"] == "F"].copy() if remove_extra: if "extra" in db: db = db[db["extra"] == "F"].copy() # sanitize before passing to R db, _ = sanitize_data_for_saving(db) fields = NULL if fields is None else fields cell_id = NULL if cell_id is None else cell_id threshold = NULL if threshold is None else threshold db_r = safe_py2rpy(db) results = scp.spectralClones( db=db_r, method=method, germline=germline, sequence=sequence, junction=junction, v_call=v_call, j_call=j_call, clone=clone_id, fields=fields, cell_id=cell_id, locus=locus, only_heavy=only_heavy, split_light=split_light, targeting_model=NULL, len_limit=NULL, first=first, cdr3=cdr3, mod3=mod3, max_n=max_n, threshold=threshold, base_sim=base_sim, iter_max=iter_max, nstart=nstart, nproc=nproc, verbose=verbose, summarize_clones=summarize_clones, ) results_dataframe = r["as.data.frame"](results) df = safe_rpy2py(results_dataframe) vdj._data = df.copy() vdj.update_metadata( clone_key=clone_id, retrieve=clone_id, retrieve_mode="merge and unique only", )
def safe_py2rpy(df: pd.DataFrame) -> "rpy2 object": """Convert pandas DataFrame to R object safely.""" try: import rpy2 from rpy2.robjects.conversion import localconverter from rpy2.robjects import pandas2ri except: raise ( ImportError( "Unable to initialise R instance. Please run this separately through R with shazam's tutorials." ) ) try: with localconverter( rpy2.robjects.default_converter + pandas2ri.converter ): return pandas2ri.py2rpy(df) except: df = df.astype(str) with localconverter( rpy2.robjects.default_converter + pandas2ri.converter ): return pandas2ri.py2rpy(df) def safe_rpy2py(r_object) -> pd.DataFrame: """Convert R object to pandas DataFrame safely.""" try: import rpy2 from rpy2.robjects.conversion import localconverter from rpy2.robjects import pandas2ri except: raise ( ImportError( "Unable to initialise R instance. Please run this separately through R with shazam's tutorials." ) ) with localconverter(rpy2.robjects.default_converter + pandas2ri.converter): return pandas2ri.rpy2py(r_object)