Source code for dandelion.external.immcantation.polars.scoper

import warnings

import polars as pl

from typing import Literal

from dandelion.polars.core._core import (
    SCHEMA_OVERRIDES,
    DandelionPolars,
    _sanitize_data_polars,
)


def identical_clones(
    vdj: DandelionPolars,
    method: Literal["nt", "aa"] = "nt",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_key: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Clonal assignment using sequence identity partitioning with Polars.

    https://scoper.readthedocs.io/en/stable/topics/identicalClones/

    This is a wrapper for one of scoper's methods to perform clone clustering.
    The data is collected from Polars, converted to pandas for the R round
    trip, and converted back to Polars afterwards.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    method : Literal["nt", "aa"], optional
        one of the "nt" for nucleotide based clustering or "aa" for amino acid
        based clustering.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_key : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping.
        `None` is forwarded to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes.
        `None` is forwarded to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial
        grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is forwarded to R as `NULL`.
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True and an "ambiguous" column exists, keeps only rows where it
        equals "F".
    remove_extra : bool, optional
        if True and an "extra" column exists, keeps only rows where it
        equals "F".

    Returns
    -------
    DandelionPolars
        The same `vdj` object, modified in place: its data is replaced by the
        scoper result (rows filtered out above are not carried over) and its
        metadata is refreshed with `clone_key`.

    Raises
    ------
    ImportError
        if rpy2 cannot be imported / the R instance cannot be initialised.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or broken, so stay broad but let KeyboardInterrupt and
        # SystemExit propagate, and keep the original cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Python None has to become R NULL for the optional scoper arguments.
    fields = NULL if fields is None else fields
    cell_id = NULL if cell_id is None else cell_id
    max_n = NULL if max_n is None else max_n

    # Silence warnings only for the duration of this call rather than
    # installing a permanent, process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        db_r = safe_py2rpy(db_pandas)

        results = scp.identicalClones(
            db=db_r,
            method=method,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_key,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: replace rpy2's NA
        # sentinel objects (matched by class name) with None.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: (
                        None if "NACharacter" in type(x).__name__ else x
                    )
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_key)

    return vdj
def hierarchical_clones(
    vdj: DandelionPolars,
    threshold: float,
    method: Literal["nt", "aa"] = "nt",
    linkage: Literal["single", "average", "complete"] = "single",
    normalize: Literal["len", "none"] = "len",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_id: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Hierarchical clustering approach to clonal assignment with Polars.

    https://scoper.readthedocs.io/en/stable/topics/hierarchicalClones/

    This is a wrapper for one of scoper's methods to perform clone clustering.
    The data is collected from Polars, converted to pandas for the R round
    trip, and converted back to Polars afterwards.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    threshold : float
        numeric scalar where the tree should be cut (the distance threshold
        for clonal grouping).
    method : Literal["nt", "aa"], optional
        one of the "nt" for nucleotide based clustering or "aa" for amino acid
        based clustering.
    linkage : Literal["single", "average", "complete"], optional
        one of the "single", "average" or "complete" for the hierarchical
        clustering method.
    normalize : Literal["len", "none"], optional
        method of normalization.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_id : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping.
        `None` is forwarded to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes.
        `None` is forwarded to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial
        grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is forwarded to R as `NULL`.
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True and an "ambiguous" column exists, keeps only rows where it
        equals "F".
    remove_extra : bool, optional
        if True and an "extra" column exists, keeps only rows where it
        equals "F".

    Returns
    -------
    DandelionPolars
        The same `vdj` object, modified in place: its data is replaced by the
        scoper result (rows filtered out above are not carried over) and its
        metadata is refreshed with `clone_id` as the clone key.

    Raises
    ------
    ImportError
        if rpy2 cannot be imported / the R instance cannot be initialised.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or broken, so stay broad but let KeyboardInterrupt and
        # SystemExit propagate, and keep the original cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    # Keep the sanitised frame: it must be the one converted to pandas,
    # otherwise the sanitisation step has no effect on what R receives.
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Python None has to become R NULL for the optional scoper arguments.
    fields = NULL if fields is None else fields
    cell_id = NULL if cell_id is None else cell_id
    max_n = NULL if max_n is None else max_n

    # Silence warnings only for the duration of this call rather than
    # installing a permanent, process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        db_r = safe_py2rpy(db_pandas)

        results = scp.hierarchicalClones(
            db=db_r,
            threshold=threshold,
            method=method,
            linkage=linkage,
            normalize=normalize,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_id,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: replace rpy2's NA
        # sentinel objects (matched by class name) with None.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: (
                        None if "NACharacter" in type(x).__name__ else x
                    )
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_id)

    return vdj
def spectral_clones(
    vdj: DandelionPolars,
    method: Literal["novj", "vj"] = "novj",
    germline: str = "germline_alignment",
    sequence: str = "sequence_alignment",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_id: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    threshold: float | None = None,
    base_sim: float = 0.95,
    iter_max: int = 1000,
    nstart: int = 1000,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Spectral clustering method for clonal partitioning with Polars.

    https://scoper.readthedocs.io/en/stable/topics/spectralClones/

    This is a wrapper for one of scoper's methods to perform clone clustering.
    The data is collected from Polars, converted to pandas for the R round
    trip, and converted back to Polars afterwards. `targeting_model` and
    `len_limit` are always passed to scoper as `NULL`.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    method : Literal["novj", "vj"], optional
        one of the "novj" or "vj".
    germline : str, optional
        character name of the column containing the germline or reference
        sequence.
    sequence : str, optional
        character name of the column containing input sequences.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_id : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping.
        `None` is forwarded to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes.
        `None` is forwarded to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial
        grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is forwarded to R as `NULL`.
    threshold : float | None, optional
        the supervising cut-off to enforce an upper-limit distance for clonal
        grouping. `None` is forwarded to R as `NULL`.
    base_sim : float, optional
        required similarity cut-off for sequences in equal distances from each
        other.
    iter_max : int, optional
        the maximum number of iterations allowed for the k-means clustering
        step.
    nstart : int, optional
        the number of random sets chosen for k-means clustering
        initialization.
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True and an "ambiguous" column exists, keeps only rows where it
        equals "F".
    remove_extra : bool, optional
        if True and an "extra" column exists, keeps only rows where it
        equals "F".

    Returns
    -------
    DandelionPolars
        The same `vdj` object, modified in place: its data is replaced by the
        scoper result (rows filtered out above are not carried over) and its
        metadata is refreshed with `clone_id` as the clone key.

    Raises
    ------
    ImportError
        if rpy2 cannot be imported / the R instance cannot be initialised.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or broken, so stay broad but let KeyboardInterrupt and
        # SystemExit propagate, and keep the original cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Python None has to become R NULL for the optional scoper arguments.
    fields = NULL if fields is None else fields
    cell_id = NULL if cell_id is None else cell_id
    threshold = NULL if threshold is None else threshold
    max_n = NULL if max_n is None else max_n

    # Silence warnings only for the duration of this call rather than
    # installing a permanent, process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        db_r = safe_py2rpy(db_pandas)

        results = scp.spectralClones(
            db=db_r,
            method=method,
            germline=germline,
            sequence=sequence,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_id,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            targeting_model=NULL,
            len_limit=NULL,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            threshold=threshold,
            base_sim=base_sim,
            iter_max=iter_max,
            nstart=nstart,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: replace rpy2's NA
        # sentinel objects (matched by class name) with None.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: (
                        None if "NACharacter" in type(x).__name__ else x
                    )
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_id)

    return vdj