import warnings
import polars as pl
from typing import Literal
from dandelion.polars.core._core import (
SCHEMA_OVERRIDES,
DandelionPolars,
_sanitize_data_polars,
)
def identical_clones(
    vdj: DandelionPolars,
    method: Literal["nt", "aa"] = "nt",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_key: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Clonal assignment using sequence identity partitioning with Polars.

    https://scoper.readthedocs.io/en/stable/topics/identicalClones/

    This is a wrapper for one of scoper's method to perform clone clustering
    using Polars internally for data manipulation. The data is converted to
    pandas to cross into R and converted back to Polars afterwards.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    method : Literal["nt", "aa"], optional
        one of the "nt" for nucleotide based clustering or "aa" for amino acid
        based clustering.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_key : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping. `None` is
        passed to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes. `None` is
        passed to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is passed to R as `NULL` (see the scoper
        documentation for the meaning of `NULL` here).
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True removes contigs with ambiguous V(D)J assignments (rows whose
        `ambiguous` column is not "F"), when that column exists.
    remove_extra : bool, optional
        if True removes extra contigs flagged by `check_contigs` (rows whose
        `extra` column is not "F"), when that column exists.

    Returns
    -------
    DandelionPolars
        DandelionPolars object with `.clone_id` column populated. This is the
        same object that was passed in: its `_data` is replaced by the scoper
        result and its metadata is refreshed.

    Raises
    ------
    ImportError
        if rpy2 (and therefore the R session) cannot be loaded.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or misconfigured; report all of them uniformly while keeping
        # the underlying cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Silence warnings only for the duration of this call rather than
    # installing a process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        # Optional arguments are expressed as R NULL on the R side.
        fields = NULL if fields is None else fields
        cell_id = NULL if cell_id is None else cell_id
        max_n = NULL if max_n is None else max_n

        db_r = safe_py2rpy(db_pandas)
        results = scp.identicalClones(
            db=db_r,
            method=method,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_key,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: rpy2's NA marker
        # objects are replaced by None so they become proper nulls.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: None if "NACharacter" in type(x).__name__ else x
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_key)
    return vdj
def hierarchical_clones(
    vdj: DandelionPolars,
    threshold: float,
    method: Literal["nt", "aa"] = "nt",
    linkage: Literal["single", "average", "complete"] = "single",
    normalize: Literal["len", "none"] = "len",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_id: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Hierarchical clustering approach to clonal assignment with Polars.

    https://scoper.readthedocs.io/en/stable/topics/hierarchicalClones/

    This is a wrapper for one of scoper's method to perform clone clustering
    using Polars internally for data manipulation. The data is converted to
    pandas to cross into R and converted back to Polars afterwards.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    threshold : float
        numeric scalar where the tree should be cut (the distance threshold
        for clonal grouping).
    method : Literal["nt", "aa"], optional
        one of the "nt" for nucleotide based clustering or "aa" for amino acid
        based clustering.
    linkage : Literal["single", "average", "complete"], optional
        one of the "single", "average" or "complete" for the hierarchical
        clustering method.
    normalize : Literal["len", "none"], optional
        method of normalization.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_id : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping. `None` is
        passed to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes. `None` is
        passed to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is passed to R as `NULL` (see the scoper
        documentation for the meaning of `NULL` here).
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True removes contigs with ambiguous V(D)J assignments (rows whose
        `ambiguous` column is not "F"), when that column exists.
    remove_extra : bool, optional
        if True removes extra contigs flagged by `check_contigs` (rows whose
        `extra` column is not "F"), when that column exists.

    Returns
    -------
    DandelionPolars
        DandelionPolars object with `.clone_id` column populated. This is the
        same object that was passed in: its `_data` is replaced by the scoper
        result and its metadata is refreshed.

    Raises
    ------
    ImportError
        if rpy2 (and therefore the R session) cannot be loaded.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or misconfigured; report all of them uniformly while keeping
        # the underlying cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    # Keep the sanitised frame: it must be the one converted to pandas,
    # otherwise the sanitisation step has no effect on what R receives.
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Silence warnings only for the duration of this call rather than
    # installing a process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        # Optional arguments are expressed as R NULL on the R side.
        fields = NULL if fields is None else fields
        cell_id = NULL if cell_id is None else cell_id
        max_n = NULL if max_n is None else max_n

        db_r = safe_py2rpy(db_pandas)
        results = scp.hierarchicalClones(
            db=db_r,
            threshold=threshold,
            method=method,
            linkage=linkage,
            normalize=normalize,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_id,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: rpy2's NA marker
        # objects are replaced by None so they become proper nulls.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: None if "NACharacter" in type(x).__name__ else x
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_id)
    return vdj
def spectral_clones(
    vdj: DandelionPolars,
    method: Literal["novj", "vj"] = "novj",
    germline: str = "germline_alignment",
    sequence: str = "sequence_alignment",
    junction: str = "junction",
    v_call: str = "v_call",
    j_call: str = "j_call",
    clone_id: str = "clone_id",
    fields: list[str] | None = None,
    cell_id: str | None = "cell_id",
    locus: str = "locus",
    only_heavy: bool = True,
    split_light: bool = True,
    first: bool = False,
    cdr3: bool = False,
    mod3: bool = False,
    max_n: int | None = 0,
    threshold: float | None = None,
    base_sim: float = 0.95,
    iter_max: int = 1000,
    nstart: int = 1000,
    nproc: int = 1,
    verbose: bool = False,
    summarize_clones: bool = True,
    remove_ambiguous: bool = True,
    remove_extra: bool = True,
) -> DandelionPolars:
    """
    Spectral clustering method for clonal partitioning with Polars.

    https://scoper.readthedocs.io/en/stable/topics/spectralClones/

    This is a wrapper for one of scoper's method to perform clone clustering
    using Polars internally for data manipulation. The data is converted to
    pandas to cross into R and converted back to Polars afterwards. The
    `targeting_model` and `len_limit` arguments of the R function are always
    passed as `NULL`.

    Parameters
    ----------
    vdj : DandelionPolars
        a DandelionPolars object containing the airr data.
    method : Literal["novj", "vj"], optional
        one of the "novj" or "vj".
    germline : str, optional
        character name of the column containing the germline or reference
        sequence.
    sequence : str, optional
        character name of the column containing input sequences.
    junction : str, optional
        character name of the column containing junction sequences.
    v_call : str, optional
        name of the column containing the V-segment allele calls.
    j_call : str, optional
        name of the column containing the J-segment allele calls.
    clone_id : str, optional
        output column name containing the clonal cluster identifiers.
    fields : list[str], optional
        character vector of additional columns to use for grouping. `None` is
        passed to R as `NULL`.
    cell_id : str | None, optional
        name of the column containing cell identifiers or barcodes. `None` is
        passed to R as `NULL`.
    locus : str, optional
        name of the column containing locus information.
    only_heavy : bool, optional
        use only the IGH (BCR) or TRB/TRD (TCR) sequences for grouping.
    split_light : bool, optional
        split clones by light chains.
    first : bool, optional
        specifies how to handle multiple V(D)J assignments for initial grouping.
    cdr3 : bool, optional
        if True removes 3 nucleotides from both ends of "junction" prior to
        clustering.
    mod3 : bool, optional
        if True removes records with a junction length that is not divisible
        by 3.
    max_n : int | None, optional
        The maximum number of degenerate characters to permit in the junction
        sequence. `None` is passed to R as `NULL` (see the scoper
        documentation for the meaning of `NULL` here).
    threshold : float | None, optional
        the supervising cut-off to enforce an upper-limit distance for clonal
        grouping. `None` is passed to R as `NULL`.
    base_sim : float, optional
        required similarity cut-off for sequences in equal distances from each
        other.
    iter_max : int, optional
        the maximum number of iterations allowed for kmean clustering step.
    nstart : int, optional
        the number of random sets chosen for kmean clustering initialization.
    nproc : int, optional
        number of cores to distribute the function over.
    verbose : bool, optional
        if True prints out a summary of each step cloning process.
    summarize_clones : bool, optional
        if True performs a series of analysis to assess the clonal landscape.
    remove_ambiguous : bool, optional
        if True removes contigs with ambiguous V(D)J assignments (rows whose
        `ambiguous` column is not "F"), when that column exists.
    remove_extra : bool, optional
        if True removes extra contigs flagged by `check_contigs` (rows whose
        `extra` column is not "F"), when that column exists.

    Returns
    -------
    DandelionPolars
        DandelionPolars object with `.clone_id` column populated. This is the
        same object that was passed in: its `_data` is replaced by the scoper
        result and its metadata is refreshed.

    Raises
    ------
    ImportError
        if rpy2 (and therefore the R session) cannot be loaded.
    """
    try:
        from rpy2.robjects.packages import importr
        from rpy2.rinterface import NULL
        from rpy2.robjects import r
    except Exception as err:
        # rpy2 may fail with errors other than ImportError when R itself is
        # missing or misconfigured; report all of them uniformly while keeping
        # the underlying cause attached.
        raise ImportError(
            "Unable to initialise R instance. Please run this separately through R with scoper's tutorials."
        ) from err
    from dandelion.external.immcantation.base.scoper import (
        safe_py2rpy,
        safe_rpy2py,
    )

    scp = importr("scoper")

    # Convert to pandas for R interop, then back to polars
    db = (
        vdj._data.collect(engine="streaming")
        if isinstance(vdj._data, pl.LazyFrame)
        else vdj._data
    )
    db = _sanitize_data_polars(db)
    db_pandas = db.to_pandas()

    # Silence warnings only for the duration of this call rather than
    # installing a process-wide "ignore" filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        if remove_ambiguous and "ambiguous" in db_pandas:
            db_pandas = db_pandas[db_pandas["ambiguous"] == "F"].copy()
        if remove_extra and "extra" in db_pandas:
            db_pandas = db_pandas[db_pandas["extra"] == "F"].copy()

        # Optional arguments are expressed as R NULL on the R side.
        fields = NULL if fields is None else fields
        cell_id = NULL if cell_id is None else cell_id
        max_n = NULL if max_n is None else max_n
        threshold = NULL if threshold is None else threshold

        db_r = safe_py2rpy(db_pandas)
        results = scp.spectralClones(
            db=db_r,
            method=method,
            germline=germline,
            sequence=sequence,
            junction=junction,
            v_call=v_call,
            j_call=j_call,
            clone=clone_id,
            fields=fields,
            cell_id=cell_id,
            locus=locus,
            only_heavy=only_heavy,
            split_light=split_light,
            targeting_model=NULL,
            len_limit=NULL,
            first=first,
            cdr3=cdr3,
            mod3=mod3,
            max_n=max_n,
            threshold=threshold,
            base_sim=base_sim,
            iter_max=iter_max,
            nstart=nstart,
            nproc=nproc,
            verbose=verbose,
            summarize_clones=summarize_clones,
        )
        results_dataframe = r["as.data.frame"](results)
        df = safe_rpy2py(results_dataframe)

        # Clean NA_character_ before converting to polars: rpy2's NA marker
        # objects are replaced by None so they become proper nulls.
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].apply(
                    lambda x: None if "NACharacter" in type(x).__name__ else x
                )

        # Convert back to polars
        df_polars = pl.from_pandas(df, schema_overrides=SCHEMA_OVERRIDES)
        vdj._data = df_polars
        vdj.update_metadata(clone_key=clone_id)
    return vdj