Source code for dandelion.external.immcantation.changeo

from pathlib import Path
from scanpy import logging as logg
from subprocess import run
from typing import Literal

from dandelion.utilities._utilities import (
    set_germline_env,
    set_igblast_env,
    NO_DS,
)


[docs] def assigngenes_igblast( fasta: Path | str, igblast_db: Path | str | None = None, org: Literal["human", "mouse"] = "human", loci: Literal["ig", "tr"] = "ig", additional_args: list[str] = [], ): """ Reannotate with IgBLASTn. Parameters ---------- fasta : Path | str path to fasta file for reannotation. igblast_db : Path | str | None, optional path to igblast database. org : Literal["human", "mouse"], optional organism for germline sequences. loci : Literal["ig", "tr"], optional `ig` or `tr` mode for running igblastn. additional_args : list[str], optional Additional arguments to pass to `AssignGenes.py`. """ env, igdb, fasta = set_igblast_env(igblast_db=igblast_db, input_file=fasta) outfolder = fasta.parent / "tmp" outfolder.mkdir(parents=True, exist_ok=True) informat_dict = {"blast": "_igblast.fmt7", "airr": "_igblast.tsv"} for fileformat in ["blast", "airr"]: outfile = fasta.stem + informat_dict[fileformat] cmd = [ "AssignGenes.py", "igblast", "-s", str(fasta), "-b", str(igdb), "--organism", org, "--loci", loci, "--format", fileformat, "-o", str(outfolder / outfile), ] cmd += additional_args logg.info("Running command: %s\n" % (" ".join(cmd))) run(cmd, env=env) # logs are printed to terminal
[docs] def makedb_igblast( fasta: Path | str, igblast_output: Path | str | None = None, germline: str | None = None, org: Literal["human", "mouse"] = "human", db: Literal["imgt", "ogrdb"] = "imgt", extended: bool = True, additional_args: list[str] = [], loci: Literal["ig", "tr"] = "ig", ): """ Parse IgBLAST output to AIRR format. Parameters ---------- fasta : Path | str path to fasta file used for reannotation. igblast_output : Path | str | None, optional path to igblast output file. germline : str | None, optional path to germline database. org : Literal["human", "mouse"], optional organism of germline sequences. db : Literal["imgt", "ogrdb"], optional `imgt` or `ogrdb` reference database for running igblastn. extended : bool, optional whether or not to parse extended 10x annotations. additional_args: list[str], optional Additional arguments to pass to `MakeDb.py`. """ env, gml, fasta = set_germline_env( germline=germline, org=org, input_file=fasta, db=db, ) if igblast_output is None: indir = fasta.parent / "tmp" infile = fasta.stem + "_igblast.fmt7" igbo = indir / infile else: igbo = Path(igblast_output) cellranger_annotation = fasta.parent / (fasta.stem + "_annotations.csv") if (org == "mouse") and (loci.lower() == "tr"): cmd = [ "MakeDb_gentle.py", "igblast", "-i", str(igbo), "-s", str(fasta), "-r", str(gml), "--10x", str(cellranger_annotation), ] else: cmd = [ "MakeDb.py", "igblast", "-i", str(igbo), "-s", str(fasta), "-r", str(gml), "--10x", str(cellranger_annotation), ] if extended: cmd = cmd + ["--extended"] for add_cmd in [[], ["--failed"]]: cmd = cmd + add_cmd + additional_args logg.info("Running command: %s\n" % (" ".join(cmd))) run(cmd, env=env) # logs are printed to terminal
[docs] def parsedb_heavy(airr_file: Path | str): """ Parse AIRR tsv file (heavy chain contigs only). Parameters ---------- airr_file : Path | str path to AIRR tsv file. """ outname = Path(airr_file).stem + "_heavy" cmd = [ "ParseDb.py", "select", "-d", str(airr_file), "-f", "locus", "-u", "IGH", "--logic", "all", "--regex", "--outname", outname, ] logg.info("Running command: %s\n" % (" ".join(cmd))) run(cmd) # logs are printed to terminal
[docs] def parsedb_light(airr_file: Path | str): """ Parse AIRR tsv file (light chain contigs only). Parameters ---------- airr_file : Path | str path to AIRR tsv file. """ outname = Path(airr_file).stem + "_light" cmd = [ "ParseDb.py", "select", "-d", str(airr_file), "-f", "locus", "-u", "IG[LK]", "--logic", "all", "--regex", "--outname", outname, ] logg.info("Running command: %s\n" % (" ".join(cmd))) run(cmd) # logs are printed to terminal
[docs] def creategermlines( airr_file: Path | str, germline: list[str] | None = None, org: Literal["human", "mouse"] = "human", genotyped_fasta: str | None = None, mode: Literal["heavy", "light"] | None = None, db: Literal["imgt", "ogrdb"] = "imgt", strain: ( Literal[ "c57bl6", "balbc", "129S1_SvImJ", "AKR_J", "A_J", "BALB_c_ByJ", "BALB_c", "C3H_HeJ", "C57BL_6J", "C57BL_6", "CAST_EiJ", "CBA_J", "DBA_1J", "DBA_2J", "LEWES_EiJ", "MRL_MpJ", "MSM_MsJ", "NOD_ShiLtJ", "NOR_LtJ", "NZB_BlNJ", "PWD_PhJ", "SJL_J", ] | None ) = None, additional_args: list[str] = [], ): """ Wrapper for CreateGermlines.py for reconstructing germline sequences. Parameters ---------- airr_file : Path | str path to AIRR tsv file. germline : list[str] | None, optional location to germline fasta files as a list. org : Literal["human", "mouse"], optional organism for germline sequences. genotyped_fasta : str | None, optional location to V genotyped fasta file. mode : Literal["heavy", "light"] | None, optional whether to run on heavy or light mode. If left as None, heavy and light will be run together. db : Literal["imgt", "ogrdb"], optional `imgt` or `ogrdb` reference database. strain : Literal["c57bl6", "balbc", "129S1_SvImJ", "AKR_J", "A_J", "BALB_c_ByJ", "BALB_c", "C3H_HeJ", "C57BL_6J", "C57BL_6", "CAST_EiJ", "CBA_J", "DBA_1J", "DBA_2J", "LEWES_EiJ", "MRL_MpJ", "MSM_MsJ", "NOD_ShiLtJ", "NOR_LtJ", "NZB_BlNJ", "PWD_PhJ", "SJL_J"] | None, optional strain of mouse to use for germline sequences. Only for `db="ogrdb"`. Note that only "c57bl6", "balbc", "CAST_EiJ", "LEWES_EiJ", "MSM_MsJ", "NOD_ShiLt_J" and "PWD_PhJ" contains both heavy chain and light chain germline sequences as a set. The rest will not allow igblastn and MakeDB.py to generate a successful airr table (check the failed file). "c57bl6" and "balbc" are merged databases of "C57BL_6" with "C57BL_6J" and "BALB_c" with "BALB_c_ByJ" respectively. None defaults to all combined. additional_args : list[str], optional Additional arguments to pass to `CreateGermlines.py`. """ env, gml, airr_file = set_germline_env( germline=germline, org=org, input_file=airr_file, db=db, ) _strain = "_" + strain if strain is not None else "" if germline is None: if mode == "heavy": if genotyped_fasta is None: gml_ref = [ str(gml / ("imgt_" + org + _strain + "_IGHV.fasta")), ] else: gml_ref = [ str(genotyped_fasta), ] gml_ref += [ str(gml / ("imgt_" + org + _strain + "_IGHJ.fasta")), ] _strainD = "" if strain not in NO_DS else _strain gml_ref += [ str(gml / ("imgt_" + org + _strainD + "_IGHD.fasta")), ] elif mode == "light": gml_ref = [ str(gml / ("imgt_" + org + _strain + "_IGKV.fasta")), str(gml / ("imgt_" + org + _strain + "_IGKJ.fasta")), str(gml / ("imgt_" + org + _strain + "_IGLV.fasta")), str(gml / ("imgt_" + org + _strain + "_IGLJ.fasta")), ] elif mode is None: if genotyped_fasta is None: gml_ref = [ str(gml / ("imgt_" + org + _strain + "_IGHV.fasta")), ] else: gml_ref = [ str(genotyped_fasta), ] gml_ref += [ str(gml / ("imgt_" + org + _strain + "_IGHJ.fasta")), ] _strainD = "" if strain not in NO_DS else _strain gml_ref += [ str(gml / ("imgt_" + org + _strainD + "_IGHD.fasta")), ] gml_ref += [ str(gml / ("imgt_" + org + _strain + "_IGKV.fasta")), str(gml / ("imgt_" + org + _strain + "_IGKJ.fasta")), str(gml / ("imgt_" + org + _strain + "_IGLV.fasta")), str(gml / ("imgt_" + org + _strain + "_IGLJ.fasta")), ] else: if not isinstance(germline, list): germline = [str(germline)] gml_ref = germline cmd = [ "CreateGermlines.py", "-d", str(airr_file), "-r", ] cmd = cmd + gml_ref + additional_args logg.info("Running command: %s\n" % (" ".join(cmd))) run(cmd, env=env) # logs are printed to terminal