# Source code for dandelion.tutorial._tutorial

from importlib.resources import path
import subprocess

from pathlib import Path
from urllib.request import Request, urlopen


def _download_file(url: str, dest: Path | str, chunk_size: int = 8192):
    """Download a file using urllib.

    Parameters
    ----------
    url : str
        URL of the file to download.
    dest : Path | str
        Destination file path to write the downloaded content.
    chunk_size : int, optional
        Number of bytes to read per chunk. Defaults to 8192.
    """
    req = Request(
        url, headers={"User-Agent": "Mozilla/5.0 (compatible; Python urllib)"}
    )
    with urlopen(req) as response, open(dest, "wb") as out_file:
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            out_file.write(chunk)



# [docs]
def setup_dandelion_tutorial_bcr(path: Path | str | None = None) -> None:
    """Download example BCR datasets for Dandelion tutorial.

    Downloads 10x Genomics PBMC BCR datasets into a local directory for use
    in the dandelion BCR preprocessing tutorial.

    Parameters
    ----------
    path : Path | str | None, optional
        Root directory to download datasets into.
        Defaults to ``./dandelion_tutorial``.
    """
    base = Path("./dandelion_tutorial") if path is None else Path(path)
    base.mkdir(parents=True, exist_ok=True)

    datasets = {
        "vdj_v1_hs_pbmc3_b": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_v1_hs_pbmc3/vdj_v1_hs_pbmc3_filtered_feature_bc_matrix.h5",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_v1_hs_pbmc3/vdj_v1_hs_pbmc3_b_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_v1_hs_pbmc3/vdj_v1_hs_pbmc3_b_filtered_contig.fasta",
        },
        "vdj_nextgem_hs_pbmc3_b": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_nextgem_hs_pbmc3/vdj_nextgem_hs_pbmc3_filtered_feature_bc_matrix.h5",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_nextgem_hs_pbmc3/vdj_nextgem_hs_pbmc3_b_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/3.1.0/vdj_nextgem_hs_pbmc3/vdj_nextgem_hs_pbmc3_b_filtered_contig.fasta",
        },
        "sc5p_v2_hs_PBMC_10k_b": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_filtered_feature_bc_matrix.h5",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_b_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_b_filtered_contig.fasta",
            "airr_rearrangement.tsv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_b_airr_rearrangement.tsv",
        },
        "sc5p_v2_hs_PBMC_1k_b": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_1k/sc5p_v2_hs_PBMC_1k_filtered_feature_bc_matrix.h5",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_1k/sc5p_v2_hs_PBMC_1k_b_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_1k/sc5p_v2_hs_PBMC_1k_b_filtered_contig.fasta",
        },
    }

    for dirname, files in datasets.items():
        dirpath = base / dirname
        dirpath.mkdir(parents=True, exist_ok=True)

        for filename, url in files.items():
            outfile = dirpath / filename
            if outfile.exists():
                continue
            print(f"Downloading {filename} → {outfile}")
            _download_file(url, outfile)




# [docs]
def setup_dandelion_tutorial_tcr(path: Path | str | None = None) -> None:
    """Download example TCR datasets for Dandelion tutorial.

    Downloads 10x Genomics PBMC and melanoma TCR datasets into a local
    directory for use in the dandelion TCR preprocessing tutorial.

    Parameters
    ----------
    path : Path | str | None, optional
        Root directory to download datasets into.
        Defaults to ``./dandelion_tutorial``.
    """
    base = Path("./dandelion_tutorial") if path is None else Path(path)
    base.mkdir(parents=True, exist_ok=True)

    datasets = {
        "sc5p_v2_hs_PBMC_10k_t": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_filtered_feature_bc_matrix.h5",
            "airr_rearrangement.tsv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_t_airr_rearrangement.tsv",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_t_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_t_filtered_contig.fasta",
        },
        "sc5p_v1p1_hs_melanoma_10k_t": {
            "filtered_feature_bc_matrix.h5": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v1p1_hs_melanoma_10k/sc5p_v1p1_hs_melanoma_10k_filtered_feature_bc_matrix.h5",
            "airr_rearrangement.tsv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v1p1_hs_melanoma_10k/sc5p_v1p1_hs_melanoma_10k_t_airr_rearrangement.tsv",
            "filtered_contig_annotations.csv": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v1p1_hs_melanoma_10k/sc5p_v1p1_hs_melanoma_10k_t_filtered_contig_annotations.csv",
            "filtered_contig.fasta": "https://cf.10xgenomics.com/samples/cell-vdj/4.0.0/sc5p_v1p1_hs_melanoma_10k/sc5p_v1p1_hs_melanoma_10k_t_filtered_contig.fasta",
        },
    }

    for dirname, files in datasets.items():
        dirpath = base / dirname
        dirpath.mkdir(parents=True, exist_ok=True)

        for filename, url in files.items():
            outfile = dirpath / filename
            if outfile.exists():
                continue
            print(f"Downloading {filename} → {outfile}")
            _download_file(url, outfile)




# [docs]
def setup_dandelion_tutorial_trajectory(path: Path | str | None = None) -> None:
    """Download example datasets for Dandelion V(D)J trajectory tutorial.

    Downloads panfetal B-cell trajectory GEX and VDJ data from Google Drive
    using ``gdown``.

    Parameters
    ----------
    path : Path | str | None, optional
        Root directory to download datasets into.
        Defaults to ``./dandelion_tutorial``.

    Raises
    ------
    ImportError
        If ``gdown`` is not installed.
    """
    try:
        import gdown
    except ImportError:
        raise ImportError(
            "gdown is required to download the trajectory tutorial data. Please install it via `pip install gdown`."
        )
    base = Path("./dandelion_tutorial") if path is None else Path(path)
    base.mkdir(parents=True, exist_ok=True)

    gex_id = "1-LbAinwhAhJW3Y60wpO9GWJJcaMa_liy"
    vdj_id = "1lyScJWdGopW2nLoIhZmfUGVSWLWI_qWg"
    datasets = {
        "panfetal_trajectory": {
            "demo-pseudobulk.h5ad": f"https://drive.google.com/uc?id={gex_id}",
            "demo-vdj-traj.tsv.gz": f"https://drive.google.com/uc?id={vdj_id}",
        }
    }
    for dirname, files in datasets.items():
        dirpath = base / dirname
        dirpath.mkdir(parents=True, exist_ok=True)
        for filename, url in files.items():
            outfile = dirpath / filename
            if outfile.exists():
                continue
            print(f"Downloading {filename} → {outfile}")
            gdown.download(url, str(outfile), quiet=False)



def setup_dandelion_tutorial_simple(path: Path | str | None = None) -> None:
    """Download example datasets for Dandelion simple tutorial.

    Downloads a small demo dataset with GEX and BCR data from Google Drive
    using ``gdown``.

    Parameters
    ----------
    path : Path | str | None, optional
        Root directory to download datasets into.
        Defaults to ``./dandelion_tutorial``.

    Raises
    ------
    ImportError
        If ``gdown`` is not installed.
    """
    try:
        import gdown
    except ImportError:
        raise ImportError(
            "gdown is required to download the simple tutorial data. Please install it via `pip install gdown`."
        )
    base = Path("./dandelion_tutorial") if path is None else Path(path)
    base.mkdir(parents=True, exist_ok=True)

    gex_id = "1-PrwDi1Py8jqioNtP0DISKrcShRRHKxk"
    vdj_id = "1-d_uah-NzJqDYRP53ICgAAquiVLWRRtN"
    datasets = {
        "simple_demo": {
            "demo-gex.h5ad": f"https://drive.google.com/uc?id={gex_id}",
            "demo-vdj.h5ddl": f"https://drive.google.com/uc?id={vdj_id}",
        }
    }
    for dirname, files in datasets.items():
        dirpath = base / dirname
        dirpath.mkdir(parents=True, exist_ok=True)
        for filename, url in files.items():
            outfile = dirpath / filename
            if outfile.exists():
                continue
            print(f"Downloading {filename} → {outfile}")
            gdown.download(url, str(outfile), quiet=False)



# [docs]
def setup_dandelion_tutorial_parse(path: Path | str | None = None) -> None:
    """Download the extremely large dataset from Parse Biosciences for Dandelion tutorial.

    Downloads the Parse Biosciences 1M human BCR dataset (cell metadata CSV
    and AIRR rearrangement TSV) into a local directory.

    Parameters
    ----------
    path : Path | str | None, optional
        Root directory to download datasets into.
        Defaults to ``./dandelion_tutorial``.
    """
    base = Path("./dandelion_tutorial") if path is None else Path(path)
    base.mkdir(parents=True, exist_ok=True)
    datasets = {
        "human-bcr-1m": {
            "cell_metadata.csv": "https://cdn.parsebiosciences.com/bcr/human-bcr-1m/cell_metadata.csv",
            "bcr_annotation_airr.tsv": "https://cdn.parsebiosciences.com/bcr/human-bcr-1m/bcr_annotation_airr.tsv",
        }
    }

    for dirname, files in datasets.items():
        dirpath = base / dirname
        dirpath.mkdir(parents=True, exist_ok=True)
        for filename, url in files.items():
            outfile = dirpath / filename
            if outfile.exists():
                continue
            print(f"Downloading {filename} → {outfile}")
            _download_file(url, outfile)



def setup_colab_singularity() -> None:  # pragma: no cover
    """Install and configure Apptainer/Singularity in a Google Colab environment.

    Runs a bash script that installs ``apptainer-suid`` (adding the
    ``ppa:apptainer/ppa`` repository when no apt source mentions apptainer),
    replaces ``/usr/bin/singularity`` with a small wrapper that runs
    ``unshare -r apptainer "$@"``, and registers and selects the
    ``SylabsCloud`` remote (``cloud.sylabs.io``).

    Any pre-existing ``/usr/bin/singularity`` is moved to
    ``/usr/bin/singularity_backup`` the first time the function runs. Later
    runs keep that backup instead of replacing it with the wrapper installed
    by the previous run, so the function is safe to re-run.

    Raises
    ------
    subprocess.CalledProcessError
        If any step of the installation script exits with a non-zero status.
    """

    # Raw string: the ``\n`` escapes below are meant for printf, not Python.
    bash_script = r"""
set -e

echo "Installing Apptainer..."
sudo apt update -qq
sudo apt install -y -qq software-properties-common

if ! grep -q apptainer /etc/apt/sources.list /etc/apt/sources.list.d/* 2>/dev/null; then
    sudo add-apt-repository -y ppa:apptainer/ppa
    sudo apt update -qq
fi

sudo apt install -y -qq apptainer-suid

echo "Configuring fakeroot..."
sudo apptainer config fakeroot --add root || true

echo "Creating singularity wrapper..."
# The shebang lets the wrapper be exec'd directly, not only from a shell.
printf '#!/bin/sh\nunshare -r apptainer "$@"\n' | sudo tee /usr/bin/singularity_test > /dev/null
sudo chmod +x /usr/bin/singularity_test

# Keep the first backup: on a re-run /usr/bin/singularity is already the wrapper.
if [ ! -e /usr/bin/singularity_backup ] && [ ! -L /usr/bin/singularity_backup ]; then
    sudo mv /usr/bin/singularity /usr/bin/singularity_backup 2>/dev/null || true
fi
sudo mv /usr/bin/singularity_test /usr/bin/singularity

echo "Adding Sylabs remote..."
apptainer remote add --no-login SylabsCloud cloud.sylabs.io || true
apptainer remote use SylabsCloud

echo "Done."
"""

    # check=True turns a failing script (it runs under ``set -e``) into
    # CalledProcessError.
    subprocess.run(["bash", "-c", bash_script], check=True)
    print("\n✅ Singularity / Apptainer ready!")