DLPFC Gene Cost by SOMDE

In this file, the embedding from DLPFC data will be generated. SOMDE will be used to select the highly variable genes. If you don’t have the data, Please run the 0_somde_robust_scanpy.ipynb to generate the data.

[1]:

import os
import random
import numpy as np
import torch
import scanpy as sc
import pandas as pd

from raftup import _gene_cost_somde


# =========================================================
# User options
# =========================================================
mode = "fast"   # choose from: "reproducible", "fast"
seed = 0

ROOT_DIR = "./data"
SOMDE_DIR = "./results/somde"
OUT_DIR = "./results/gene_cost_matrix"
N_TOP_SV_GENES = 3000

SECTION_PAIRS = [
    ("151508", "151509")
]


# =========================================================
# Setup
# =========================================================
def setup_environment(mode="reproducible", seed=0):
    """
    Configure random seeds and device.

    mode = "reproducible"
        - force CPU
        - enable deterministic algorithms
        - optionally limit thread-level nondeterminism

    mode = "fast"
        - prefer CUDA, then MPS, then CPU
        - allow nondeterministic behavior for speed
    """
    if mode not in {"reproducible", "fast"}:
        raise ValueError("mode must be 'reproducible' or 'fast'")

    # ----- reproducible mode -----
    if mode == "reproducible":
        # Optional: reduce CPU nondeterminism from parallel reductions
        os.environ["OMP_NUM_THREADS"] = "1"
        os.environ["MKL_NUM_THREADS"] = "1"

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        torch.set_num_threads(1)
        torch.use_deterministic_algorithms(True)

        device = torch.device("cpu")

    # ----- fast mode -----
    else:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

        # allow faster execution
        torch.use_deterministic_algorithms(False)

        if device.type == "cuda":
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.deterministic = False

    print(f"Mode: {mode}")
    print(f"Using device: {device}")
    print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS"))
    print("MKL_NUM_THREADS:", os.environ.get("MKL_NUM_THREADS"))
    print("torch num threads:", torch.get_num_threads())
    print("torch interop threads:", torch.get_num_interop_threads())

    return device


def load_DLPFC(root_dir: str, section_id: str):
    """
    Load one DLPFC Visium slice with ground-truth layer labels.
    """
    adata = sc.read_visium(
        path=os.path.join(root_dir, section_id),
        count_file=f"{section_id}_filtered_feature_bc_matrix.h5",
    )
    adata.var_names_make_unique()

    gt_dir = os.path.join(root_dir, section_id, "gt")
    gt_df = pd.read_csv(
        os.path.join(gt_dir, "tissue_positions_list_GTs.txt"),
        sep=",",
        header=None,
        index_col=0,
    )

    adata.obs["original_clusters"] = gt_df.loc[:, 6]
    keep_bcs = adata.obs.dropna().index
    adata = adata[keep_bcs].copy()
    adata.obs["original_clusters"] = (
        adata.obs["original_clusters"].astype(int).astype(str)
    )

    print(f"[load_DLPFC] Loaded section {section_id}")
    return adata


# =========================================================
# Main
# =========================================================
device = setup_environment(mode=mode, seed=seed)
os.makedirs(OUT_DIR, exist_ok=True)

for sid_A, sid_B in SECTION_PAIRS:
    print(f"\n=== Computing gene cost: {sid_A} vs {sid_B} ===")

    sliceA = load_DLPFC(ROOT_DIR, sid_A)
    sliceB = load_DLPFC(ROOT_DIR, sid_B)

    for sl in (sliceA, sliceB):
        sc.pp.normalize_total(sl)
        sc.pp.log1p(sl)

    df_somde_A = pd.read_csv(f"{SOMDE_DIR}/somde_{sid_A}.csv")
    df_somde_B = pd.read_csv(f"{SOMDE_DIR}/somde_{sid_B}.csv")

    sv_genes_A = df_somde_A["g"].values[:N_TOP_SV_GENES]
    sv_genes_B = df_somde_B["g"].values[:N_TOP_SV_GENES]

    sliceA = sliceA[:, sv_genes_A].copy()
    sliceB = sliceB[:, sv_genes_B].copy()

    for sl in (sliceA, sliceB):
        sc.pp.scale(sl)

    _gene_cost_somde.compute_gene_cost(
        sliceA=sliceA,
        sliceB=sliceB,
        section_id_A=sid_A,
        section_id_B=sid_B,
        n_h=100,
        n_epoch=3500,
        lr=2e-4,
        print_step=500,
        seed=seed,
        device=device,
        output_dir=OUT_DIR,
    )

print("\nAll gene cost matrices computed successfully.")

Mode: fast
Using device: mps
OMP_NUM_THREADS: None
MKL_NUM_THREADS: None
torch num threads: 12
torch interop threads: 12

=== Computing gene cost: 151508 vs 151509 ===
[load_DLPFC] Loaded section 151508
[load_DLPFC] Loaded section 151509
[0000] DGI loss = 2.8241
[0500] DGI loss = 0.0010
[1000] DGI loss = 0.0003
[1500] DGI loss = 0.0002
[2000] DGI loss = 0.0001
[2500] DGI loss = 0.0000
[3000] DGI loss = 0.0000
[3499] DGI loss = 0.0001

All gene cost matrices computed successfully.