{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# SOMDE Robust" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "=== SOMDE for 151508 ===\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/dj/6qv7h1l52bvgckcytvzh5ws00000gn/T/ipykernel_99872/1163262007.py:45: FutureWarning: Use `squidpy.read.visium` instead.\n", " ad = sc.read_visium(\n", "/opt/anaconda3/envs/raftup/lib/python3.10/site-packages/anndata/_core/anndata.py:1832: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "151508 loading done\n", "[SOMDE] Processing slice 151508\n", "[SOMDE] 151508: total genes before filtering = 33538\n", "[SOMDE] 151508: after min_cells>=50 = 11450\n", "[SOMDE] 151508: after min_counts>=50 = 11450\n", "[SOMDE] 151508: keeping 11450 genes, dropping 22088 genes\n", "using 29*29 SOM nodes for 4381 points\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Models: 0%| | 0/10 [00:00 is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n", " model_results = model_results[model_results.groupby(['g'])['max_ll'].transform(max) == model_results['max_ll']]\n", "/opt/anaconda3/envs/raftup/lib/python3.10/site-packages/somde/util.py:110: FutureWarning: Series.ravel is deprecated. The underlying array is already 1D, so ravel is not necessary. Use `to_numpy()` for conversion to a numpy array instead.\n", " pv = pv.ravel() # flattens the array in place, more efficient than flatten()\n", "/var/folders/dj/6qv7h1l52bvgckcytvzh5ws00000gn/T/ipykernel_99872/1163262007.py:45: FutureWarning: Use `squidpy.read.visium` instead.\n", " ad = sc.read_visium(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[SOMDE] Saved result to ./results/somde/somde_151508.csv\n", "[SOMDE] Saved filter stats to ./results/somde/somde_151508_filter_stats.csv\n", "[SOMDE] Saved filtered-out genes to ./results/somde/somde_151508_filtered_out_genes.csv\n", "151508 somde done\n", "\n", "=== SOMDE for 151509 ===\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/anaconda3/envs/raftup/lib/python3.10/site-packages/anndata/_core/anndata.py:1832: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n", " utils.warn_names_duplicates(\"var\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "151509 loading done\n", "[SOMDE] Processing slice 151509\n", "[SOMDE] 151509: total genes before filtering = 33538\n", "[SOMDE] 151509: after min_cells>=50 = 12407\n", "[SOMDE] 151509: after min_counts>=50 = 12407\n", "[SOMDE] 151509: keeping 12407 genes, dropping 21131 genes\n", "using 30*30 SOM nodes for 4788 points\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Models: 0%| | 0/10 [00:00 is currently using SeriesGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"max\" instead.\n", " model_results = model_results[model_results.groupby(['g'])['max_ll'].transform(max) == model_results['max_ll']]\n", "/opt/anaconda3/envs/raftup/lib/python3.10/site-packages/somde/util.py:110: FutureWarning: Series.ravel is deprecated. The underlying array is already 1D, so ravel is not necessary. Use `to_numpy()` for conversion to a numpy array instead.\n", " pv = pv.ravel() # flattens the array in place, more efficient than flatten()\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[SOMDE] Saved result to ./results/somde/somde_151509.csv\n", "[SOMDE] Saved filter stats to ./results/somde/somde_151509_filter_stats.csv\n", "[SOMDE] Saved filtered-out genes to ./results/somde/somde_151509_filtered_out_genes.csv\n", "151509 somde done\n", "\n", "All SOMDE computations finished.\n" ] } ], "source": [ "import os\n", "import scanpy as sc\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import scipy as sp\n", "import scipy.sparse as sp_sparse\n", "\n", "# ------------------------------------------------------\n", "# Hotfix for SOMDE: SOMDE uses `import scipy as sp` and then calls numpy-like APIs\n", "# Newer SciPy removed these aliases (sp.arange, sp.array, sp.argsort, ...)\n", "# We patch them back from NumPy for runtime compatibility.\n", "# ------------------------------------------------------\n", "_np_aliases = [\n", " \"array\", \"arange\", \"argsort\", \"asarray\",\n", " \"zeros\", \"ones\", \"empty\",\n", " \"zeros_like\", \"ones_like\", \"full\", \"full_like\",\n", " \"log\", \"exp\", \"sqrt\",\n", " \"sum\", \"mean\", \"std\", \"var\",\n", " \"where\", \"isfinite\", \"isnan\",\n", " \"maximum\", \"minimum\", \"clip\",\n", " \"cumsum\", \"cumprod\",\n", " \"floor\", \"ceil\", \"abs\",\n", "]\n", "for _name in _np_aliases:\n", " if not hasattr(sp, _name):\n", " setattr(sp, _name, getattr(np, _name))\n", "\n", "from somde import SomNode\n", "\n", "\n", "DATA_DIR = \"./data\"\n", "OUTPUT_DIR = \"./results/somde\"\n", "SOM_K = 5\n", "os.makedirs(OUTPUT_DIR, exist_ok=True)\n", "\n", "# ------------------------------------------------------\n", "# Scanpy gene filtering thresholds\n", "# ------------------------------------------------------\n", "MIN_CELLS = 50\n", "MIN_COUNTS = 50\n", "\n", "\n", "def load_DLPFC(root_dir=DATA_DIR, section_id=\"151507\"):\n", " ad = sc.read_visium(\n", " path=os.path.join(root_dir, section_id),\n", " count_file=section_id + \"_filtered_feature_bc_matrix.h5\"\n", " )\n", " ad.var_names_make_unique()\n", "\n", " gt_dir = os.path.join(root_dir, section_id, \"gt\")\n", " gt_df = pd.read_csv(\n", " os.path.join(gt_dir, \"tissue_positions_list_GTs.txt\"),\n", " sep=\",\", header=None, index_col=0\n", " )\n", " ad.obs[\"original_clusters\"] = gt_df.loc[:, 6]\n", " keep_bcs = ad.obs.dropna().index\n", " ad = ad[keep_bcs].copy()\n", " ad.obs[\"original_clusters\"] = ad.obs[\"original_clusters\"].astype(int).astype(str)\n", "\n", " print(f\"{section_id} loading done\")\n", " return ad\n", "\n", "\n", "def compute_and_save_somde_csv(\n", " adata: sc.AnnData,\n", " section_id: str,\n", " output_dir: str = OUTPUT_DIR,\n", " som_k: int = SOM_K,\n", " min_cells: int = MIN_CELLS,\n", " min_counts: int = MIN_COUNTS,\n", "):\n", " \"\"\"\n", " Compute SOMDE spatially variable genes for one slice after filtering genes\n", " with Scanpy:\n", " 1) keep genes expressed in at least `min_cells` spots\n", " 2) keep genes with total counts at least `min_counts`\n", "\n", " Saves:\n", " somde_{section_id}.csv\n", " somde_{section_id}_filter_stats.csv\n", " somde_{section_id}_filtered_out_genes.csv\n", " \"\"\"\n", "\n", " print(f\"[SOMDE] Processing slice {section_id}\")\n", "\n", " # Keep a copy of original adata for diagnostics\n", " adata_raw = adata.copy()\n", "\n", " # ------------------------------------------------------\n", " # Build pre-filter gene stats from raw counts\n", " # ------------------------------------------------------\n", " X_raw = adata_raw.X\n", " gene_names_raw = np.array(adata_raw.var_names).astype(str)\n", "\n", " if sp_sparse.issparse(X_raw):\n", " total_count_raw = np.asarray(X_raw.sum(axis=0)).ravel()\n", " nonzero_spots_raw = np.asarray((X_raw > 0).sum(axis=0)).ravel()\n", " else:\n", " X_tmp = np.asarray(X_raw)\n", " total_count_raw = X_tmp.sum(axis=0)\n", " nonzero_spots_raw = (X_tmp > 0).sum(axis=0)\n", "\n", " # ------------------------------------------------------\n", " # Filter genes using Scanpy\n", " # ------------------------------------------------------\n", " adata_filt = adata.copy()\n", " n_before = adata_filt.n_vars\n", "\n", " sc.pp.filter_genes(adata_filt, min_cells=min_cells)\n", " n_after_min_cells = adata_filt.n_vars\n", "\n", " sc.pp.filter_genes(adata_filt, min_counts=min_counts)\n", " n_after_min_counts = adata_filt.n_vars\n", "\n", " kept_genes = set(adata_filt.var_names.astype(str))\n", " keep_mask = np.array([g in kept_genes for g in gene_names_raw])\n", "\n", " n_keep = int(keep_mask.sum())\n", " n_drop = int((~keep_mask).sum())\n", "\n", " print(f\"[SOMDE] {section_id}: total genes before filtering = {n_before}\")\n", " print(f\"[SOMDE] {section_id}: after min_cells>={min_cells} = {n_after_min_cells}\")\n", " print(f\"[SOMDE] {section_id}: after min_counts>={min_counts} = {n_after_min_counts}\")\n", " print(f\"[SOMDE] {section_id}: keeping {n_keep} genes, dropping {n_drop} genes\")\n", "\n", " # ------------------------------------------------------\n", " # Save filtering diagnostics\n", " # ------------------------------------------------------\n", " filter_df = pd.DataFrame({\n", " \"g\": gene_names_raw,\n", " \"total_count\": total_count_raw,\n", " \"nonzero_spots\": nonzero_spots_raw,\n", " \"keep_for_somde\": keep_mask,\n", " })\n", " filter_df[f\"nonzero_spots_lt_{min_cells}\"] = filter_df[\"nonzero_spots\"] < min_cells\n", " filter_df[f\"total_count_lt_{min_counts}\"] = filter_df[\"total_count\"] < min_counts\n", "\n", " filter_stats_path = os.path.join(output_dir, f\"somde_{section_id}_filter_stats.csv\")\n", " filter_df.to_csv(filter_stats_path, index=False)\n", "\n", " filtered_out_path = os.path.join(output_dir, f\"somde_{section_id}_filtered_out_genes.csv\")\n", " filter_df.loc[~filter_df[\"keep_for_somde\"]].to_csv(filtered_out_path, index=False)\n", "\n", " # ------------------------------------------------------\n", " # Run SOMDE on filtered genes only\n", " # ------------------------------------------------------\n", " pts = adata_filt.obsm[\"spatial\"].astype(np.float32)\n", "\n", " if sp_sparse.issparse(adata_filt.X):\n", " X_filt = adata_filt.X.toarray()\n", " else:\n", " X_filt = np.asarray(adata_filt.X)\n", "\n", " df_expr = pd.DataFrame(X_filt, columns=adata_filt.var_names.astype(str))\n", "\n", " som = SomNode(pts, som_k)\n", " som.mtx(df_expr.T)\n", " som.norm()\n", " result, _ = som.run()\n", "\n", " output_path = os.path.join(output_dir, f\"somde_{section_id}.csv\")\n", " result.to_csv(output_path, index=False)\n", "\n", " print(f\"[SOMDE] Saved result to {output_path}\")\n", " print(f\"[SOMDE] Saved filter stats to {filter_stats_path}\")\n", " print(f\"[SOMDE] Saved filtered-out genes to {filtered_out_path}\")\n", "\n", "\n", "section_ids = [\n", " \"151508\",\n", " \"151509\"\n", "]\n", "\n", "for sid in section_ids:\n", " print(f\"\\n=== SOMDE for {sid} ===\")\n", " try:\n", " adata = load_DLPFC(section_id=sid)\n", " compute_and_save_somde_csv(\n", " adata=adata,\n", " section_id=sid,\n", " min_cells=MIN_CELLS,\n", " min_counts=MIN_COUNTS,\n", " )\n", " print(f\"{sid} somde done\")\n", " except Exception as e:\n", " print(f\"[ERROR] {sid} failed: {e}\")\n", "\n", "print(\"\\nAll SOMDE computations finished.\")" ] } ], "metadata": { "kernelspec": { "display_name": "raftup", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.19" } }, "nbformat": 4, "nbformat_minor": 2 }