{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview",
   "metadata": {},
   "source": [
    "# scPerturb 数据处理\n",
    "\n",
    "这个 notebook 先完成两件事：\n",
    "\n",
    "1. 统一展示当前 `.h5ad` 文件的结构。\n",
    "2. 根据这些结构，自动提取一个后续建模可直接使用的标准化 metadata 表。"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "workflow",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "- 对比不同数据集的 `obs` 字段，确认 perturbation 标签列名\n",
    "- 找到表达矩阵、原始矩阵和其他矩阵存储位置\n",
    "- 统一提取每个细胞的 perturbation 信息\n",
    "- 标记哪些细胞更接近单 perturbation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "setup",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Kernel python: /opt/miniconda3/bin/python\n",
      "Project root: /Users/yurujiang/Desktop/final_thesis\n",
      "Search root: /Users/yurujiang/Desktop/final_thesis/data/scPerturb\n",
      "Found 4 h5ad file(s).\n"
     ]
    }
   ],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "from pathlib import Path\n",
    "import re\n",
    "import sys\n",
    "\n",
    "import pandas as pd\n",
    "import anndata as ad\n",
    "import h5py\n",
    "\n",
    "\n",
    "def find_project_root(start: Path) -> Path:\n",
    "    for candidate in [start, *start.parents]:\n",
    "        if (candidate / \"src\").exists() and (candidate / \"data\").exists():\n",
    "            return candidate\n",
    "    return start\n",
    "\n",
    "\n",
    "PROJECT_ROOT = find_project_root(Path.cwd().resolve())\n",
    "H5AD_ROOT = PROJECT_ROOT / \"data\" / \"scPerturb\"\n",
    "H5AD_FILES = sorted(H5AD_ROOT.rglob(\"*.h5ad\"))\n",
    "\n",
    "print(f\"Kernel python: {sys.executable}\")\n",
    "print(f\"Project root: {PROJECT_ROOT}\")\n",
    "print(f\"Search root: {H5AD_ROOT}\")\n",
    "print(f\"Found {len(H5AD_FILES)} h5ad file(s).\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "list-files",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Available .h5ad files:\n",
      "- data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406675_10X001.h5ad (32.96 MB)\n",
      "- data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406677_10X005.h5ad (132.62 MB)\n",
      "- data/scPerturb/rna_protein/AdamsonWeissman2016_GSM2406681_10X010.h5ad (449.45 MB)\n",
      "- data/scPerturb/rna_protein/ReplogleWeissman2022_K562_essential.h5ad (1475.08 MB)\n"
     ]
    }
   ],
   "source": [
    "if not H5AD_FILES:\n",
    "    print(\"No .h5ad files found under data/scPerturb/. Please run download.ipynb first.\")\n",
    "else:\n",
    "    print(\"Available .h5ad files:\")\n",
    "    for path in H5AD_FILES:\n",
    "        rel_path = path.relative_to(PROJECT_ROOT)\n",
    "        size_mb = path.stat().st_size / 1024 / 1024\n",
    "        print(f\"- {rel_path} ({size_mb:.2f} MB)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "hdf5-tree-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _format_hdf5_node(node) -> str:\n",
    "    shape = getattr(node, \"shape\", None)\n",
    "    dtype = getattr(node, \"dtype\", None)\n",
    "    if shape is None:\n",
    "        return \"group\"\n",
    "    return f\"dataset shape={shape}, dtype={dtype}\"\n",
    "\n",
    "\n",
    "def print_hdf5_tree(path: Path, max_depth: int = 2, max_children: int = 12) -> None:\n",
    "    print(f\"\\n=== HDF5 tree: {path.name} ===\")\n",
    "    with h5py.File(path, \"r\") as handle:\n",
    "        def walk(group, depth: int = 0, prefix: str = \"\"):\n",
    "            if depth > max_depth:\n",
    "                return\n",
    "\n",
    "            items = list(group.items())\n",
    "            visible_items = items[:max_children]\n",
    "            for index, (name, node) in enumerate(visible_items):\n",
    "                connector = \"└── \" if index == len(visible_items) - 1 else \"├── \"\n",
    "                print(f\"{prefix}{connector}{name} [{_format_hdf5_node(node)}]\")\n",
    "                if hasattr(node, \"items\") and depth < max_depth:\n",
    "                    extension = \"    \" if index == len(visible_items) - 1 else \"│   \"\n",
    "                    walk(node, depth + 1, prefix + extension)\n",
    "\n",
    "            if len(items) > max_children:\n",
    "                print(f\"{prefix}... ({len(items) - max_children} more item(s) omitted)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "show-hdf5-tree",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== HDF5 tree: AdamsonWeissman2016_GSM2406675_10X001.h5ad ===\n",
      "\n",
      "=== HDF5 tree: AdamsonWeissman2016_GSM2406677_10X005.h5ad ===\n",
      "\n",
      "=== HDF5 tree: AdamsonWeissman2016_GSM2406681_10X010.h5ad ===\n",
      "\n",
      "=== HDF5 tree: ReplogleWeissman2022_K562_essential.h5ad ===\n"
     ]
    }
   ],
   "source": [
    "# 如需查看 HDF5 层级，可取消注释运行\n",
    "for path in H5AD_FILES:\n",
    "    print_hdf5_tree(path, max_depth=2, max_children=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "anndata-summary-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "def summarize_anndata(path: Path) -> None:\n",
    "    print(f\"\\n=== AnnData summary: {path.name} ===\")\n",
    "    adata = ad.read_h5ad(path, backed=\"r\")\n",
    "    try:\n",
    "        print(f\"shape: {adata.shape}\")\n",
    "        print(f\"obs columns ({len(adata.obs.columns)}): {list(adata.obs.columns)}\")\n",
    "        print(f\"var columns ({len(adata.var.columns)}): {list(adata.var.columns)}\")\n",
    "        print(f\"layers: {list(adata.layers.keys())}\")\n",
    "        print(f\"obsm: {list(adata.obsm.keys())}\")\n",
    "        print(f\"varm: {list(adata.varm.keys())}\")\n",
    "        print(f\"uns: {list(adata.uns.keys())}\")\n",
    "\n",
    "        if len(adata.obs.columns) > 0:\n",
    "            print(\"obs head:\")\n",
    "            print(adata.obs.head(3))\n",
    "\n",
    "        if len(adata.var.columns) > 0:\n",
    "            print(\"var head:\")\n",
    "            print(adata.var.head(3))\n",
    "    finally:\n",
    "        if getattr(adata, \"file\", None) is not None:\n",
    "            adata.file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "show-anndata-summary",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=== AnnData summary: AdamsonWeissman2016_GSM2406675_10X001.h5ad ===\n",
      "shape: (5768, 35635)\n",
      "obs columns (15): ['perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts']\n",
      "var columns (3): ['ensembl_id', 'ncounts', 'ncells']\n",
      "layers: []\n",
      "obsm: []\n",
      "varm: []\n",
      "uns: []\n",
      "obs head:\n",
      "                  perturbation  read count  UMI count tissue_type cell_line  \\\n",
      "cell_barcode                                                                  \n",
      "AAACATACACCGAT    CREB1_pDS269      1286.0       98.0   cell_line      K562   \n",
      "AAACATACAGAGAT    SNAI1_pDS266       296.0       19.0   cell_line      K562   \n",
      "AAACATACCAGAAA  62(mod)_pBA581      1829.0      162.0   cell_line      K562   \n",
      "\n",
      "                cancer                       disease perturbation_type  \\\n",
      "cell_barcode                                                             \n",
      "AAACATACACCGAT    True  chronic myelogenous leukemia            CRISPR   \n",
      "AAACATACAGAGAT    True  chronic myelogenous leukemia            CRISPR   \n",
      "AAACATACCAGAAA    True  chronic myelogenous leukemia            CRISPR   \n",
      "\n",
      "                    celltype organism  ncounts  ngenes  percent_mito  \\\n",
      "cell_barcode                                                           \n",
      "AAACATACACCGAT  lymphoblasts    human   8138.0    2412           0.0   \n",
      "AAACATACAGAGAT  lymphoblasts    human   8980.0    2386           0.0   \n",
      "AAACATACCAGAAA  lymphoblasts    human  28610.0    4404           0.0   \n",
      "\n",
      "                percent_ribo  nperts  \n",
      "cell_barcode                          \n",
      "AAACATACACCGAT     34.037846       2  \n",
      "AAACATACAGAGAT     40.011135       2  \n",
      "AAACATACCAGAAA     40.003494       2  \n",
      "var head:\n",
      "                  ensembl_id  ncounts  ncells\n",
      "gene_symbol                                  \n",
      "MIR1302-10   ENSG00000243485      0.0       0\n",
      "FAM138A      ENSG00000237613      0.0       0\n",
      "OR4F5        ENSG00000186092      0.0       0\n",
      "\n",
      "=== AnnData summary: AdamsonWeissman2016_GSM2406677_10X005.h5ad ===\n",
      "shape: (15006, 32738)\n",
      "obs columns (15): ['perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts']\n",
      "var columns (3): ['ensembl_id', 'ncounts', 'ncells']\n",
      "layers: []\n",
      "obsm: []\n",
      "varm: []\n",
      "uns: []\n",
      "obs head:\n",
      "                        perturbation  read count  UMI count tissue_type  \\\n",
      "cell_barcode                                                              \n",
      "AAACATACACTCAG  3x_neg_ctrl_pMJ144-1       261.0       59.0   cell_line   \n",
      "AAACATACTCCTAT  3x_neg_ctrl_pMJ144-2       132.0       37.0   cell_line   \n",
      "AAACATTGCAGAGG  3x_neg_ctrl_pMJ144-2       560.0      117.0   cell_line   \n",
      "\n",
      "               cell_line  cancer                       disease  \\\n",
      "cell_barcode                                                     \n",
      "AAACATACACTCAG      K562    True  chronic myelogenous leukemia   \n",
      "AAACATACTCCTAT      K562    True  chronic myelogenous leukemia   \n",
      "AAACATTGCAGAGG      K562    True  chronic myelogenous leukemia   \n",
      "\n",
      "               perturbation_type      celltype organism  ncounts  ngenes  \\\n",
      "cell_barcode                                                               \n",
      "AAACATACACTCAG            CRISPR  lymphoblasts    human  24343.0    4164   \n",
      "AAACATACTCCTAT            CRISPR  lymphoblasts    human  27678.0    4428   \n",
      "AAACATTGCAGAGG            CRISPR  lymphoblasts    human  24745.0    4371   \n",
      "\n",
      "                percent_mito  percent_ribo  nperts  \n",
      "cell_barcode                                        \n",
      "AAACATACACTCAG      4.563941     32.629505       4  \n",
      "AAACATACTCCTAT      4.508996     28.658140       4  \n",
      "AAACATTGCAGAGG      3.200647     31.117397       4  \n",
      "var head:\n",
      "                  ensembl_id  ncounts  ncells\n",
      "gene_symbol                                  \n",
      "MIR1302-10   ENSG00000243485      2.0       2\n",
      "FAM138A      ENSG00000237613      0.0       0\n",
      "OR4F5        ENSG00000186092      0.0       0\n",
      "\n",
      "=== AnnData summary: AdamsonWeissman2016_GSM2406681_10X010.h5ad ===\n",
      "shape: (65337, 32738)\n",
      "obs columns (15): ['perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts']\n",
      "var columns (3): ['ensembl_id', 'ncounts', 'ncells']\n",
      "layers: []\n",
      "obsm: []\n",
      "varm: []\n",
      "uns: []\n",
      "obs head:\n",
      "                  perturbation  read count  UMI count tissue_type cell_line  \\\n",
      "cell_barcode                                                                  \n",
      "AAACATACAAGATG  63(mod)_pBA580       282.0        8.0   cell_line      K562   \n",
      "AAACATACACCTAG     OST4_pDS353       331.0        7.0   cell_line      K562   \n",
      "AAACATACTTCCCG  SEC61A1_pDS031       285.0       10.0   cell_line      K562   \n",
      "\n",
      "                cancer                       disease perturbation_type  \\\n",
      "cell_barcode                                                             \n",
      "AAACATACAAGATG    True  chronic myelogenous leukemia            CRISPR   \n",
      "AAACATACACCTAG    True  chronic myelogenous leukemia            CRISPR   \n",
      "AAACATACTTCCCG    True  chronic myelogenous leukemia            CRISPR   \n",
      "\n",
      "                    celltype organism  ncounts  ngenes  percent_mito  \\\n",
      "cell_barcode                                                           \n",
      "AAACATACAAGATG  lymphoblasts    human   8866.0    2914      4.917663   \n",
      "AAACATACACCTAG  lymphoblasts    human  13785.0    3818      4.468626   \n",
      "AAACATACTTCCCG  lymphoblasts    human   7569.0    2616      5.060113   \n",
      "\n",
      "                percent_ribo  nperts  \n",
      "cell_barcode                          \n",
      "AAACATACAAGATG     21.306112       2  \n",
      "AAACATACACCTAG     19.492201       2  \n",
      "AAACATACTTCCCG     23.199894       2  \n",
      "var head:\n",
      "                  ensembl_id  ncounts  ncells\n",
      "gene_symbol                                  \n",
      "MIR1302-10   ENSG00000243485     11.0      11\n",
      "FAM138A      ENSG00000237613      0.0       0\n",
      "OR4F5        ENSG00000186092      0.0       0\n",
      "\n",
      "=== AnnData summary: ReplogleWeissman2022_K562_essential.h5ad ===\n",
      "shape: (310385, 8563)\n",
      "obs columns (24): ['batch', 'gene', 'gene_id', 'transcript', 'gene_transcript', 'guide_id', 'percent_mito', 'UMI_count', 'z_gemgroup_UMI', 'core_scale_factor', 'core_adjusted_UMI_count', 'disease', 'cancer', 'cell_line', 'sex', 'age', 'perturbation', 'organism', 'perturbation_type', 'tissue_type', 'ncounts', 'ngenes', 'nperts', 'percent_ribo']\n",
      "var columns (14): ['chr', 'start', 'end', 'class', 'strand', 'length', 'in_matrix', 'mean', 'std', 'cv', 'fano', 'ensembl_id', 'ncounts', 'ncells']\n",
      "layers: []\n",
      "obsm: []\n",
      "varm: []\n",
      "uns: []\n",
      "obs head:\n",
      "                     batch  gene          gene_id transcript  \\\n",
      "cell_barcode                                                   \n",
      "AAACCCAAGAAATCCA-27     27  NAF1  ENSG00000145414       P1P2   \n",
      "AAACCCAAGAACTTCC-31     31  BUB1  ENSG00000169679       P1P2   \n",
      "AAACCCAAGAAGCCAC-34     34  UBL5  ENSG00000198258       P1P2   \n",
      "\n",
      "                                    gene_transcript  \\\n",
      "cell_barcode                                          \n",
      "AAACCCAAGAAATCCA-27  5449_NAF1_P1P2_ENSG00000145414   \n",
      "AAACCCAAGAACTTCC-31   935_BUB1_P1P2_ENSG00000169679   \n",
      "AAACCCAAGAAGCCAC-34  9534_UBL5_P1P2_ENSG00000198258   \n",
      "\n",
      "                                                              guide_id  \\\n",
      "cell_barcode                                                             \n",
      "AAACCCAAGAAATCCA-27  NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2   \n",
      "AAACCCAAGAACTTCC-31  BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2   \n",
      "AAACCCAAGAAGCCAC-34      UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2   \n",
      "\n",
      "                     percent_mito  UMI_count  z_gemgroup_UMI  \\\n",
      "cell_barcode                                                   \n",
      "AAACCCAAGAAATCCA-27      0.112083    11438.0        0.013047   \n",
      "AAACCCAAGAACTTCC-31      0.179895     5342.0       -1.522247   \n",
      "AAACCCAAGAAGCCAC-34      0.105287    17305.0        0.384157   \n",
      "\n",
      "                     core_scale_factor  ...     sex age  perturbation  \\\n",
      "cell_barcode                            ...                             \n",
      "AAACCCAAGAAATCCA-27           0.813253  ...  Female  53          NAF1   \n",
      "AAACCCAAGAACTTCC-31           0.844107  ...  Female  53          BUB1   \n",
      "AAACCCAAGAAGCCAC-34           1.091537  ...  Female  53          UBL5   \n",
      "\n",
      "                    organism perturbation_type  tissue_type  ncounts ngenes  \\\n",
      "cell_barcode                                                                  \n",
      "AAACCCAAGAAATCCA-27    human            CRISPR    cell_line  11324.0   3332   \n",
      "AAACCCAAGAACTTCC-31    human            CRISPR    cell_line   5257.0   2192   \n",
      "AAACCCAAGAAGCCAC-34    human            CRISPR    cell_line  17135.0   4002   \n",
      "\n",
      "                    nperts percent_ribo  \n",
      "cell_barcode                             \n",
      "AAACCCAAGAAATCCA-27      1     0.225362  \n",
      "AAACCCAAGAACTTCC-31      1     0.129732  \n",
      "AAACCCAAGAAGCCAC-34      1     0.236825  \n",
      "\n",
      "[3 rows x 24 columns]\n",
      "var head:\n",
      "            chr   start     end           class strand  length  in_matrix  \\\n",
      "gene_name                                                                   \n",
      "LINC01409  chr1  778747  810065  gene_version10      +   31318       True   \n",
      "LINC01128  chr1  825138  868202   gene_version9      +   43064       True   \n",
      "NOC2L      chr1  944203  959309  gene_version11      -   15106       True   \n",
      "\n",
      "               mean       std        cv      fano       ensembl_id   ncounts  \\\n",
      "gene_name                                                                      \n",
      "LINC01409  0.137594  0.380048  2.762105  1.049733  ENSG00000237491   42707.0   \n",
      "LINC01128  0.256720  0.520162  2.026184  1.053944  ENSG00000228794   79682.0   \n",
      "NOC2L      1.975144  1.707837  0.864665  1.476706  ENSG00000188976  613055.0   \n",
      "\n",
      "           ncells  \n",
      "gene_name          \n",
      "LINC01409   39082  \n",
      "LINC01128   68732  \n",
      "NOC2L      248759  \n"
     ]
    }
   ],
   "source": [
    "for path in H5AD_FILES:\n",
    "    summarize_anndata(path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "step1",
   "metadata": {},
   "source": [
    "## 2. 对比不同数据集的 `obs` 字段\n",
    "\n",
    "这一部分会把每个文件里的观测列、候选 perturbation 列和基础统计列统一汇总出来。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "schema-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "PERTURBATION_COLUMN_CANDIDATES = [\n",
    "    \"perturbation\",\n",
    "    \"gene\",\n",
    "    \"gene_id\",\n",
    "    \"gene_transcript\",\n",
    "    \"guide_id\",\n",
    "    \"target_gene\",\n",
    "    \"condition\",\n",
    "]\n",
    "\n",
    "COUNT_COLUMN_CANDIDATES = [\"ncounts\", \"UMI_count\", \"UMI count\", \"read count\"]\n",
    "GENE_COUNT_COLUMN_CANDIDATES = [\"ngenes\"]\n",
    "\n",
    "\n",
    "def available_columns(obs_columns: list[str], candidates: list[str]) -> list[str]:\n",
    "    return [column for column in candidates if column in obs_columns]\n",
    "\n",
    "\n",
    "def matrix_storage_summary(adata) -> dict:\n",
    "    return {\n",
    "        \"x_storage\": type(adata.X).__name__,\n",
    "        \"has_raw\": adata.raw is not None,\n",
    "        \"raw_shape\": None if adata.raw is None else adata.raw.shape,\n",
    "        \"layers\": list(adata.layers.keys()),\n",
    "        \"obsm\": list(adata.obsm.keys()),\n",
    "        \"varm\": list(adata.varm.keys()),\n",
    "        \"uns\": list(adata.uns.keys()),\n",
    "    }\n",
    "\n",
    "\n",
    "def summarize_schema(path: Path) -> dict:\n",
    "    adata = ad.read_h5ad(path, backed=\"r\")\n",
    "    try:\n",
    "        obs_columns = list(adata.obs.columns)\n",
    "        var_columns = list(adata.var.columns)\n",
    "        matrix_info = matrix_storage_summary(adata)\n",
    "        return {\n",
    "            \"file_name\": path.name,\n",
    "            \"dataset_name\": path.stem,\n",
    "            \"n_cells\": adata.n_obs,\n",
    "            \"n_genes\": adata.n_vars,\n",
    "            \"obs_columns\": obs_columns,\n",
    "            \"var_columns\": var_columns,\n",
    "            \"candidate_perturbation_columns\": available_columns(obs_columns, PERTURBATION_COLUMN_CANDIDATES),\n",
    "            \"candidate_count_columns\": available_columns(obs_columns, COUNT_COLUMN_CANDIDATES),\n",
    "            \"candidate_gene_count_columns\": available_columns(obs_columns, GENE_COUNT_COLUMN_CANDIDATES),\n",
    "            **matrix_info,\n",
    "        }\n",
    "    finally:\n",
    "        if getattr(adata, \"file\", None) is not None:\n",
    "            adata.file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "schema-summary",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_name</th>\n",
       "      <th>n_cells</th>\n",
       "      <th>n_genes</th>\n",
       "      <th>candidate_perturbation_columns</th>\n",
       "      <th>candidate_count_columns</th>\n",
       "      <th>candidate_gene_count_columns</th>\n",
       "      <th>x_storage</th>\n",
       "      <th>has_raw</th>\n",
       "      <th>layers</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>5768</td>\n",
       "      <td>35635</td>\n",
       "      <td>[perturbation]</td>\n",
       "      <td>[ncounts, UMI count, read count]</td>\n",
       "      <td>[ngenes]</td>\n",
       "      <td>_CSCDataset</td>\n",
       "      <td>False</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406677_10X005.h5ad</td>\n",
       "      <td>15006</td>\n",
       "      <td>32738</td>\n",
       "      <td>[perturbation]</td>\n",
       "      <td>[ncounts, UMI count, read count]</td>\n",
       "      <td>[ngenes]</td>\n",
       "      <td>_CSCDataset</td>\n",
       "      <td>False</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406681_10X010.h5ad</td>\n",
       "      <td>65337</td>\n",
       "      <td>32738</td>\n",
       "      <td>[perturbation]</td>\n",
       "      <td>[ncounts, UMI count, read count]</td>\n",
       "      <td>[ngenes]</td>\n",
       "      <td>_CSCDataset</td>\n",
       "      <td>False</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ReplogleWeissman2022_K562_essential.h5ad</td>\n",
       "      <td>310385</td>\n",
       "      <td>8563</td>\n",
       "      <td>[perturbation, gene, gene_id, gene_transcript,...</td>\n",
       "      <td>[ncounts, UMI_count]</td>\n",
       "      <td>[ngenes]</td>\n",
       "      <td>Dataset</td>\n",
       "      <td>False</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    file_name  n_cells  n_genes  \\\n",
       "0  AdamsonWeissman2016_GSM2406675_10X001.h5ad     5768    35635   \n",
       "1  AdamsonWeissman2016_GSM2406677_10X005.h5ad    15006    32738   \n",
       "2  AdamsonWeissman2016_GSM2406681_10X010.h5ad    65337    32738   \n",
       "3    ReplogleWeissman2022_K562_essential.h5ad   310385     8563   \n",
       "\n",
       "                      candidate_perturbation_columns  \\\n",
       "0                                     [perturbation]   \n",
       "1                                     [perturbation]   \n",
       "2                                     [perturbation]   \n",
       "3  [perturbation, gene, gene_id, gene_transcript,...   \n",
       "\n",
       "            candidate_count_columns candidate_gene_count_columns    x_storage  \\\n",
       "0  [ncounts, UMI count, read count]                     [ngenes]  _CSCDataset   \n",
       "1  [ncounts, UMI count, read count]                     [ngenes]  _CSCDataset   \n",
       "2  [ncounts, UMI count, read count]                     [ngenes]  _CSCDataset   \n",
       "3              [ncounts, UMI_count]                     [ngenes]      Dataset   \n",
       "\n",
       "   has_raw layers  \n",
       "0    False     []  \n",
       "1    False     []  \n",
       "2    False     []  \n",
       "3    False     []  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "schema_summary = pd.DataFrame([summarize_schema(path) for path in H5AD_FILES])\n",
    "schema_summary[[\n",
    "    \"file_name\",\n",
    "    \"n_cells\",\n",
    "    \"n_genes\",\n",
    "    \"candidate_perturbation_columns\",\n",
    "    \"candidate_count_columns\",\n",
    "    \"candidate_gene_count_columns\",\n",
    "    \"x_storage\",\n",
    "    \"has_raw\",\n",
    "    \"layers\",\n",
    "]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "step2",
   "metadata": {},
   "source": [
    "## 3. 统一定位 perturbation 信息与矩阵位置\n",
    "\n",
    "这里做两个判断：\n",
    "\n",
    "- 这个数据集主要用哪一列表示 perturbation\n",
    "- 表达矩阵主要在 `X`、`raw` 还是 `layers`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "standardization-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "CONTROL_PATTERN = re.compile(r\"ctrl|control|neg|non-target|nontarget|ntc|\\*|nan\", re.IGNORECASE)\n",
    "\n",
    "\n",
    "def choose_first_existing(obs_columns: list[str], candidates: list[str]) -> str | None:\n",
    "    for column in candidates:\n",
    "        if column in obs_columns:\n",
    "            return column\n",
    "    return None\n",
    "\n",
    "\n",
    "def infer_expression_location(adata) -> str:\n",
    "    if len(adata.layers.keys()) > 0:\n",
    "        return f\"X + layers={list(adata.layers.keys())}\"\n",
    "    if adata.raw is not None:\n",
    "        return \"X + raw\"\n",
    "    return \"X only\"\n",
    "\n",
    "\n",
    "def infer_single_perturbation(obs_df: pd.DataFrame, perturbation_col: str | None) -> pd.Series:\n",
    "    if \"nperts\" in obs_df.columns:\n",
    "        return pd.to_numeric(obs_df[\"nperts\"], errors=\"coerce\").fillna(-1).eq(1)\n",
    "\n",
    "    if perturbation_col is None:\n",
    "        return pd.Series(False, index=obs_df.index)\n",
    "\n",
    "    text = obs_df[perturbation_col].astype(str)\n",
    "    delimiters = text.str.contains(r\"[|,+;]\", regex=True)\n",
    "    looks_control = text.str.contains(CONTROL_PATTERN)\n",
    "    return (~delimiters) & (~looks_control)\n",
    "\n",
    "\n",
    "def infer_control(obs_df: pd.DataFrame, perturbation_col: str | None) -> pd.Series:\n",
    "    if perturbation_col is None:\n",
    "        return pd.Series(False, index=obs_df.index)\n",
    "    return obs_df[perturbation_col].astype(str).str.contains(CONTROL_PATTERN)\n",
    "\n",
    "\n",
    "def standardize_obs(path: Path) -> pd.DataFrame:\n",
    "    adata = ad.read_h5ad(path, backed=\"r\")\n",
    "    try:\n",
    "        obs = adata.obs.copy()\n",
    "        obs_columns = list(obs.columns)\n",
    "\n",
    "        perturbation_col = choose_first_existing(obs_columns, [\"perturbation\", \"gene\", \"condition\"])\n",
    "        gene_col = choose_first_existing(obs_columns, [\"gene\", \"target_gene\", \"perturbation\"])\n",
    "        gene_id_col = choose_first_existing(obs_columns, [\"gene_id\"])\n",
    "        guide_col = choose_first_existing(obs_columns, [\"guide_id\"])\n",
    "        count_col = choose_first_existing(obs_columns, [\"ncounts\", \"UMI_count\", \"UMI count\", \"read count\"])\n",
    "        ngenes_col = choose_first_existing(obs_columns, [\"ngenes\"])\n",
    "        perturbation_type_col = choose_first_existing(obs_columns, [\"perturbation_type\"])\n",
    "        cell_line_col = choose_first_existing(obs_columns, [\"cell_line\"])\n",
    "        celltype_col = choose_first_existing(obs_columns, [\"celltype\", \"cell_type\"])\n",
    "        organism_col = choose_first_existing(obs_columns, [\"organism\"])\n",
    "        disease_col = choose_first_existing(obs_columns, [\"disease\"])\n",
    "\n",
    "        standardized = pd.DataFrame(index=obs.index)\n",
    "        standardized[\"dataset_name\"] = path.stem\n",
    "        standardized[\"file_name\"] = path.name\n",
    "        standardized[\"cell_barcode\"] = obs.index.astype(str)\n",
    "        standardized[\"perturbation_label\"] = None if perturbation_col is None else obs[perturbation_col].astype(str)\n",
    "        standardized[\"perturbation_gene\"] = None if gene_col is None else obs[gene_col].astype(str)\n",
    "        standardized[\"perturbation_gene_id\"] = None if gene_id_col is None else obs[gene_id_col].astype(str)\n",
    "        standardized[\"guide_id\"] = None if guide_col is None else obs[guide_col].astype(str)\n",
    "        standardized[\"perturbation_type\"] = None if perturbation_type_col is None else obs[perturbation_type_col].astype(str)\n",
    "        standardized[\"cell_line\"] = None if cell_line_col is None else obs[cell_line_col].astype(str)\n",
    "        standardized[\"celltype\"] = None if celltype_col is None else obs[celltype_col].astype(str)\n",
    "        standardized[\"organism\"] = None if organism_col is None else obs[organism_col].astype(str)\n",
    "        standardized[\"disease\"] = None if disease_col is None else obs[disease_col].astype(str)\n",
    "        standardized[\"ncounts\"] = None if count_col is None else pd.to_numeric(obs[count_col], errors=\"coerce\")\n",
    "        standardized[\"ngenes\"] = None if ngenes_col is None else pd.to_numeric(obs[ngenes_col], errors=\"coerce\")\n",
    "        standardized[\"nperts\"] = pd.to_numeric(obs[\"nperts\"], errors=\"coerce\") if \"nperts\" in obs.columns else pd.NA\n",
    "        standardized[\"is_control\"] = infer_control(obs, perturbation_col)\n",
    "        standardized[\"is_single_perturbation\"] = infer_single_perturbation(obs, perturbation_col)\n",
    "        standardized[\"expression_location\"] = infer_expression_location(adata)\n",
    "        standardized[\"source_perturbation_col\"] = perturbation_col\n",
    "        standardized[\"source_gene_col\"] = gene_col\n",
    "        standardized[\"source_gene_id_col\"] = gene_id_col\n",
    "        standardized[\"source_guide_col\"] = guide_col\n",
    "\n",
    "        return standardized.reset_index(drop=True)\n",
    "    finally:\n",
    "        if getattr(adata, \"file\", None) is not None:\n",
    "            adata.file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "metadata-standardized",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset_name</th>\n",
       "      <th>file_name</th>\n",
       "      <th>cell_barcode</th>\n",
       "      <th>perturbation_label</th>\n",
       "      <th>perturbation_gene</th>\n",
       "      <th>perturbation_gene_id</th>\n",
       "      <th>guide_id</th>\n",
       "      <th>perturbation_type</th>\n",
       "      <th>cell_line</th>\n",
       "      <th>celltype</th>\n",
       "      <th>...</th>\n",
       "      <th>ncounts</th>\n",
       "      <th>ngenes</th>\n",
       "      <th>nperts</th>\n",
       "      <th>is_control</th>\n",
       "      <th>is_single_perturbation</th>\n",
       "      <th>expression_location</th>\n",
       "      <th>source_perturbation_col</th>\n",
       "      <th>source_gene_col</th>\n",
       "      <th>source_gene_id_col</th>\n",
       "      <th>source_guide_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>AAACATACACCGAT</td>\n",
       "      <td>CREB1_pDS269</td>\n",
       "      <td>CREB1_pDS269</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>CRISPR</td>\n",
       "      <td>K562</td>\n",
       "      <td>lymphoblasts</td>\n",
       "      <td>...</td>\n",
       "      <td>8138.0</td>\n",
       "      <td>2412</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>X only</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>AAACATACAGAGAT</td>\n",
       "      <td>SNAI1_pDS266</td>\n",
       "      <td>SNAI1_pDS266</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>CRISPR</td>\n",
       "      <td>K562</td>\n",
       "      <td>lymphoblasts</td>\n",
       "      <td>...</td>\n",
       "      <td>8980.0</td>\n",
       "      <td>2386</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>X only</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>AAACATACCAGAAA</td>\n",
       "      <td>62(mod)_pBA581</td>\n",
       "      <td>62(mod)_pBA581</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>CRISPR</td>\n",
       "      <td>K562</td>\n",
       "      <td>lymphoblasts</td>\n",
       "      <td>...</td>\n",
       "      <td>28610.0</td>\n",
       "      <td>4404</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>X only</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>AAACATACGTTGAC</td>\n",
       "      <td>EP300_pDS268</td>\n",
       "      <td>EP300_pDS268</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>CRISPR</td>\n",
       "      <td>K562</td>\n",
       "      <td>lymphoblasts</td>\n",
       "      <td>...</td>\n",
       "      <td>11346.0</td>\n",
       "      <td>2815</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>X only</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001.h5ad</td>\n",
       "      <td>AAACATACTGTTCT</td>\n",
       "      <td>62(mod)_pBA581</td>\n",
       "      <td>62(mod)_pBA581</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>CRISPR</td>\n",
       "      <td>K562</td>\n",
       "      <td>lymphoblasts</td>\n",
       "      <td>...</td>\n",
       "      <td>9864.0</td>\n",
       "      <td>2584</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>X only</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            dataset_name  \\\n",
       "0  AdamsonWeissman2016_GSM2406675_10X001   \n",
       "1  AdamsonWeissman2016_GSM2406675_10X001   \n",
       "2  AdamsonWeissman2016_GSM2406675_10X001   \n",
       "3  AdamsonWeissman2016_GSM2406675_10X001   \n",
       "4  AdamsonWeissman2016_GSM2406675_10X001   \n",
       "\n",
       "                                    file_name    cell_barcode  \\\n",
       "0  AdamsonWeissman2016_GSM2406675_10X001.h5ad  AAACATACACCGAT   \n",
       "1  AdamsonWeissman2016_GSM2406675_10X001.h5ad  AAACATACAGAGAT   \n",
       "2  AdamsonWeissman2016_GSM2406675_10X001.h5ad  AAACATACCAGAAA   \n",
       "3  AdamsonWeissman2016_GSM2406675_10X001.h5ad  AAACATACGTTGAC   \n",
       "4  AdamsonWeissman2016_GSM2406675_10X001.h5ad  AAACATACTGTTCT   \n",
       "\n",
       "  perturbation_label perturbation_gene perturbation_gene_id guide_id  \\\n",
       "0       CREB1_pDS269      CREB1_pDS269                 None     None   \n",
       "1       SNAI1_pDS266      SNAI1_pDS266                 None     None   \n",
       "2     62(mod)_pBA581    62(mod)_pBA581                 None     None   \n",
       "3       EP300_pDS268      EP300_pDS268                 None     None   \n",
       "4     62(mod)_pBA581    62(mod)_pBA581                 None     None   \n",
       "\n",
       "  perturbation_type cell_line      celltype  ...  ncounts ngenes  nperts  \\\n",
       "0            CRISPR      K562  lymphoblasts  ...   8138.0   2412       2   \n",
       "1            CRISPR      K562  lymphoblasts  ...   8980.0   2386       2   \n",
       "2            CRISPR      K562  lymphoblasts  ...  28610.0   4404       2   \n",
       "3            CRISPR      K562  lymphoblasts  ...  11346.0   2815       2   \n",
       "4            CRISPR      K562  lymphoblasts  ...   9864.0   2584       2   \n",
       "\n",
       "   is_control  is_single_perturbation  expression_location  \\\n",
       "0       False                   False               X only   \n",
       "1       False                   False               X only   \n",
       "2       False                   False               X only   \n",
       "3       False                   False               X only   \n",
       "4       False                   False               X only   \n",
       "\n",
       "   source_perturbation_col source_gene_col source_gene_id_col source_guide_col  \n",
       "0             perturbation    perturbation               None             None  \n",
       "1             perturbation    perturbation               None             None  \n",
       "2             perturbation    perturbation               None             None  \n",
       "3             perturbation    perturbation               None             None  \n",
       "4             perturbation    perturbation               None             None  \n",
       "\n",
       "[5 rows x 22 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fail early with a pointer to the search root: pd.concat on an empty list\n",
    "# only reports \"No objects to concatenate\", which hides the real cause.\n",
    "if not H5AD_FILES:\n",
    "    raise FileNotFoundError(f\"No .h5ad files found under {H5AD_ROOT}\")\n",
    "\n",
    "# One standardized per-cell frame per file, stacked under a fresh row index.\n",
    "standardized_metadata = pd.concat(\n",
    "    [standardize_obs(path) for path in H5AD_FILES],\n",
    "    ignore_index=True,\n",
    ")\n",
    "standardized_metadata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "dataset-level-summary",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataset_name</th>\n",
       "      <th>n_cells</th>\n",
       "      <th>n_single_perturbation</th>\n",
       "      <th>n_control</th>\n",
       "      <th>perturbation_type</th>\n",
       "      <th>cell_line</th>\n",
       "      <th>source_perturbation_col</th>\n",
       "      <th>source_gene_col</th>\n",
       "      <th>source_guide_col</th>\n",
       "      <th>expression_location</th>\n",
       "      <th>single_perturbation_ratio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406675_10X001</td>\n",
       "      <td>5768</td>\n",
       "      <td>6</td>\n",
       "      <td>16</td>\n",
       "      <td>[CRISPR]</td>\n",
       "      <td>[K562]</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>X only</td>\n",
       "      <td>0.001040</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406677_10X005</td>\n",
       "      <td>15006</td>\n",
       "      <td>13</td>\n",
       "      <td>3674</td>\n",
       "      <td>[CRISPR]</td>\n",
       "      <td>[K562]</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>X only</td>\n",
       "      <td>0.000866</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AdamsonWeissman2016_GSM2406681_10X010</td>\n",
       "      <td>65337</td>\n",
       "      <td>101</td>\n",
       "      <td>2714</td>\n",
       "      <td>[CRISPR]</td>\n",
       "      <td>[K562]</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>None</td>\n",
       "      <td>X only</td>\n",
       "      <td>0.001546</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ReplogleWeissman2022_K562_essential</td>\n",
       "      <td>310385</td>\n",
       "      <td>299694</td>\n",
       "      <td>10816</td>\n",
       "      <td>[CRISPR]</td>\n",
       "      <td>[K562]</td>\n",
       "      <td>perturbation</td>\n",
       "      <td>gene</td>\n",
       "      <td>guide_id</td>\n",
       "      <td>X only</td>\n",
       "      <td>0.965556</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            dataset_name  n_cells  n_single_perturbation  \\\n",
       "0  AdamsonWeissman2016_GSM2406675_10X001     5768                      6   \n",
       "1  AdamsonWeissman2016_GSM2406677_10X005    15006                     13   \n",
       "2  AdamsonWeissman2016_GSM2406681_10X010    65337                    101   \n",
       "3    ReplogleWeissman2022_K562_essential   310385                 299694   \n",
       "\n",
       "   n_control perturbation_type cell_line source_perturbation_col  \\\n",
       "0         16          [CRISPR]    [K562]            perturbation   \n",
       "1       3674          [CRISPR]    [K562]            perturbation   \n",
       "2       2714          [CRISPR]    [K562]            perturbation   \n",
       "3      10816          [CRISPR]    [K562]            perturbation   \n",
       "\n",
       "  source_gene_col source_guide_col expression_location  \\\n",
       "0    perturbation             None              X only   \n",
       "1    perturbation             None              X only   \n",
       "2    perturbation             None              X only   \n",
       "3            gene         guide_id              X only   \n",
       "\n",
       "   single_perturbation_ratio  \n",
       "0                   0.001040  \n",
       "1                   0.000866  \n",
       "2                   0.001546  \n",
       "3                   0.965556  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per dataset: number of cells, totals of the two flag columns, the\n",
    "# sorted distinct perturbation types / cell lines, and the first recorded\n",
    "# value of each provenance column.\n",
    "dataset_level_summary = standardized_metadata.groupby(\"dataset_name\", as_index=False).agg(\n",
    "    n_cells=pd.NamedAgg(column=\"cell_barcode\", aggfunc=\"size\"),\n",
    "    n_single_perturbation=pd.NamedAgg(column=\"is_single_perturbation\", aggfunc=\"sum\"),\n",
    "    n_control=pd.NamedAgg(column=\"is_control\", aggfunc=\"sum\"),\n",
    "    perturbation_type=pd.NamedAgg(\n",
    "        column=\"perturbation_type\",\n",
    "        aggfunc=lambda values: sorted(pd.Series(values).dropna().unique().tolist()),\n",
    "    ),\n",
    "    cell_line=pd.NamedAgg(\n",
    "        column=\"cell_line\",\n",
    "        aggfunc=lambda values: sorted(pd.Series(values).dropna().unique().tolist()),\n",
    "    ),\n",
    "    source_perturbation_col=pd.NamedAgg(column=\"source_perturbation_col\", aggfunc=\"first\"),\n",
    "    source_gene_col=pd.NamedAgg(column=\"source_gene_col\", aggfunc=\"first\"),\n",
    "    source_guide_col=pd.NamedAgg(column=\"source_guide_col\", aggfunc=\"first\"),\n",
    "    expression_location=pd.NamedAgg(column=\"expression_location\", aggfunc=\"first\"),\n",
    ")\n",
    "\n",
    "# Share of each dataset's cells that are flagged as single-perturbation.\n",
    "dataset_level_summary[\"single_perturbation_ratio\"] = (\n",
    "    dataset_level_summary[\"n_single_perturbation\"] / dataset_level_summary[\"n_cells\"]\n",
    ")\n",
    "dataset_level_summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "step4",
   "metadata": {},
   "source": [
    "## 4. 导出标准化 metadata\n",
    "\n",
    "下面会把统一后的细胞级 metadata 和数据集级摘要保存到 `data/processed/scPerturb/`，方便后续模型训练或统计分析直接读取。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "export-metadata",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved cell metadata to: /Users/yurujiang/Desktop/final_thesis/data/processed/scPerturb/cell_metadata_standardized.csv\n",
      "Saved dataset summary to: /Users/yurujiang/Desktop/final_thesis/data/processed/scPerturb/dataset_summary.csv\n"
     ]
    }
   ],
   "source": [
    "# Destination folder for the processed tables; created if it does not exist yet.\n",
    "OUTPUT_DIR = PROJECT_ROOT.joinpath(\"data\", \"processed\", \"scPerturb\")\n",
    "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "cell_metadata_path = OUTPUT_DIR.joinpath(\"cell_metadata_standardized.csv\")\n",
    "dataset_summary_path = OUTPUT_DIR.joinpath(\"dataset_summary.csv\")\n",
    "\n",
    "# Write both tables without the row index, then report where each one was saved.\n",
    "standardized_metadata.to_csv(cell_metadata_path, index=False)\n",
    "dataset_level_summary.to_csv(dataset_summary_path, index=False)\n",
    "\n",
    "print(f\"Saved cell metadata to: {cell_metadata_path}\")\n",
    "print(f\"Saved dataset summary to: {dataset_summary_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}