Spaces:

PortPy-Project
/

portpy_dataset_visualization

Running

File size: 17,746 Bytes

import streamlit as st
import pandas as pd
import json
import os
import posixpath
from huggingface_hub import hf_hub_download
from huggingface_hub import list_repo_files
import io
import zipfile
import shutil
import tempfile, uuid

# Hugging Face dataset repository the app reads from; change this to point at a different repo.
REPO_ID = "PortPy-Project/PortPy_Dataset"

# Access token taken from the HF_TOKEN environment variable (None when unset);
# it is passed to the hub download calls so a private repo can be read.
token = os.getenv("HF_TOKEN")

@st.cache_data
def get_patient_ids():
    """Return all patient IDs in the dataset together with their disease site.

    Downloads ``data_info.jsonl`` from the dataset repo (one JSON object per
    line, each carrying a ``patient_id`` key) and derives the disease site
    from the ID prefix.

    Returns:
        pandas.DataFrame: columns ``patient_id`` and ``disease_site``.
        ``disease_site`` is NaN for IDs that contain no underscore.
    """
    file = hf_hub_download(REPO_ID, repo_type="dataset", filename="data_info.jsonl", token=token)
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) that json.loads would reject.
        data_info = [json.loads(line) for line in f if line.strip()]
    patient_ids = [pat['patient_id'] for pat in data_info]
    df = pd.DataFrame(patient_ids, columns=["patient_id"])
    # Disease site is the ID prefix before the first underscore (e.g. Lung_Patient_1 -> Lung).
    # expand=False yields a Series, which is what a single-column assignment expects.
    df["disease_site"] = df["patient_id"].str.extract(r"^(.*?)_", expand=False)
    return df

@st.cache_data
def _list_all_repo_files():
    """Return the path of every file in the dataset repo (cached by Streamlit).

    The access token is forwarded, as in the download calls, so the listing
    also works when the repo is private.
    """
    return list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=token)

@st.cache_data
def load_all_metadata(disease_site):
    """Collect structure, beam and planner-beam metadata for one disease site.

    Args:
        disease_site: value matched against the ``disease_site`` column of
            the patient table.

    Returns:
        dict mapping each matching patient ID to a dict with the keys
        ``structures``, ``beams`` and ``planner_beam_ids``.
    """
    all_patients = get_patient_ids()
    site_mask = all_patients["disease_site"] == disease_site
    site_patient_ids = all_patients[site_mask]["patient_id"]

    metadata = {}
    for pid in site_patient_ids:
        structure_meta = load_structure_metadata(pid)
        beam_meta = load_beam_metadata(pid)

        # PlannerBeams.json lists the planner's beam IDs under the "IDs" key.
        planner_path = hf_hub_download(
            REPO_ID,
            repo_type="dataset",
            filename=f"data/{pid}/PlannerBeams.json",
            token=token,
        )
        with open(planner_path) as fh:
            planner_ids = json.load(fh).get("IDs", [])

        metadata[pid] = {
            "structures": structure_meta,
            "beams": beam_meta,
            "planner_beam_ids": planner_ids,
        }

    return metadata

@st.cache_data
def load_structure_metadata(patient_id):
    """Download and parse ``StructureSet_MetaData.json`` for one patient.

    Returns the decoded JSON content unchanged.
    """
    remote_name = f"data/{patient_id}/StructureSet_MetaData.json"
    local_path = hf_hub_download(REPO_ID, repo_type="dataset", filename=remote_name, token=token)
    with open(local_path) as fh:
        structures = json.load(fh)
    return structures

@st.cache_data
def load_beam_metadata(patient_id):
    """Download and parse every ``Beam_*_MetaData.json`` file of one patient.

    The repo file listing is scanned for paths under
    ``data/<patient_id>/Beams/``; each match is downloaded and decoded.

    Returns:
        list of decoded JSON objects, in repo-listing order.
    """
    wanted_prefix = f"data/{patient_id}/Beams/Beam_"
    wanted_suffix = "_MetaData.json"

    beam_meta = []
    for repo_path in _list_all_repo_files():
        is_beam_meta = repo_path.startswith(wanted_prefix) and repo_path.endswith(wanted_suffix)
        if not is_beam_meta:
            continue
        # No local_dir: the file stays in the hub cache.
        local_path = hf_hub_download(
            REPO_ID,
            repo_type="dataset",
            filename=repo_path,
            token=token,
        )
        with open(local_path) as fh:
            beam_meta.append(json.load(fh))
    return beam_meta

def get_patient_summary_from_cached_data(patient_id, all_metadata):
    """Summarise one patient from the already-loaded metadata dict.

    Args:
        patient_id: key into ``all_metadata``.
        all_metadata: mapping of patient ID to a dict holding ``structures``
            and ``beams``.

    Returns:
        dict with ``ptv_volume`` (the ``volume_cc`` of the first structure
        whose name contains "PTV", case-insensitively, or None), ``num_beams``
        and ``beams``.
    """
    patient_meta = all_metadata[patient_id]
    structure_list = patient_meta["structures"]
    beam_list = patient_meta["beams"]

    # Only the first PTV-like structure counts, even if it has no volume entry.
    first_ptv_volume = next(
        (entry.get("volume_cc") for entry in structure_list if "PTV" in entry["name"].upper()),
        None,
    )

    summary = {"ptv_volume": first_ptv_volume}
    summary["num_beams"] = len(beam_list)
    summary["beams"] = beam_list
    return summary

def _parse_angle_filter(text):
    """Parse a comma-separated angle filter into a set of numbers.

    Empty tokens (e.g. from a trailing comma) are ignored. Returns None when
    the text contains no values, which callers treat as "no filter".
    Raises ValueError for a token that is not a number.
    """
    if not text:
        return None
    # float() accepts both "40" and "37.5"; 40.0 compares and hashes equal to 40,
    # so integer metadata values still match.
    values = {float(tok) for tok in text.split(",") if tok.strip()}
    return values or None


def filter_matched_data(filtered_patients, query_ptv_vol, beam_gantry_filter,
                            beam_collimator_filter, beam_energy_filter, beam_couch_filter,
                            only_planner, all_metadata):
    """Return the patients and beams that satisfy the sidebar filters.

    Args:
        filtered_patients: DataFrame with a ``patient_id`` column.
        query_ptv_vol: minimum PTV volume; patients below it, or with no PTV
            volume, are dropped.
        beam_gantry_filter, beam_collimator_filter, beam_couch_filter:
            comma-separated angle lists; an empty string disables the filter.
        beam_energy_filter: comma-separated energies; spaces are ignored.
        only_planner: when true, keep only beams listed in the patient's
            ``planner_beam_ids``.
        all_metadata: mapping of patient ID to its cached metadata.

    Returns:
        pandas.DataFrame with columns ``patient_id``, ``num_beams``,
        ``ptv_volume`` and ``selected_beam_ids``. The columns are present
        even when no patient matches, so callers can index them safely.

    Raises:
        ValueError: if an angle filter contains a non-numeric token.
    """
    result_columns = ["patient_id", "num_beams", "ptv_volume", "selected_beam_ids"]

    gantry_angles = _parse_angle_filter(beam_gantry_filter)
    collimator_angles = _parse_angle_filter(beam_collimator_filter)
    couch_angles = _parse_angle_filter(beam_couch_filter)
    energies = None
    if beam_energy_filter:
        tokens = beam_energy_filter.replace(" ", "").split(",")
        energies = {tok for tok in tokens if tok} or None

    matched = []
    for pid in filtered_patients["patient_id"]:
        # Retrieve metadata for the patient from the pre-cached all_metadata
        summary = get_patient_summary_from_cached_data(pid, all_metadata)
        if summary["ptv_volume"] is None or summary["ptv_volume"] < query_ptv_vol:
            continue

        # Filter beams by all conditions
        selected_beams = summary["beams"]
        if gantry_angles:
            selected_beams = [b for b in selected_beams if b["gantry_angle"] in gantry_angles]
        if collimator_angles:
            selected_beams = [b for b in selected_beams if b["collimator_angle"] in collimator_angles]
        if couch_angles:
            selected_beams = [b for b in selected_beams if b["couch_angle"] in couch_angles]
        if energies:
            # The filter values are strings; str() leaves string metadata unchanged
            # and lets numeric metadata match too (metadata type not visible here).
            selected_beams = [b for b in selected_beams if str(b["energy_MV"]) in energies]

        selected_beam_ids = [b["ID"] for b in selected_beams]
        if not selected_beam_ids:
            continue

        if only_planner:
            planner_beam_ids = set(all_metadata[pid]["planner_beam_ids"])
            # Keep the original beam order instead of arbitrary set order.
            selected_beam_ids = [bid for bid in selected_beam_ids if bid in planner_beam_ids]
            if not selected_beam_ids:
                continue

        matched.append({
            "patient_id": pid,
            "num_beams": len(selected_beam_ids),
            "ptv_volume": summary["ptv_volume"],
            "selected_beam_ids": selected_beam_ids
        })

    return pd.DataFrame(matched, columns=result_columns)

def _download_with_retries(repo_id, hf_path, local_dir, max_retries):
    """Download one dataset file, retrying on failure.

    Returns the local path, or None if every attempt failed. The failure is
    reported through ``st.error`` only after the final attempt.
    """
    from huggingface_hub import hf_hub_download

    for attempt in range(max_retries):
        try:
            return hf_hub_download(
                repo_id=repo_id,
                repo_type="dataset",
                filename=hf_path,
                local_dir=local_dir,
                token=token
            )
        except Exception as e:
            if attempt == max_retries - 1:
                st.error(f"Failed to download {hf_path}: {e}")
    return None


def download_data(repo_id, patient_ids, beam_ids=None, planner_beam_ids=True, max_retries=2, local_dir='./', download_dicom=True):
    """Download patient data files from the dataset repo into ``local_dir``.

    For each patient the fixed set of CT / structure / optimization-voxel
    files and ``PlannerBeams.json`` are fetched, then optionally every file
    under ``DicomFiles/``, then the data and metadata files of each beam.

    Args:
        repo_id: dataset repository ID.
        patient_ids: iterable of patient IDs to download.
        beam_ids: beam IDs to fetch for every patient; ignored when
            ``planner_beam_ids`` is true.
        planner_beam_ids: when true, read each patient's beam IDs from its
            downloaded ``PlannerBeams.json`` instead of using ``beam_ids``.
        max_retries: attempts per file before the failure is reported.
        local_dir: destination directory.
        download_dicom: whether to fetch the patient's DICOM files.

    Returns:
        list of local paths of the files that were downloaded successfully.
        Failures are reported with ``st.error`` and do not raise.
    """
    static_files = (
        "CT_Data.h5", "CT_MetaData.json",
        "StructureSet_Data.h5", "StructureSet_MetaData.json",
        "OptimizationVoxels_Data.h5", "OptimizationVoxels_MetaData.json",
        "PlannerBeams.json",
    )

    downloaded_files = []
    # The repo listing does not depend on the patient: fetch it lazily, once.
    repo_files = None

    def fetch(hf_path):
        local_path = _download_with_retries(repo_id, hf_path, local_dir, max_retries)
        if local_path is not None:
            downloaded_files.append(local_path)

    for patient_id in patient_ids:
        # 1. Fixed per-patient files.
        for filename in static_files:
            fetch(posixpath.join("data", patient_id, filename))

        # 2. All DICOM files under data/<patient_id>/DicomFiles/.
        if download_dicom:
            try:
                if repo_files is None:
                    # Forward the token so the listing also works for a private repo.
                    repo_files = list_repo_files(repo_id, repo_type="dataset", token=token)
            except Exception as e:
                st.error(f"Error listing DICOM files for {patient_id}: {e}")
            else:
                dicom_prefix = f"data/{patient_id}/DicomFiles/"
                for hf_path in repo_files:
                    if hf_path.startswith(dicom_prefix):
                        fetch(hf_path)

        # 3. Beams: either the planner's own list or the caller-supplied IDs.
        # A per-patient local is used so the ``beam_ids`` argument is never overwritten.
        patient_beam_ids = beam_ids
        if planner_beam_ids:
            planner_file = os.path.join(local_dir, 'data', patient_id, "PlannerBeams.json")
            try:
                with open(planner_file, "r", encoding="utf-8") as f:
                    patient_beam_ids = json.load(f).get("IDs", [])
            except Exception as e:
                st.error(f"Error reading PlannerBeams.json: {e}")
                patient_beam_ids = []

        if patient_beam_ids is not None:
            for bid in patient_beam_ids:
                for beam_file in (f"Beams/Beam_{bid}_Data.h5", f"Beams/Beam_{bid}_MetaData.json"):
                    fetch(posixpath.join("data", patient_id, beam_file))
    return downloaded_files

from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

def show_aggrid_table(df):
    """Render ``df`` as an AgGrid table with checkbox multi-selection.

    Returns the AgGrid response object, which is updated whenever the
    selection changes.
    """
    builder = GridOptionsBuilder.from_dataframe(df)
    builder.configure_default_column(
        groupable=True,
        value=True,
        enableRowGroup=True,
        aggFunc='sum',
        editable=False,
    )
    builder.configure_grid_options(domLayout='normal')

    # Multi-row selection, with the checkbox shown in the patient_id column.
    builder.configure_selection('multiple', use_checkbox=True)
    builder.configure_column("patient_id", checkboxSelection=True)

    return AgGrid(
        df,
        gridOptions=builder.build(),
        enable_enterprise_modules=False,
        allow_unsafe_jscode=True,
        fit_columns_on_grid_load=True,
        theme='balham',
        update_mode=GridUpdateMode.SELECTION_CHANGED,
    )

def main():
    """Render the PortPy metadata explorer page.

    Builds the sidebar filters, shows the matching patients in a selectable
    grid, displays structure and beam metadata for the selected rows and, on
    request, downloads the chosen patients into ``./downloaded`` and offers
    them as a single zip file.
    """
    st.set_page_config(page_title="PortPy Metadata Explorer", layout="wide")
    st.title("📊 PortPy Metadata Explorer & Downloader")

    patient_df = get_patient_ids()
    # Missing disease sites are dropped so they are not offered as a choice.
    disease_site = st.sidebar.selectbox("Select Disease Site", patient_df["disease_site"].dropna().unique())
    all_metadata = load_all_metadata(disease_site)  # Load and cache all metadata for selected disease site

    filtered_patients = pd.DataFrame(list(all_metadata), columns=["patient_id"])

    beam_gantry_filter = st.sidebar.text_input("Gantry Angles (comma-separated)", "")
    beam_collimator_filter = st.sidebar.text_input("Collimator Angles (comma-separated)", "")
    beam_energy_filter = st.sidebar.text_input("Beam Energies (comma-separated)", "")
    beam_couch_filter = st.sidebar.text_input("Couch Angles (comma-separated)", "")
    query_ptv_vol = st.sidebar.number_input("Minimum PTV volume (cc):", value=0)

    # Checkbox: Only planner beams
    only_planner = st.sidebar.checkbox(
        "Show only planner beams (if selected it will download only planner beams)",
        value=True,
    )

    results_df = filter_matched_data(
        filtered_patients, query_ptv_vol, beam_gantry_filter,
        beam_collimator_filter, beam_energy_filter, beam_couch_filter,
        only_planner, all_metadata
    )

    # An empty result may carry no columns at all, so never index it blindly.
    has_results = "patient_id" in results_df.columns and not results_df.empty

    # Summary table
    if has_results:
        grid_response = show_aggrid_table(results_df)
        selected_rows = grid_response.get("selected_rows", None)
    else:
        st.info("No patients match the current filters.")
        selected_rows = None

    # The grid may hand back the selection as a list of row dicts rather than
    # a DataFrame; normalise so both forms are displayed.
    if isinstance(selected_rows, list):
        selected_rows = pd.DataFrame(selected_rows)

    if isinstance(selected_rows, pd.DataFrame) and not selected_rows.empty:
        for pid in selected_rows["patient_id"]:
            st.markdown(f"### Patient: {pid}")
            st.markdown("#### Structures")
            st.dataframe(pd.DataFrame(all_metadata[pid]["structures"]))
            st.markdown("#### Beams")
            st.dataframe(pd.DataFrame(all_metadata[pid]["beams"]))

    downloadable_ids = results_df["patient_id"].tolist() if has_results else []

    if "open_download_expander" not in st.session_state:
        st.session_state["open_download_expander"] = False
    with st.expander("Download matched patients", expanded=st.session_state["open_download_expander"]):
        # Multi-select and download
        to_download = st.sidebar.multiselect("Select Patients to Download", downloadable_ids)

        if st.sidebar.button("Download Selected Patients"):
            st.session_state["open_download_expander"] = True  # Force open expander
            if not to_download:
                st.warning("No patients selected.")
            else:
                progress = st.progress(0)
                status = st.empty()

                # Start from an empty download directory on every request.
                local_dir = "./downloaded"
                if os.path.exists(local_dir):
                    shutil.rmtree(local_dir)
                os.makedirs(local_dir, exist_ok=True)

                wanted = set(to_download)
                patient_to_beams = {
                    row["patient_id"]: row["selected_beam_ids"]
                    for _, row in results_df.iterrows()
                    if row["patient_id"] in wanted
                }

                total = len(patient_to_beams)
                for i, (pid, beam_ids) in enumerate(patient_to_beams.items(), start=1):
                    status.write(f"Downloading {pid} ({i}/{total})…")

                    # selected_beam_ids already reflects every sidebar filter,
                    # including "only planner beams". Passing planner_beam_ids=True
                    # would make download_data discard beam_ids and fetch every
                    # planner beam, ignoring the angle/energy filters.
                    download_data(REPO_ID, [pid], beam_ids=beam_ids,
                                  planner_beam_ids=False,
                                  local_dir=local_dir, download_dicom=True)

                    progress.progress(i / total)

                status.success("All downloads complete. Preparing zip…")

                # Guard the total size to avoid crashes on oversized selections.
                total_bytes = 0
                for root, _, files in os.walk(local_dir):
                    for f in files:
                        total_bytes += os.path.getsize(os.path.join(root, f))
                total_gb = total_bytes / (1024 ** 3)
                status.write(f"Preparing zip (~{total_gb:.2f} GB)…")
                if total_gb > 40.0:
                    st.error("Selection too large for a single zip. Please download fewer patients.")
                    st.stop()

                # Unique name per request so separate clicks never share a file.
                zip_path = os.path.join(tempfile.gettempdir(), f"portpy_patients_{uuid.uuid4().hex}.zip")
                try:
                    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_STORED, allowZip64=True) as zf:
                        for root, _, files in os.walk(local_dir):
                            for f in files:
                                full_path = os.path.join(root, f)
                                rel_path = os.path.relpath(full_path, local_dir)
                                zf.write(full_path, rel_path)

                    with open(zip_path, "rb") as fp:
                        st.download_button(
                            label="Your download is ready! Click to save.",
                            data=fp,
                            file_name="portpy_patients.zip",
                            mime="application/zip",
                        )
                finally:
                    # The file handle is closed as soon as download_button returns,
                    # so the button cannot depend on the file afterwards; remove the
                    # temporary zip instead of leaving one behind per click.
                    if os.path.exists(zip_path):
                        os.remove(zip_path)

if __name__ == "__main__":
    main()
    # To launch the app from a shell: streamlit run app.py