| | import streamlit as st |
| | import pandas as pd |
| | import json |
| | import os |
| | import posixpath |
| | from huggingface_hub import hf_hub_download |
| | from huggingface_hub import list_repo_files |
| | import io |
| | import zipfile |
| | import shutil |
| | import tempfile, uuid |
| |
|
| | |
# Hugging Face dataset repository that every download/listing call in this file targets.
REPO_ID = "PortPy-Project/PortPy_Dataset"

# Access token taken from the HF_TOKEN environment variable; None when it is unset.
token = os.environ.get("HF_TOKEN")
| |
|
@st.cache_data
def get_patient_ids():
    """Return a DataFrame listing every patient ID and its disease site.

    Downloads ``data_info.jsonl`` from the dataset repository and reads one
    JSON record per line, keeping the ``patient_id`` field of each record.

    Returns:
        A pandas DataFrame with two columns: ``patient_id`` and
        ``disease_site``. The disease site is the text before the first
        underscore of the patient ID (missing when the ID has no underscore).
    """
    info_path = hf_hub_download(REPO_ID, repo_type="dataset", filename="data_info.jsonl", token=token)
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(info_path, encoding="utf-8") as f:
        # Blank lines (e.g. extra newlines at the end of the file) are skipped
        # because json.loads would reject them.
        patient_ids = [json.loads(line)["patient_id"] for line in f if line.strip()]
    df = pd.DataFrame(patient_ids, columns=["patient_id"])
    # expand=False returns a Series for the single capture group.
    df["disease_site"] = df["patient_id"].str.extract(r"^(.*?)_", expand=False)
    return df
| |
|
@st.cache_data
def _list_all_repo_files():
    """Return the paths of all files in the dataset repository.

    The result is cached by ``st.cache_data``. The access token is forwarded so
    the listing uses the same credentials as the ``hf_hub_download`` calls in
    this file.
    """
    return list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=token)
| |
|
@st.cache_data
def load_all_metadata(disease_site):
    """Gather structure, beam and planner-beam metadata for one disease site.

    Args:
        disease_site: Value compared against the ``disease_site`` column of the
            table returned by ``get_patient_ids``.

    Returns:
        A dict keyed by patient ID. Each value is a dict with the keys
        ``structures``, ``beams`` and ``planner_beam_ids``; the latter is the
        ``IDs`` entry of the patient's ``PlannerBeams.json`` (empty list when
        that entry is absent).
    """
    all_patients = get_patient_ids()
    site_mask = all_patients["disease_site"] == disease_site
    site_patient_ids = all_patients[site_mask]["patient_id"]

    metadata = {}
    for patient_id in site_patient_ids:
        structure_records = load_structure_metadata(patient_id)
        beam_records = load_beam_metadata(patient_id)

        planner_path = hf_hub_download(
            REPO_ID,
            repo_type="dataset",
            filename=f"data/{patient_id}/PlannerBeams.json",
            token=token,
        )
        with open(planner_path) as planner_fh:
            planner_ids = json.load(planner_fh).get("IDs", [])

        metadata[patient_id] = {
            "structures": structure_records,
            "beams": beam_records,
            "planner_beam_ids": planner_ids,
        }

    return metadata
| |
|
@st.cache_data
def load_structure_metadata(patient_id):
    """Download and parse ``StructureSet_MetaData.json`` for *patient_id*.

    Returns:
        The parsed JSON content of the file.
    """
    remote_name = f"data/{patient_id}/StructureSet_MetaData.json"
    local_path = hf_hub_download(REPO_ID, repo_type="dataset", filename=remote_name, token=token)
    with open(local_path) as fh:
        parsed = json.load(fh)
    return parsed
| |
|
@st.cache_data
def load_beam_metadata(patient_id):
    """Download and parse every beam metadata file of *patient_id*.

    Repository paths are selected when they start with
    ``data/<patient_id>/Beams/Beam_`` and end with ``_MetaData.json``.

    Returns:
        A list with the parsed JSON content of each matching file, in the
        order the repository listing returns them.
    """
    wanted_prefix = f"data/{patient_id}/Beams/Beam_"
    wanted_suffix = "_MetaData.json"

    beam_meta = []
    for repo_path in _list_all_repo_files():
        if not (repo_path.startswith(wanted_prefix) and repo_path.endswith(wanted_suffix)):
            continue
        local_path = hf_hub_download(REPO_ID, repo_type="dataset", filename=repo_path, token=token)
        with open(local_path) as fh:
            beam_meta.append(json.load(fh))
    return beam_meta
| |
|
def get_patient_summary_from_cached_data(patient_id, all_metadata):
    """Summarise one patient from already-loaded metadata.

    Args:
        patient_id: Key into *all_metadata*.
        all_metadata: Mapping of patient ID to a dict holding ``structures``
            and ``beams`` entries.

    Returns:
        A dict with ``ptv_volume`` (the ``volume_cc`` of the first structure
        whose upper-cased name contains "PTV", or None when there is no such
        structure or it has no ``volume_cc``), ``num_beams`` and ``beams``.
    """
    structure_list = all_metadata[patient_id]["structures"]
    beam_list = all_metadata[patient_id]["beams"]

    # Only the first PTV-like structure is considered, even if it has no volume.
    first_ptv_volume = next(
        (entry.get("volume_cc") for entry in structure_list if "PTV" in entry["name"].upper()),
        None,
    )

    return {"ptv_volume": first_ptv_volume, "num_beams": len(beam_list), "beams": beam_list}
| |
|
# Column layout of the result table; fixed so an empty result still has these columns.
_MATCH_COLUMNS = ["patient_id", "num_beams", "ptv_volume", "selected_beam_ids"]


def _parse_numeric_filter(text):
    """Parse a comma-separated list of numbers into a set; None when nothing is given.

    Blank entries (e.g. from a trailing comma) are ignored. A non-numeric entry
    raises ValueError with a message naming the offending value.
    """
    if not text:
        return None
    values = set()
    for part in text.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            # float accepts both "40" and "40.0"; 40.0 compares equal to 40 in a set.
            values.add(float(part))
        except ValueError:
            raise ValueError(f"Invalid numeric filter value: {part!r}") from None
    return values or None


def _parse_text_filter(text):
    """Parse a comma-separated list of labels into a set; None when nothing is given."""
    if not text:
        return None
    values = set(text.replace(" ", "").split(","))
    values.discard("")  # left behind by trailing or doubled commas
    return values or None


def filter_matched_data(filtered_patients, query_ptv_vol, beam_gantry_filter,
                        beam_collimator_filter, beam_energy_filter, beam_couch_filter,
                        only_planner, all_metadata):
    """Select the patients and beams that satisfy the given filters.

    Args:
        filtered_patients: Table whose ``patient_id`` column lists the
            candidate patients.
        query_ptv_vol: Minimum PTV volume; patients with a smaller or unknown
            PTV volume are dropped.
        beam_gantry_filter: Comma-separated gantry angles, or an empty string
            for no gantry filter.
        beam_collimator_filter: Comma-separated collimator angles, or empty.
        beam_energy_filter: Comma-separated beam energies, or empty.
        beam_couch_filter: Comma-separated couch angles, or empty.
        only_planner: When true, keep only beams whose ID is listed in the
            patient's ``planner_beam_ids``.
        all_metadata: Mapping of patient ID to its loaded metadata.

    Returns:
        A pandas DataFrame with the columns ``patient_id``, ``num_beams``,
        ``ptv_volume`` and ``selected_beam_ids``. The columns are present even
        when no patient matches.

    Raises:
        ValueError: If an angle filter contains a non-numeric entry.
    """
    gantry_angles = _parse_numeric_filter(beam_gantry_filter)
    collimator_angles = _parse_numeric_filter(beam_collimator_filter)
    couch_angles = _parse_numeric_filter(beam_couch_filter)
    energies = _parse_text_filter(beam_energy_filter)

    matched = []
    for pid in filtered_patients["patient_id"]:
        summary = get_patient_summary_from_cached_data(pid, all_metadata)
        ptv_volume = summary["ptv_volume"]
        if ptv_volume is None or ptv_volume < query_ptv_vol:
            continue

        selected_beams = summary["beams"]
        if gantry_angles:
            selected_beams = [b for b in selected_beams if b["gantry_angle"] in gantry_angles]
        if collimator_angles:
            selected_beams = [b for b in selected_beams if b["collimator_angle"] in collimator_angles]
        if couch_angles:
            selected_beams = [b for b in selected_beams if b["couch_angle"] in couch_angles]
        if energies:
            # The filter tokens are strings, so compare against the string form
            # of the stored energy; this is a no-op when it is already a string.
            selected_beams = [b for b in selected_beams if str(b["energy_MV"]) in energies]

        selected_beam_ids = [b["ID"] for b in selected_beams]
        if not selected_beam_ids:
            continue

        if only_planner:
            planner_beam_ids = set(all_metadata[pid]["planner_beam_ids"])
            # List comprehension (not set intersection) keeps the beam order stable.
            selected_beam_ids = [bid for bid in selected_beam_ids if bid in planner_beam_ids]
            if not selected_beam_ids:
                continue

        matched.append({
            "patient_id": pid,
            "num_beams": len(selected_beam_ids),
            "ptv_volume": ptv_volume,
            "selected_beam_ids": selected_beam_ids,
        })

    # Explicit columns: pd.DataFrame([]) alone would have no "patient_id" column.
    return pd.DataFrame(matched, columns=_MATCH_COLUMNS)
| |
|
def _download_with_retries(repo_id, hf_path, local_dir, max_retries):
    """Download one repository file, retrying on failure; return its local path or None."""
    # Always make at least one attempt, even if the caller passes 0.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            return hf_hub_download(
                repo_id=repo_id,
                repo_type="dataset",
                filename=hf_path,
                local_dir=local_dir,
                token=token,
            )
        except Exception as e:
            # Report only once, after the final attempt has failed.
            if attempt == attempts - 1:
                st.error(f"Failed to download {hf_path}: {e}")
    return None


def download_data(repo_id, patient_ids, beam_ids=None, planner_beam_ids=True, max_retries=2, local_dir='./', download_dicom=True):
    """Download patient files from a dataset repository into *local_dir*.

    For every patient the fixed set of CT / structure / optimization-voxel /
    planner files is fetched, optionally followed by everything under
    ``DicomFiles/`` and the data + metadata files of the chosen beams.
    Failures are reported with ``st.error`` and do not abort the remaining
    downloads.

    Args:
        repo_id: Repository to download from.
        patient_ids: Iterable of patient IDs.
        beam_ids: Beam IDs to download for each patient. Ignored when
            *planner_beam_ids* is true.
        planner_beam_ids: When true, the beam IDs are read from the ``IDs``
            entry of each patient's downloaded ``PlannerBeams.json``.
        max_retries: Number of download attempts per file (at least one is
            always made).
        local_dir: Directory the files are downloaded into.
        download_dicom: Whether to download the patient's DICOM files.

    Returns:
        List of local paths of the files that were downloaded successfully.
    """
    static_files = (
        "CT_Data.h5", "CT_MetaData.json",
        "StructureSet_Data.h5", "StructureSet_MetaData.json",
        "OptimizationVoxels_Data.h5", "OptimizationVoxels_MetaData.json",
        "PlannerBeams.json",
    )
    downloaded_files = []

    def fetch(hf_path):
        local_path = _download_with_retries(repo_id, hf_path, local_dir, max_retries)
        if local_path is not None:
            downloaded_files.append(local_path)

    # The repository listing does not depend on the patient, so it is fetched
    # at most once per successful call and reused for every patient.
    repo_files = None

    for patient_id in patient_ids:
        for filename in static_files:
            fetch(posixpath.join("data", patient_id, filename))

        if download_dicom:
            if repo_files is None:
                try:
                    repo_files = list_repo_files(repo_id, repo_type="dataset", token=token)
                except Exception as e:
                    st.error(f"Error listing DICOM files for {patient_id}: {e}")
            if repo_files is not None:
                dicom_prefix = f"data/{patient_id}/DicomFiles/"
                for hf_path in repo_files:
                    if hf_path.startswith(dicom_prefix):
                        fetch(hf_path)

        patient_beam_ids = beam_ids
        if planner_beam_ids:
            # PlannerBeams.json was part of the static files downloaded above.
            planner_file = os.path.join(local_dir, 'data', patient_id, "PlannerBeams.json")
            try:
                with open(planner_file, "r", encoding="utf-8") as f:
                    patient_beam_ids = json.load(f).get("IDs", [])
            except Exception as e:
                st.error(f"Error reading PlannerBeams.json: {e}")
                patient_beam_ids = []

        if patient_beam_ids is not None:
            for bid in patient_beam_ids:
                for beam_file in (f"Beams/Beam_{bid}_Data.h5", f"Beams/Beam_{bid}_MetaData.json"):
                    fetch(posixpath.join("data", patient_id, beam_file))

    return downloaded_files
| |
|
| | from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode |
| |
|
def show_aggrid_table(df):
    """Render *df* as an AgGrid table with multi-row checkbox selection.

    Args:
        df: DataFrame to display; the ``patient_id`` column is configured to
            carry the selection checkbox.

    Returns:
        The object returned by ``AgGrid`` for the rendered table.
    """
    builder = GridOptionsBuilder.from_dataframe(df)

    default_column_settings = {
        "groupable": True,
        "value": True,
        "enableRowGroup": True,
        "aggFunc": 'sum',
        "editable": False,
    }
    builder.configure_default_column(**default_column_settings)
    builder.configure_grid_options(domLayout='normal')

    # Selection setup: several rows may be ticked at once.
    builder.configure_selection('multiple', use_checkbox=True)
    builder.configure_column("patient_id", checkboxSelection=True)

    return AgGrid(
        df,
        gridOptions=builder.build(),
        enable_enterprise_modules=False,
        allow_unsafe_jscode=True,
        fit_columns_on_grid_load=True,
        theme='balham',
        update_mode=GridUpdateMode.SELECTION_CHANGED,
    )
| |
|
# Largest total download size (in GiB) that is still packed into a single zip.
_MAX_ZIP_SIZE_GB = 40.0


def _directory_size_bytes(directory):
    """Return the combined size in bytes of all files below *directory*."""
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, names in os.walk(directory)
        for name in names
    )


def _zip_directory(directory, zip_path):
    """Store every file below *directory* (uncompressed) in a zip at *zip_path*."""
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_STORED, allowZip64=True) as zf:
        for root, _, names in os.walk(directory):
            for name in names:
                full_path = os.path.join(root, name)
                zf.write(full_path, os.path.relpath(full_path, directory))


def main():
    """Run the Streamlit page: filter patients, show details, offer a zip download.

    The sidebar collects the disease site and beam/PTV filters. Matching
    patients are shown in a grid; rows selected there get their structure and
    beam tables displayed. Patients picked in the sidebar multiselect are
    downloaded into ``./downloaded``, zipped, and offered via a download button.
    """
    st.set_page_config(page_title="PortPy Metadata Explorer", layout="wide")
    st.title("📊 PortPy Metadata Explorer & Downloader")

    patient_df = get_patient_ids()
    disease_site = st.sidebar.selectbox("Select Disease Site", patient_df["disease_site"].unique())
    all_metadata = load_all_metadata(disease_site)

    filtered_patients = pd.DataFrame(all_metadata.keys(), columns=["patient_id"])

    beam_gantry_filter = st.sidebar.text_input("Gantry Angles (comma-separated)", "")
    beam_collimator_filter = st.sidebar.text_input("Collimator Angles (comma-separated)", "")
    beam_energy_filter = st.sidebar.text_input("Beam Energies (comma-separated)", "")
    beam_couch_filter = st.sidebar.text_input("Couch Angles (comma-separated)", "")
    query_ptv_vol = st.sidebar.number_input("Minimum PTV volume (cc):", value=0)

    only_planner = st.sidebar.checkbox(
        "Show only planner beams (if selected it will download only planner beams)",
        value=True,
    )

    results_df = filter_matched_data(
        filtered_patients, query_ptv_vol, beam_gantry_filter,
        beam_collimator_filter, beam_energy_filter, beam_couch_filter,
        only_planner, all_metadata
    )

    grid_response = show_aggrid_table(results_df)

    selected_rows = grid_response.get("selected_rows", pd.DataFrame())
    # NOTE(review): some st_aggrid releases appear to return the selection as a
    # list of row dicts instead of a DataFrame; normalise so both are handled.
    if isinstance(selected_rows, list):
        selected_rows = pd.DataFrame(selected_rows)

    if isinstance(selected_rows, pd.DataFrame) and not selected_rows.empty:
        for _, row in selected_rows.iterrows():
            pid = row["patient_id"]
            st.markdown(f"### Patient: {pid}")
            st.markdown("#### Structures")
            st.dataframe(pd.DataFrame(all_metadata[pid]["structures"]))
            st.markdown("#### Beams")
            st.dataframe(pd.DataFrame(all_metadata[pid]["beams"]))

    if "open_download_expander" not in st.session_state:
        st.session_state["open_download_expander"] = False
    with st.expander("Download matched patients", expanded=st.session_state["open_download_expander"]):
        # An empty result table may lack the "patient_id" column entirely.
        if "patient_id" in results_df.columns:
            matched_ids = results_df["patient_id"].tolist()
        else:
            matched_ids = []
        to_download = st.sidebar.multiselect("Select Patients to Download", matched_ids)

        if st.sidebar.button("Download Selected Patients"):
            st.session_state["open_download_expander"] = True
            if not to_download:
                st.warning("No patients selected.")
            else:
                progress = st.progress(0)
                status = st.empty()

                # Start from a clean download directory on every request.
                local_dir = "./downloaded"
                if os.path.exists(local_dir):
                    shutil.rmtree(local_dir)
                os.makedirs(local_dir, exist_ok=True)

                patient_to_beams = {
                    row["patient_id"]: row["selected_beam_ids"]
                    for _, row in results_df.iterrows()
                    if row["patient_id"] in to_download
                }

                total = len(patient_to_beams)
                for i, (pid, beam_ids) in enumerate(patient_to_beams.items(), start=1):
                    status.write(f"Downloading {pid} ({i}/{total})…")

                    download_data(REPO_ID, [pid], beam_ids=beam_ids,
                                  planner_beam_ids=only_planner,
                                  local_dir=local_dir, download_dicom=True)

                    progress.progress(i / total)

                status.success("All downloads complete. Preparing zip…")

                # The uuid makes the name unique, so no earlier file can be in the way.
                zip_path = os.path.join(tempfile.gettempdir(), f"portpy_patients_{uuid.uuid4().hex}.zip")

                total_gb = _directory_size_bytes(local_dir) / (1024 ** 3)
                status.write(f"Preparing zip (~{total_gb:.2f} GB)…")
                if total_gb > _MAX_ZIP_SIZE_GB:
                    st.error("Selection too large for a single zip. Please download fewer patients.")
                    st.stop()

                try:
                    _zip_directory(local_dir, zip_path)
                    with open(zip_path, "rb") as fp:
                        st.download_button(
                            label="Your download is ready! Click to save.",
                            data=fp,
                            file_name="portpy_patients.zip",
                            mime="application/zip",
                        )
                finally:
                    # The file handle is closed once the button call returns, so
                    # the button cannot depend on the file afterwards; remove the
                    # temporary zip instead of leaving it in the temp directory.
                    if os.path.exists(zip_path):
                        os.remove(zip_path)


if __name__ == "__main__":
    main()
| | |