| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| import subprocess |
| import sys |
|
|
def install_packages(packages=None):
    """Install any required third-party packages that cannot be imported yet.

    Args:
        packages: Optional mapping of pip distribution name -> importable
            module name. When omitted, the distributions this script relies
            on are used.

    Returns:
        List of distribution names that had to be installed (empty when
        every module was already importable).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    if packages is None:
        # The distribution name and the import name differ for
        # python-dotenv: ``__import__('python-dotenv')`` always raises
        # ImportError, which re-ran pip on every single start-up.
        packages = {
            'openai': 'openai',
            'gradio': 'gradio',
            'python-dotenv': 'dotenv',
            'requests': 'requests',
            'pandas': 'pandas',
        }

    installed = []
    for package, module_name in packages.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed.append(package)
    return installed
|
|
| |
# Bootstrap: make sure the third-party modules are importable before the
# import statements that follow are executed.
install_packages()
|
|
| |
import io
import json
import os
import random
import re
import time
import zipfile
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import pandas as pd
from openai import OpenAI
|
|
| |
| |
| |
|
|
class MedicalLiteratureSimulator:
    """Simulates medical literature research for health dataset generation.

    Every "study" is synthetic: all fields are drawn at random from the
    hard-coded ``research_domains`` tables below.
    """

    def __init__(self):
        # domain -> candidate interventions / biomarkers / outcomes used to
        # assemble synthetic study records.
        self.research_domains = {
            "longevity": {
                "interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"],
                "biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"],
                "outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"]
            },
            "metabolic_health": {
                "interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"],
                "biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"],
                "outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"]
            },
            "cardiovascular": {
                "interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"],
                "biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"],
                "outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"]
            },
            "cognitive": {
                "interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"],
                "biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"],
                "outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"]
            },
            "hormonal": {
                "interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"],
                "biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"],
                "outcomes": ["hormone balance", "improved energy", "better sleep quality"]
            },
            "inflammation": {
                "interventions": ["curcumin", "omega-3", "quercetin", "boswellia"],
                "biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"],
                "outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"]
            }
        }

    def generate_study_data(self, domain: str) -> Dict[str, Any]:
        """Generate one synthetic study record for ``domain``.

        Args:
            domain: Key of ``research_domains``; an unknown key falls back
                to ``"longevity"``.

        Returns:
            Dict with the keys ``pmid``, ``title``, ``abstract``,
            ``journal``, ``year``, ``domain``, ``interventions``,
            ``biomarkers``, ``outcomes``, ``participant_count``,
            ``duration_weeks`` and ``dosages``.
        """
        if domain not in self.research_domains:
            domain = "longevity"

        domain_data = self.research_domains[domain]

        # Draw the shared facts once so that title, abstract, dosages and
        # the structured fields all describe the same trial.
        interventions = random.sample(
            domain_data["interventions"], min(2, len(domain_data["interventions"]))
        )
        primary = interventions[0]
        participant_count = random.randint(50, 300)
        duration_weeks = random.choice([8, 12, 16, 24])

        return {
            "pmid": f"PMID{random.randint(35000000, 40000000)}",
            "title": self._generate_study_title(domain, domain_data, intervention=primary),
            "abstract": self._generate_study_abstract(
                domain,
                domain_data,
                intervention=primary,
                participant_count=participant_count,
                duration_weeks=duration_weeks,
            ),
            "journal": random.choice([
                "Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine",
                "Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition"
            ]),
            "year": random.choice([2023, 2024]),
            "domain": domain,
            "interventions": interventions,
            "biomarkers": random.sample(domain_data["biomarkers"], min(3, len(domain_data["biomarkers"]))),
            "outcomes": random.sample(domain_data["outcomes"], min(2, len(domain_data["outcomes"]))),
            "participant_count": participant_count,
            "duration_weeks": duration_weeks,
            "dosages": self._generate_dosages(primary)
        }

    def _generate_study_title(self, domain: str, domain_data: Dict,
                              intervention: Optional[str] = None) -> str:
        """Return a random title; ``intervention`` defaults to a random pick."""
        if intervention is None:
            intervention = random.choice(domain_data["interventions"])
        outcome = random.choice(domain_data["outcomes"])

        titles = [
            f"Effects of {intervention} on {outcome}: A randomized controlled trial",
            f"{intervention} supplementation improves {outcome} in healthy adults",
            f"Clinical evaluation of {intervention} for {outcome} optimization",
            f"Randomized trial of {intervention} in {outcome} enhancement"
        ]

        return random.choice(titles)

    def _generate_study_abstract(self, domain: str, domain_data: Dict,
                                 intervention: Optional[str] = None,
                                 participant_count: Optional[int] = None,
                                 duration_weeks: Optional[int] = None) -> str:
        """Return a structured abstract (Background/Objective/Methods/...).

        The optional arguments let the caller keep the abstract consistent
        with the rest of the study record. When omitted, the first listed
        intervention and freshly drawn numbers are used.
        """
        if intervention is None:
            intervention = domain_data["interventions"][0]
        if participant_count is None:
            participant_count = random.randint(120, 250)
        if duration_weeks is None:
            duration_weeks = random.randint(12, 24)

        biomarker = random.choice(domain_data["biomarkers"])
        outcome = random.choice(domain_data["outcomes"])
        improvement = random.randint(15, 35)

        # str.capitalize() lower-cases everything after the first character
        # ("HbA1c" -> "Hba1c", "CRP" -> "Crp"); only touch the first letter.
        biomarker_label = biomarker[:1].upper() + biomarker[1:]

        lines = [
            f"Background: {intervention} has shown promise in preliminary studies for health optimization.",
            "",
            f"Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes.",
            "",
            f"Methods: Randomized, double-blind, placebo-controlled trial with {participant_count} participants aged 40-65 years.",
            f"Subjects received {intervention} or placebo for {duration_weeks} weeks.",
            "",
            f"Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05).",
            f"{biomarker_label} showed {improvement}% improvement from baseline.",
            "Secondary outcomes included improved quality of life and no serious adverse events.",
            "",
            f"Conclusions: {intervention} supplementation provides significant benefits for {outcome} with excellent safety profile.",
        ]
        return "\n".join(lines)

    def _generate_dosages(self, intervention: str) -> List[str]:
        """Return two dosage strings for ``intervention``.

        The intervention is matched case-insensitively by substring against
        a small table; anything unmatched gets ``["500mg", "1000mg"]``.
        """
        dosage_ranges = {
            "NAD+": ["250mg", "500mg", "1000mg"],
            "resveratrol": ["100mg", "250mg", "500mg"],
            "berberine": ["500mg", "1000mg", "1500mg"],
            "omega-3": ["1000mg", "2000mg", "3000mg"],
            "magnesium": ["200mg", "400mg", "600mg"],
            "curcumin": ["500mg", "1000mg", "1500mg"]
        }

        lowered = intervention.lower()
        for key, options in dosage_ranges.items():
            if key.lower() in lowered:
                return random.sample(options, min(2, len(options)))

        return ["500mg", "1000mg"]
|
|
class HealthProfileGenerator:
    """Generates synthetic health profiles based on (simulated) studies."""

    def __init__(self):
        # severity -> lab-value multiplier and a human-readable description.
        self.severity_levels = {
            "optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"},
            "mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"},
            "moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"},
            "severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"}
        }

    def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]:
        """Generate a complete health profile for ``study`` at ``severity``.

        Args:
            study: Study record; ``domain``, ``pmid`` and ``title`` are read
                with ``.get``.
            severity: One of the ``severity_levels`` keys. Unknown values
                are treated as ``"moderate"`` throughout the profile.

        Returns:
            Dict with ``user_tests_result_data``, ``user_query`` and
            ``source_study``.
        """
        # Normalise once: previously an unknown severity used the moderate
        # multiplier but ad-hoc fallbacks elsewhere, and was recorded
        # verbatim in the metadata.
        if severity not in self.severity_levels:
            severity = "moderate"

        domain = study.get("domain", "longevity")
        multiplier = self.severity_levels[severity]["multiplier"]

        age = random.randint(35, 65)
        gender = random.choice(["male", "female"])

        return {
            "user_tests_result_data": {
                "Labs": self._generate_lab_values(domain, multiplier),
                "gut_microbiome": self._generate_gut_microbiome(severity),
                "epigenetics": self._generate_epigenetics(severity),
                "wearables": self._generate_wearables(severity),
                "cgm": self._generate_cgm(severity)
            },
            "user_query": self._generate_user_query(study, age, gender, severity),
            "source_study": {
                "pmid": study.get("pmid"),
                "domain": domain,
                "severity": severity,
                "title": study.get("title")
            }
        }

    def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]:
        """Generate lab panels; ``multiplier`` scales risk markers upward.

        Protective markers (HDL, vitamin D, omega-3 index) are divided by
        the multiplier instead. A ``metabolic`` panel is added only for the
        ``metabolic_health`` domain.
        """
        base_labs = {
            "blood_tests": {
                "systolic_bp": int(random.randint(120, 140) * multiplier),
                "diastolic_bp": int(random.randint(70, 90) * multiplier),
                "total_cholesterol": int(random.randint(180, 220) * multiplier),
                "ldl": int(random.randint(100, 140) * multiplier),
                "hdl": int(random.randint(40, 60) / multiplier),
                "triglycerides": int(random.randint(80, 150) * multiplier),
                "apoB": int(random.randint(70, 110) * multiplier),
                "lp_a": random.randint(10, 50)
            },
            "inflammatory": {
                "hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1),
                "esr": int(random.randint(5, 25) * multiplier),
                "il6": round(random.uniform(1.0, 5.0) * multiplier, 1),
                "tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1),
                "oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal",
                "homocysteine": round(random.uniform(8, 15) * multiplier, 1)
            },
            "nutritional": {
                "vitamin_d": int(random.randint(25, 50) / multiplier),
                "b12": random.randint(250, 400),
                "folate": round(random.uniform(6, 14), 1),
                "iron": random.randint(60, 120),
                "ferritin": random.randint(30, 100),
                "selenium": random.randint(80, 120),
                "zinc": random.randint(70, 110),
                "magnesium": round(random.uniform(1.5, 2.2), 1),
                "omega3_index": round(random.uniform(4, 8) / multiplier, 1)
            }
        }

        if domain == "metabolic_health":
            base_labs["metabolic"] = {
                "fasting_glucose": int(random.randint(85, 110) * multiplier),
                # HbA1c scaling is capped at 1.4x.
                "hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1),
                "insulin_fasting": round(random.uniform(5, 15) * multiplier, 1),
                "homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1)
            }

        return base_labs

    def _generate_gut_microbiome(self, severity: str) -> str:
        """Return a one-line gut microbiome summary for ``severity``."""
        score_ranges = {
            "optimal": (8.5, 9.5),
            "mild": (7.0, 8.5),
            "moderate": (5.5, 7.0),
            "severe": (3.5, 5.5)
        }
        descriptions = {
            "optimal": "excellent diversity with optimal bacterial balance",
            "mild": "good diversity with minor imbalances",
            "moderate": "moderate dysbiosis with reduced beneficial bacteria",
            "severe": "significant dysbiosis with pathogenic overgrowth"
        }

        # Sample only the range that is actually needed.
        if severity in score_ranges:
            score = random.uniform(*score_ranges[severity])
        else:
            score = 6.5

        desc = descriptions.get(severity, "moderate dysbiosis")
        return f"Diversity score {score:.1f}/10, {desc}, beneficial bacteria {random.randint(60, 90)}%"

    @staticmethod
    def _ordinal(number: int) -> str:
        """Return ``number`` with its English ordinal suffix (21 -> '21st')."""
        if 10 <= number % 100 <= 20:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
        return f"{number}{suffix}"

    def _generate_epigenetics(self, severity: str) -> str:
        """Return a one-line epigenetic-age summary for ``severity``."""
        acceleration_ranges = {
            "optimal": (-2, 1),
            "mild": (1, 3),
            "moderate": (3, 6),
            "severe": (6, 12)
        }

        if severity in acceleration_ranges:
            acceleration = random.randint(*acceleration_ranges[severity])
        else:
            acceleration = 4

        # More age acceleration pushes the telomere percentile down,
        # floored at 10.
        telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5)
        pace = round(random.uniform(0.9, 1.4), 2)

        return (
            f"Biological age acceleration: {acceleration} years, "
            f"telomere length: {self._ordinal(telomere_percentile)} percentile, "
            f"DunedinPACE: {pace}"
        )

    def _generate_wearables(self, severity: str) -> Dict[str, int]:
        """Return wearable-device metrics; unknown severity uses 'moderate'."""
        base_ranges = {
            "optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)},
            "mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)},
            "moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)},
            "severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)}
        }

        ranges = base_ranges.get(severity, base_ranges["moderate"])
        sleep_low, sleep_high = ranges["sleep"]

        return {
            "hrv_avg": random.randint(*ranges["hrv"]),
            "rhr": random.randint(*ranges["rhr"]),
            "sleep_score": random.randint(sleep_low, sleep_high),
            # Recovery and stress bounds are derived from the sleep range.
            "recovery_score": random.randint(sleep_low - 10, sleep_high - 5),
            "stress_score": random.randint(100 - sleep_high, 100 - sleep_low + 20),
            "vo2_max": random.randint(25, 50),
            "fitness_age": random.randint(30, 65)
        }

    def _generate_cgm(self, severity: str) -> str:
        """Return a continuous-glucose-monitor summary line."""
        # (avg glucose min, avg glucose max, time-in-range min, max)
        glucose_ranges = {
            "optimal": (80, 95, 92, 98),
            "mild": (85, 105, 85, 95),
            "moderate": (95, 120, 70, 85),
            "severe": (110, 140, 55, 75)
        }

        avg_min, avg_max, tir_min, tir_max = glucose_ranges.get(severity, glucose_ranges["moderate"])
        return f"Average glucose {random.randint(avg_min, avg_max)} mg/dL, time in range {random.randint(tir_min, tir_max)}%"

    def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str:
        """Return the first-person query: a domain sentence plus, when the
        severity is known, a severity sentence."""
        domain = study.get("domain", "longevity")

        base_queries = {
            "longevity": f"I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols",
            "metabolic_health": f"I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control",
            "cardiovascular": f"I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization",
            "cognitive": f"I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization",
            "hormonal": f"I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols",
            "inflammation": f"I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions"
        }
        severity_context = {
            "optimal": "I have excellent baseline health but want to push the boundaries of optimization",
            "mild": "I have minor health concerns and want targeted interventions",
            "moderate": "I have noticeable health issues and need comprehensive protocols",
            "severe": "I have significant health challenges and require intensive interventions"
        }

        base_query = base_queries.get(domain, base_queries["longevity"])
        context = severity_context.get(severity, "")

        # Without a context sentence the old format produced "<query>. ."
        if not context:
            return f"{base_query}."
        return f"{base_query}. {context}."
|
|
class AIProtocolGenerator:
    """Uses OpenAI to generate health optimization protocols.

    NOTE(review): the leading glyphs in the progress messages ("π", "β")
    look like mis-decoded emoji; they are kept as found - confirm the
    intended characters against the upstream file.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.client = OpenAI(api_key=api_key)
        self.model = model
        # Running cost estimate accumulated across all completions.
        self.total_cost = 0.0

    def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]:
        """Generate a health optimization protocol via the chat completions API.

        Args:
            health_profile: Profile dict; serialised as JSON into the user
                prompt.
            study_context: Study dict; ``domain``, ``interventions`` and
                ``title`` are read with ``.get``.
            progress_callback: Optional callable receiving status strings.

        Returns:
            The message content of the first choice, or ``None`` when the
            request (or response handling) raised.
        """
        system_prompt = self._create_system_prompt(study_context)
        user_prompt = self._create_user_prompt(health_profile, study_context)

        try:
            if progress_callback:
                progress_callback(f"π Generating protocol using {self.model}...")

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=4000,
                temperature=0.7,
                top_p=0.9
            )

            # A missing usage block must not discard a valid completion.
            usage = getattr(response, "usage", None)
            self._update_cost(usage)

            if progress_callback:
                if usage is not None:
                    progress_callback(f"✅ Protocol generated ({usage.total_tokens} tokens)")
                else:
                    progress_callback("✅ Protocol generated (token usage unavailable)")

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort by design: the caller skips examples that come
            # back as None, so report and carry on.
            if progress_callback:
                progress_callback(f"β Error generating protocol: {e}")
            return None

    def _create_system_prompt(self, study_context: Dict[str, Any]) -> str:
        """Build the system prompt from the study's domain and interventions."""
        domain = study_context.get("domain", "health")
        interventions = ", ".join(study_context.get("interventions", []))

        return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols.

RESEARCH CONTEXT:
- Domain: {domain} optimization
- Key Interventions: {interventions}
- Evidence Level: Peer-reviewed clinical research

PROTOCOL REQUIREMENTS:
1. Executive Summary with current health assessment
2. Multi-Phase Protocol:
- Phase 1: Foundation (0-3 months)
- Phase 2: Optimization (3-6 months)
- Phase 3: Advanced Enhancement (6-12 months)
3. Specific supplement protocols with dosages and timing
4. Lifestyle interventions (exercise, nutrition, sleep)
5. Monitoring and assessment plans
6. Expected outcomes with realistic timelines

STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health.

SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations.

Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations."""

    def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str:
        """Build the user prompt embedding the JSON profile and study title."""
        return f"""
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST:

Health Profile Analysis:
{json.dumps(health_profile, indent=2)}

Research Context:
- Study: {study_context.get('title', 'Health Optimization Study')}
- Domain: {study_context.get('domain', 'general health')}
- Key Findings: Based on clinical research showing significant improvements in health biomarkers

Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols.
"""

    def _update_cost(self, usage):
        """Add the cost of one completion to ``total_cost``.

        Prices are per 1,000 tokens; models missing from the table are
        billed at the ``gpt-4`` rate. ``usage`` may be ``None``, in which
        case nothing is added.
        """
        if usage is None:
            return

        pricing = {
            "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03}
        }

        model_pricing = pricing.get(self.model, pricing["gpt-4"])
        input_cost = usage.prompt_tokens * model_pricing["input"] / 1000
        output_cost = usage.completion_tokens * model_pricing["output"] / 1000

        self.total_cost += input_cost + output_cost
|
|
class HealthDatasetGenerator:
    """Orchestrates study simulation, profile generation, protocol
    generation and export of the resulting dataset.

    NOTE(review): the leading glyphs in the progress messages ("π", "β",
    "β³") look like mis-decoded emoji; they are kept as found - confirm the
    intended characters against the upstream file.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.literature_sim = MedicalLiteratureSimulator()
        self.profile_gen = HealthProfileGenerator()
        self.protocol_gen = AIProtocolGenerator(api_key, model)
        # Examples from the most recent generate_dataset() run.
        self.generated_examples = []

    def generate_dataset(self,
                         domains: Optional[List[str]] = None,
                         examples_per_domain: int = 2,
                         rate_limit_delay: float = 2.0,
                         progress_callback=None) -> Tuple[List[Dict[str, Any]], str]:
        """Generate the dataset, reporting progress through a callback.

        Args:
            domains: Research domains to cover; defaults to longevity,
                metabolic_health, cardiovascular and cognitive.
            examples_per_domain: Examples to attempt per domain (coerced
                to ``int``).
            rate_limit_delay: Seconds to sleep between consecutive
                examples.
            progress_callback: Optional callable receiving status strings.

        Returns:
            ``(examples, summary)``. Examples whose protocol generation
            returned nothing, or which raised, are skipped.
        """
        if domains is None:
            domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"]

        # range() below needs an int even if a UI widget hands over a float.
        examples_per_domain = int(examples_per_domain)
        total_examples = len(domains) * examples_per_domain

        def notify(message: str) -> None:
            if progress_callback:
                progress_callback(message)

        notify("π Starting Health Dataset Generation")
        notify(f"Domains: {domains}")
        notify(f"Examples per domain: {examples_per_domain}")
        notify(f"Total examples to generate: {total_examples}")

        examples = []
        current_example = 0

        for domain in domains:
            notify(f"\nπ Processing domain: {domain}")

            for i in range(examples_per_domain):
                current_example += 1
                try:
                    notify(f" Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})")

                    study = self.literature_sim.generate_study_data(domain)
                    notify(f" π Generated study: {study['title'][:50]}...")

                    severity = random.choice(["mild", "moderate", "severe"])
                    health_profile = self.profile_gen.generate_profile_from_study(study, severity)
                    notify(f" π€ Created {severity} health profile")

                    protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback)

                    if protocol:
                        examples.append({
                            "user_context": health_profile,
                            "response": protocol,
                            "citations": self._generate_citations(study),
                            "metadata": {
                                "domain": domain,
                                "severity": severity,
                                "study_pmid": study["pmid"],
                                "generated_at": datetime.now().isoformat()
                            }
                        })
                        notify(" ✅ Complete example generated")

                except Exception as e:
                    # Best-effort: one failed example must not abort the run.
                    notify(f" β Error generating example: {e}")

                # Pause between every pair of consecutive examples. The old
                # check (i < examples_per_domain - 1) skipped the pause at
                # each domain boundary, so those API calls were back-to-back.
                if current_example < total_examples:
                    notify(f" β³ Rate limit delay: {rate_limit_delay}s")
                    time.sleep(rate_limit_delay)

        notify("\nπ Dataset generation complete!")
        notify(f"Generated: {len(examples)} examples")
        notify(f"Total cost: ${self.protocol_gen.total_cost:.4f}")

        self.generated_examples = examples
        return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}"

    def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]:
        """Return placeholder citation identifiers for ``study``.

        Apart from the study's own ``pmid``, the identifiers are randomly
        generated strings.
        """
        return {
            "tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"],
            "tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"],
            "tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"],
            "real_world_cases": ["Evidence-based health optimization protocols"]
        }

    def export_dataset(self, filename: Optional[str] = None) -> Tuple[str, List[str]]:
        """Write ``generated_examples`` to ``<filename>.zip``.

        The archive holds the raw JSON, an OpenAI fine-tuning JSONL file,
        up to three samples, and a metadata file.

        Args:
            filename: Output path without extension; defaults to
                ``health_dataset_<timestamp>`` in the working directory.

        Returns:
            ``(zip_path, member_names)``.
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"health_dataset_{timestamp}"

        # Archive members are named after the file only, so a directory in
        # ``filename`` does not leak into the zip layout.
        base_name = os.path.basename(filename) or filename
        examples = self.generated_examples

        fine_tune_lines = [
            json.dumps(
                {
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are an advanced AI health optimization system that creates evidence-based protocols."
                        },
                        {
                            "role": "user",
                            "content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}"
                        },
                        {
                            "role": "assistant",
                            "content": example["response"]
                        }
                    ]
                },
                ensure_ascii=False,
            )
            for example in examples
        ]

        # Always report the three generated levels, plus any other level
        # present in the data instead of silently dropping it.
        severity_distribution = {"mild": 0, "moderate": 0, "severe": 0}
        for example in examples:
            level = example["metadata"]["severity"]
            severity_distribution[level] = severity_distribution.get(level, 0) + 1

        metadata = {
            "generation_info": {
                "generated_at": datetime.now().isoformat(),
                "total_examples": len(examples),
                "total_cost": self.protocol_gen.total_cost,
                "model_used": self.protocol_gen.model
            },
            # Sorted so the output does not depend on set ordering.
            "domains_covered": sorted({ex["metadata"]["domain"] for ex in examples}),
            "severity_distribution": severity_distribution
        }

        sample_size = min(3, len(examples))
        files_created = [
            (f"{base_name}.json", json.dumps(examples, indent=2, ensure_ascii=False)),
            (f"{base_name}_fine_tuning.jsonl", '\n'.join(fine_tune_lines)),
            (f"{base_name}_samples.json", json.dumps(examples[:sample_size], indent=2, ensure_ascii=False)),
            (f"{base_name}_metadata.json", json.dumps(metadata, indent=2, ensure_ascii=False)),
        ]

        zip_filename = f"{filename}.zip"
        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
            for member_name, content in files_created:
                archive.writestr(member_name, content)

        return zip_filename, [member_name for member_name, _ in files_created]
|
|
| |
| |
| |
|
|
class HealthDatasetGradioInterface:
    """Gradio web interface for the health dataset generator.

    NOTE(review): the leading glyphs in the UI strings ("π", "β", "β οΈ" ...)
    look like mis-decoded emoji; they are kept as found - confirm the
    intended characters against the upstream file.
    """

    def __init__(self):
        # Created lazily once the user supplies an API key.
        self.generator = None
        self.available_domains = list(MedicalLiteratureSimulator().research_domains.keys())

    def estimate_cost(self, domains, examples_per_domain, model):
        """Return a human-readable cost estimate for the current selection.

        Uses a flat per-example price per model; unknown models are priced
        at 0.25 per example.
        """
        if not domains:
            return "Please select at least one domain"

        total_examples = len(domains) * int(examples_per_domain)

        cost_per_example = {
            "gpt-3.5-turbo": 0.05,
            "gpt-4": 0.25,
            "gpt-4-turbo": 0.15
        }

        estimated_cost = total_examples * cost_per_example.get(model, 0.25)

        return f"π° Estimated cost: ${estimated_cost:.2f} for {total_examples} examples"

    def validate_inputs(self, api_key, domains, examples_per_domain):
        """Validate user inputs.

        Returns:
            ``(is_valid, message)``.
        """
        if not api_key or not api_key.strip():
            return False, "β Please provide your OpenAI API key"

        if not domains:
            return False, "β Please select at least one domain"

        if examples_per_domain < 1 or examples_per_domain > 10:
            return False, "β Examples per domain must be between 1 and 10"

        return True, "✅ Inputs are valid"

    def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit):
        """Run a full generation for the Gradio "Generate" button.

        Generator yielding 5-tuples: ``(progress text, summary, preview
        markdown, zip path or None, file listing or None)``.

        NOTE: ``generate_dataset`` blocks until it finishes, so the
        collected progress log only becomes visible in the final yield.
        """
        is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain)
        if not is_valid:
            yield message, "", "", None, None
            return

        # Presumably the slider can deliver a float; downstream code needs
        # an int for range().
        examples_per_domain = int(examples_per_domain)

        try:
            self.generator = HealthDatasetGenerator(api_key.strip(), model)
        except Exception as e:
            yield f"β Error initializing generator: {e}", "", "", None, None
            return

        progress_messages = []

        def progress_callback(message):
            """Record ``message`` and return the last 20 log lines."""
            progress_messages.append(message)
            return "\n".join(progress_messages[-20:])

        try:
            yield "π Starting dataset generation...", "", "", None, None

            dataset, summary = self.generator.generate_dataset(
                domains=domains,
                examples_per_domain=examples_per_domain,
                rate_limit_delay=rate_limit,
                progress_callback=progress_callback
            )

            if not dataset:
                # Show the log too: it holds the per-example error messages
                # that explain why nothing was produced.
                yield progress_callback("β No examples generated"), "", "", None, None
                return

            progress_callback("πΎ Exporting dataset...")
            zip_filename, file_list = self.generator.export_dataset()

            preview = self.create_dataset_preview(dataset)

            final_progress = progress_callback(f"π Generation complete! Files: {', '.join(file_list)}")

            # The target component is a Textbox, so hand it text, not a list.
            yield final_progress, summary, preview, zip_filename, "\n".join(file_list)

        except Exception as e:
            yield progress_callback(f"β Error during generation: {e}"), "", "", None, None

    def create_dataset_preview(self, dataset):
        """Create a markdown preview (counts plus the first example)."""
        if not dataset:
            return "No data to preview"

        # Counter keeps first-seen order, so the preview is deterministic.
        domain_counts = dict(Counter(ex['metadata']['domain'] for ex in dataset))
        severity_counts = dict(Counter(ex['metadata']['severity'] for ex in dataset))
        example = dataset[0]

        parts = [
            "π **Dataset Preview**\n\n",
            f"**Total Examples:** {len(dataset)}\n",
            f"**Domain Distribution:** {domain_counts}\n",
            f"**Severity Distribution:** {severity_counts}\n\n",
            "**Sample Example:**\n",
            f"- **Domain:** {example['metadata']['domain']}\n",
            f"- **Severity:** {example['metadata']['severity']}\n",
            f"- **User Query:** {example['user_context']['user_query'][:150]}...\n",
            f"- **Response Length:** {len(example['response'])} characters\n",
            f"- **PMID:** {example['metadata']['study_pmid']}\n",
        ]
        return "".join(parts)

    def analyze_dataset_file(self, zip_file):
        """Analyze an uploaded dataset zip and return a markdown report.

        Args:
            zip_file: Either a filesystem path or an object exposing the
                path as ``.name``.
        """
        if zip_file is None:
            return "No file uploaded"

        # Accept both a plain path and a file wrapper with ``.name``.
        zip_path = getattr(zip_file, "name", zip_file)

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # The raw dataset is the .json member that is neither the
                # samples nor the metadata file.
                json_files = [
                    f for f in zip_ref.namelist()
                    if f.endswith('.json')
                    and not f.endswith(('_samples.json', '_metadata.json'))
                ]

                if not json_files:
                    return "No dataset JSON file found in zip"

                with zip_ref.open(json_files[0]) as f:
                    dataset = json.load(f)

            total = len(dataset)
            analysis = "π **Dataset Analysis**\n\n"
            analysis += f"**Total Examples:** {total}\n"

            if total == 0:
                # Avoid dividing by zero in the averages below.
                return analysis + "No examples to analyze\n"

            average_length = sum(len(ex['response']) for ex in dataset) / total
            analysis += f"**Average Response Length:** {average_length:.0f} characters\n"

            long_responses = sum(1 for ex in dataset if len(ex['response']) > 2000)
            has_phases = sum(1 for ex in dataset if "Phase" in ex['response'])
            has_dosages = sum(1 for ex in dataset if re.search(r'\d+\s*mg', ex['response']))

            analysis += "**Quality Metrics:**\n"
            analysis += f"- Responses >2000 chars: {long_responses}/{total} ({long_responses/total*100:.1f}%)\n"
            analysis += f"- Responses with phases: {has_phases}/{total} ({has_phases/total*100:.1f}%)\n"
            analysis += f"- Responses with dosages: {has_dosages}/{total} ({has_dosages/total*100:.1f}%)\n"

            return analysis

        except Exception as e:
            return f"Error analyzing file: {e}"

    def create_interface(self):
        """Build and return the Gradio ``Blocks`` application."""

        with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface:

            gr.Markdown("""
            # π₯ Medical Literature Health Dataset Generator

            This tool generates synthetic health optimization datasets based on medical literature patterns.
            Perfect for training AI models on evidence-based health protocols.

            β οΈ **Important:** Generated content is for research/educational purposes only. Not medical advice.
            """)

            with gr.Tab("π Generate Dataset"):

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### βοΈ Configuration")

                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            info="Your OpenAI API key for generating protocols"
                        )

                        domains = gr.CheckboxGroup(
                            label="Research Domains",
                            choices=self.available_domains,
                            value=["longevity", "metabolic_health"],
                            info="Select medical research domains to include"
                        )

                        examples_per_domain = gr.Slider(
                            label="Examples per Domain",
                            minimum=1,
                            maximum=10,
                            value=2,
                            step=1,
                            info="Number of examples to generate for each domain"
                        )

                        model = gr.Dropdown(
                            label="OpenAI Model",
                            choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"],
                            value="gpt-4",
                            info="Model for generating protocols (GPT-4 recommended for quality)"
                        )

                        rate_limit = gr.Slider(
                            label="Rate Limit Delay (seconds)",
                            minimum=0.5,
                            maximum=5.0,
                            value=2.0,
                            step=0.5,
                            info="Delay between API calls to avoid rate limits"
                        )

                        cost_estimate = gr.Textbox(
                            label="Cost Estimate",
                            value="Select domains and examples to see estimate",
                            interactive=False
                        )

                        generate_btn = gr.Button(
                            "π Generate Dataset",
                            variant="primary",
                            size="lg"
                        )

                    with gr.Column(scale=2):
                        gr.Markdown("### π Progress & Results")

                        progress_output = gr.Textbox(
                            label="Generation Progress",
                            lines=15,
                            max_lines=20,
                            value="Ready to generate dataset...",
                            interactive=False
                        )

                        summary_output = gr.Textbox(
                            label="Generation Summary",
                            lines=3,
                            interactive=False
                        )

                        preview_output = gr.Markdown(
                            label="Dataset Preview",
                            value="Dataset preview will appear here..."
                        )

                        with gr.Row():
                            download_file = gr.File(
                                label="π₯ Download Generated Dataset",
                                interactive=False
                            )

                            file_list = gr.Textbox(
                                label="Generated Files",
                                placeholder="Files included in download will be listed here",
                                interactive=False
                            )

            with gr.Tab("π Analyze Dataset"):
                gr.Markdown("### π Dataset Analysis")
                gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.")

                with gr.Row():
                    with gr.Column():
                        upload_file = gr.File(
                            label="Upload Dataset Zip File",
                            file_types=[".zip"]
                        )

                        analyze_btn = gr.Button(
                            "π Analyze Dataset",
                            variant="secondary"
                        )

                    with gr.Column():
                        analysis_output = gr.Markdown(
                            label="Analysis Results",
                            value="Upload a dataset file to see analysis..."
                        )

            with gr.Tab("βΉοΈ Information"):
                gr.Markdown("""
                ### π How It Works

                1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes
                2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels
                3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols
                4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format

                ### π― Output Files

                - **`dataset.json`**: Complete raw dataset
                - **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format
                - **`dataset_samples.json`**: Sample examples for review
                - **`dataset_metadata.json`**: Generation statistics and info

                ### π° Cost Information

                - **GPT-3.5-turbo**: ~$0.05 per example
                - **GPT-4**: ~$0.25 per example
                - **GPT-4-turbo**: ~$0.15 per example

                ### β οΈ Important Notes

                - Generated content is for **research/educational purposes only**
                - **Not medical advice** - always consult healthcare professionals
                - Include appropriate medical disclaimers when using generated content
                - Review sample outputs before using in production

                ### π§ Recommended Settings

                - **Start small**: Generate 2-4 examples first to test quality
                - **Use GPT-4**: Better quality than GPT-3.5-turbo
                - **Rate limiting**: Use 2+ second delays to avoid API limits
                - **Multiple domains**: Include diverse domains for comprehensive dataset
                """)

            # Refresh the cost estimate whenever any of its inputs changes.
            for input_component in [domains, examples_per_domain, model]:
                input_component.change(
                    fn=self.estimate_cost,
                    inputs=[domains, examples_per_domain, model],
                    outputs=[cost_estimate]
                )

            generate_btn.click(
                fn=self.generate_dataset_interface,
                inputs=[api_key, domains, examples_per_domain, model, rate_limit],
                outputs=[progress_output, summary_output, preview_output, download_file, file_list]
            )

            analyze_btn.click(
                fn=self.analyze_dataset_file,
                inputs=[upload_file],
                outputs=[analysis_output]
            )

        return interface
|
|
| |
| |
| |
|
|
def main():
    """Build the Gradio interface and launch the web server on port 7860."""

    print("π Launching Medical Literature Health Dataset Generator")
    print("This will start a web interface accessible through your browser")

    interface_creator = HealthDatasetGradioInterface()
    interface = interface_creator.create_interface()

    # NOTE(review): share=True together with server_name="0.0.0.0" looks
    # like it makes this app (which accepts an OpenAI API key) reachable
    # beyond localhost - confirm that this exposure is intended.
    interface.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        quiet=False
    )
|
|
# Launch the web interface only when executed as a script.
if __name__ == "__main__":
    main()
| |
| |
| |