| import argparse |
| import pandas as pd |
| import streamlit as st |
| from generate_schema import generate_schema |
| from fetch_data import fetch_real_data |
| from synthetic_generator import train_and_generate_synthetic |
|
|
def main() -> None:
    """Run the synthetic-data pipeline from the command line.

    Steps, in order:
      1. Parse ``--prompt`` (required) and ``--domain`` (default
         ``"healthcare"``).
      2. Read the ``hf_token`` entry from Streamlit secrets and pass it,
         together with the prompt, to ``generate_schema``.
      3. Fetch real data for the domain and keep only ``schema['columns']``.
      4. Call ``train_and_generate_synthetic`` with the filtered data, the
         schema, and the path ``outputs/synthetic_<domain>.csv``.

    Raises:
        KeyError: if ``schema['columns']`` cannot be selected from the
            fetched data (for example, the schema names a column the real
            dataset does not have).
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Describe the dataset you want",
    )
    parser.add_argument(
        "--domain",
        type=str,
        default="healthcare",
        help="Domain to fetch real data from (optional)",
    )
    args = parser.parse_args()

    # Token is read from Streamlit's secrets store; a missing secrets file or
    # key will raise here, before any expensive work starts.
    hf_token = st.secrets["hf_token"]

    schema = generate_schema(args.prompt, hf_token)
    print(f"📊 Generated schema: {schema}")

    real_data = fetch_real_data(args.domain)

    # Restrict the real data to the columns the schema asks for. The schema is
    # generated from a free-form prompt, so it may name columns the real data
    # does not contain; add context to the otherwise bare pandas KeyError.
    columns = schema["columns"]
    try:
        real_data = real_data[columns]
    except KeyError as err:
        raise KeyError(
            f"Could not select schema columns {columns!r} from the "
            f"'{args.domain}' data: {err}"
        ) from err
    print(f"✅ Fetched real data with shape: {real_data.shape}")

    output_path = f"outputs/synthetic_{args.domain}.csv"
    # Make sure the destination directory exists up front so the run cannot
    # fail at write time after the training work has already been done.
    # (A domain containing path separators would nest further; parents=True
    # covers that as well.)
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    train_and_generate_synthetic(real_data, schema, output_path)
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|