import streamlit as st
import requests
import pandas as pd
import io
import os
from PIL import Image
import time
# Configure page: must be the first Streamlit call in the script.
st.set_page_config(
page_title="PromptPrepML - Auto ML Data Preprocessing",
page_icon="🤖",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS for better styling
# NOTE(review): the CSS payload appears to have been lost in extraction —
# this currently injects an empty string; restore the stylesheet if available.
st.markdown("""
""", unsafe_allow_html=True)
# API base URLs - try deployed backend first, fallback to localhost
DEPLOYED_BACKEND = "https://promptprepml-backend.railway.app"
LOCAL_BACKEND = "http://localhost:8000"
def check_backend_health():
    """Find a reachable backend, preferring the deployed URL over localhost.

    Probes each candidate's ``/health`` endpoint in order and records the
    first responsive base URL in ``st.session_state.backend_url``.

    Returns:
        tuple: ``(True, backend_url)`` if a backend answered with HTTP 200,
        otherwise ``(False, None)``.
    """
    for backend_url in (DEPLOYED_BACKEND, LOCAL_BACKEND):
        try:
            response = requests.get(f"{backend_url}/health", timeout=5)
        except requests.RequestException:
            # Narrowed from a bare `except:`: only network/HTTP failures
            # should trigger fallback to the next backend, not e.g.
            # KeyboardInterrupt or programming errors.
            continue
        if response.status_code == 200:
            st.session_state.backend_url = backend_url
            return True, backend_url
    return False, None
def upload_dataset(uploaded_file):
    """POST the uploaded file to the backend's /upload-dataset endpoint.

    Args:
        uploaded_file: File-like object from ``st.file_uploader``.

    Returns:
        tuple: ``(parsed JSON response, None)`` on success, or
        ``(None, error message)`` on any failure.
    """
    if 'backend_url' not in st.session_state:
        return None, "Backend not connected"
    endpoint = f"{st.session_state.backend_url}/upload-dataset"
    try:
        response = requests.post(endpoint, files={'file': uploaded_file})
        if response.status_code != 200:
            return None, f"Upload failed: {response.text}"
        return response.json(), None
    except Exception as e:
        return None, f"Upload error: {str(e)}"
def process_pipeline(uploaded_file, prompt):
    """Run the dataset through the backend ML preprocessing pipeline.

    Args:
        uploaded_file: File-like object from ``st.file_uploader``.
        prompt: Natural-language description of the desired preprocessing.

    Returns:
        tuple: ``(parsed JSON response, None)`` on success, or
        ``(None, error message)`` on any failure.
    """
    if 'backend_url' not in st.session_state:
        return None, "Backend not connected"
    endpoint = f"{st.session_state.backend_url}/process-pipeline"
    try:
        response = requests.post(
            endpoint,
            files={'file': uploaded_file},
            data={'prompt': prompt},
        )
        if response.status_code != 200:
            return None, f"Processing failed: {response.text}"
        return response.json(), None
    except Exception as e:
        return None, f"Processing error: {str(e)}"
def download_file(filename):
    """Fetch a processed artifact from the backend by name.

    Args:
        filename: Name of the file to download.

    Returns:
        tuple: ``(raw response bytes, None)`` on success, or
        ``(None, error message)`` on any failure.
    """
    if 'backend_url' not in st.session_state:
        return None, "Backend not connected"
    try:
        # BUG FIX: `filename` was never interpolated into the URL (a literal
        # placeholder was in the path), so every download requested the same
        # nonexistent resource.
        # NOTE(review): assumes the backend serves artifacts at
        # /api/download/<filename> — confirm against the backend routes.
        response = requests.get(
            f"{st.session_state.backend_url}/api/download/{filename}"
        )
        if response.status_code == 200:
            return response.content, None
        return None, f"Download failed: {response.text}"
    except Exception as e:
        return None, f"Download error: {str(e)}"
def main():
    """Render the Streamlit UI: a 4-step upload -> configure -> process -> results flow."""
    # Main header.
    # NOTE(review): the original inline HTML was lost in extraction; this is a
    # plausible reconstruction — confirm class names against the intended CSS.
    st.markdown('<h1 class="main-header">🤖 PromptPrepML</h1>',
                unsafe_allow_html=True)
    st.markdown(
        '<p class="sub-header">Convert natural language prompts into '
        'ML-ready datasets</p>',
        unsafe_allow_html=True,
    )

    # Check backend health before rendering anything interactive.
    backend_healthy, backend_url = check_backend_health()
    if not backend_healthy:
        st.error("❌ Backend is not running! Please start the backend:")
        st.code("""
cd promptprepml/backend
venv\\Scripts\\activate
python app/main.py
# OR wait for deployed backend to be ready
""")
        st.info("🚀 **Deploying backend to cloud...** This will make the app work standalone!")
        return
    st.success(f"✅ Backend connected at: {backend_url}")

    # Sidebar for navigation.
    st.sidebar.title("📋 Processing Steps")

    # Initialize session state with defaults on first run.
    if 'step' not in st.session_state:
        st.session_state.step = 'upload'
    if 'upload_result' not in st.session_state:
        st.session_state.upload_result = None
    if 'processing_result' not in st.session_state:
        st.session_state.processing_result = None

    # Step indicators: map the session step name to its position in the flow.
    steps = ['📤 Upload', '⚙️ Configure', '🚀 Process', '📊 Results']
    step_order = {'upload': 0, 'configure': 1, 'process': 2, 'results': 3}
    current_step_index = step_order.get(st.session_state.step, 0)

    for i, step in enumerate(steps):
        if i < current_step_index:
            st.sidebar.success(f"✅ {step}")
        elif i == current_step_index:
            st.sidebar.info(f"🔄 {step}")
        else:
            st.sidebar.write(f"⏳ {step}")

    # Step 1: Upload Dataset
    if st.session_state.step == 'upload':
        uploaded_file = st.file_uploader(
            "Choose a CSV file",
            type=['csv'],
            help="Upload your dataset for preprocessing"
        )
        if uploaded_file is not None:
            st.info(f"📄 File uploaded: `{uploaded_file.name}`")
            # Show file preview.
            try:
                df = pd.read_csv(uploaded_file)
                st.markdown(f"**Dataset Shape:** {df.shape}")
                st.markdown(f"**Columns:** {', '.join(df.columns)}")
                st.dataframe(df.head())
                if st.button("🚀 Continue to Configuration", type="primary"):
                    # BUG FIX: pd.read_csv consumed the in-memory buffer above;
                    # rewind before re-sending, otherwise the backend receives
                    # an empty file.
                    uploaded_file.seek(0)
                    with st.spinner("Uploading dataset..."):
                        result, error = upload_dataset(uploaded_file)
                    if error:
                        st.error(f"❌ Upload failed: {error}")
                    else:
                        st.session_state.upload_result = result
                        # BUG FIX: keep the file in session state so the
                        # configure step can re-submit it; previously that
                        # branch referenced an undefined local (NameError).
                        uploaded_file.seek(0)
                        st.session_state.uploaded_file = uploaded_file
                        st.session_state.step = 'configure'
                        st.rerun()
            except Exception as e:
                st.error(f"❌ Error reading file: {str(e)}")

    # Step 2: Configure Processing
    elif st.session_state.step == 'configure':
        if st.session_state.upload_result:
            file_info = st.session_state.upload_result
            st.markdown(f"**File:** {file_info.get('filename', 'Unknown')}")
            st.markdown(f"**Size:** {file_info.get('size', 'Unknown')} bytes")

        # Processing options.
        prompt = st.text_area(
            "Describe your preprocessing needs:",
            value="Prepare this dataset for machine learning. Handle missing values, remove identifier columns, extract date features, encode categorical variables, and scale numeric features.",
            height=100,
            help="Describe what you want to do with your dataset in natural language"
        )
        col1, col2 = st.columns([1, 1])
        with col1:
            if st.button("⬅️ Back", type="secondary"):
                st.session_state.step = 'upload'
                st.rerun()
        with col2:
            if st.button("🚀 Start Processing", type="primary"):
                # Re-use the file stashed during the upload step.
                uploaded_file = st.session_state.get('uploaded_file')
                if uploaded_file is not None:
                    uploaded_file.seek(0)
                    with st.spinner("Processing dataset... This may take a few minutes."):
                        result, error = process_pipeline(uploaded_file, prompt)
                    if error:
                        st.error(f"❌ Processing failed: {error}")
                    else:
                        st.session_state.processing_result = result
                        st.session_state.step = 'results'
                        st.rerun()

    # Step 3: Results
    elif st.session_state.step == 'results':
        if st.session_state.processing_result:
            result = st.session_state.processing_result
            # Success message.
            st.success("✅ Dataset processed successfully!")

            # Results summary.
            col1, col2 = st.columns([2, 1])
            with col1:
                st.markdown("### 📈 Processing Summary")
                dataset_info = result.get('dataset_info', {})
                if dataset_info:
                    basic_info = dataset_info.get('basic_info', {})
                    st.markdown(f"- **Original Shape:** {basic_info.get('shape', 'Unknown')}")
                    st.markdown(f"- **Columns:** {basic_info.get('columns', 'Unknown')}")
                preprocessing_info = result.get('preprocessing_info', {})
                if preprocessing_info:
                    st.markdown(f"- **Processed Shape:** {preprocessing_info.get('processed_shape', 'Unknown')}")

                # Dataset preview.
                st.markdown("### 👀 Dataset Preview")
                preview_data = result.get('preview_data', [])
                if preview_data:
                    df_preview = pd.DataFrame(preview_data)
                    st.dataframe(df_preview)

            with col2:
                st.markdown("### 📥 Download Files")
                download_links = [
                    ("Processed Dataset", "processed_dataset.csv"),
                    ("Training Set", "train.csv"),
                    ("Test Set", "test.csv"),
                    ("Pipeline", "pipeline.pkl"),
                    ("EDA Report", "eda_report.html")
                ]
                for name, filename in download_links:
                    # BUG FIX: the widget key, spinner text, and save label
                    # contained a literal placeholder instead of interpolating
                    # the filename — every button shared the same key, which
                    # Streamlit rejects as a duplicate.
                    if st.button(f"📥 {name}", key=f"download_{filename}"):
                        with st.spinner(f"Downloading {name}..."):
                            file_content, error = download_file(filename)
                        if error:
                            st.error(f"❌ Download failed: {error}")
                        else:
                            st.download_button(
                                label=f"💾 Save {name}",
                                data=file_content,
                                file_name=filename,
                                mime="application/octet-stream"
                            )

            # Action buttons.
            col1, col2 = st.columns([1, 1])
            with col1:
                if st.button("🔄 Process New Dataset", type="secondary"):
                    # Reset session state so the flow restarts cleanly.
                    for key in list(st.session_state.keys()):
                        del st.session_state[key]
                    st.session_state.step = 'upload'
                    st.rerun()
            with col2:
                if st.button("📈 View EDA Report", type="primary"):
                    st.info("📊 EDA Report feature coming soon!")

    # Footer.
    st.markdown("---")
    st.markdown("""
PromptPrepML - Automated ML Data Preprocessing
Convert natural language prompts into ML-ready datasets
""", unsafe_allow_html=True)


if __name__ == "__main__":
    main()