| | from sklearn.impute import SimpleImputer |
| | from dotenv import load_dotenv |
| | from scipy import stats |
| | from langchain_groq import ChatGroq |
| | from langchain.chains import LLMChain |
| | import pandas as pd |
| | import numpy as np |
| | import re |
| | import os |
| | from langchain_google_genai import ChatGoogleGenerativeAI |
| | from langchain.prompts import PromptTemplate |
| | from langchain_core.runnables import RunnableSequence |
| | import streamlit as st |
| | from .clean_df_fallback import clean_dataframe_fallback |
| |
|
| |
|
| | |
| |
|
# Load API keys from a local .env file into the process environment.
load_dotenv()


# Both keys are required up front: Gemini is the primary model and Groq is
# the runtime fallback, so a missing key of either kind is a config error.
groq_api_key = os.getenv("GROQ_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")


if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY not found in environment variables")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not found in environment variables")


# Module-level LLM handle used by clean_csv().  Resolution order:
#   1. Gemini (primary)  2. Groq gemma2-9b-it (fallback)  3. None,
# in which case clean_csv() falls back to the hardcoded cleaning path.
# NOTE(review): constructing the client may not validate the key — a bad
# key can still surface later at .invoke() time.
try:
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-lite-preview-02-05",
        google_api_key=gemini_api_key
    )
    print("Primary Gemini LLM loaded successfully.")

except Exception as e:
    print(f"Error initializing primary Gemini LLM: {e}")

    try:
        llm = ChatGroq(
            model="gemma2-9b-it",
            groq_api_key=groq_api_key
        )
        print("Fallback Groq LLM loaded successfully.")

    except Exception as e2:
        print(f"Error initializing fallback Groq LLM: {e2}")
        llm=None
| |
|
| |
|
| |
|
| | |
@st.cache_data(ttl=3600, show_spinner=False)
def cached_clean_csv(df_json, skip_cleaning=False):
    """Cached version of the clean_csv function to prevent redundant cleaning.

    Args:
        df_json: JSON string representation of the dataframe (for hashing —
            a JSON string is hashable by st.cache_data, a DataFrame is not)
        skip_cleaning: Whether to skip cleaning

    Returns:
        Tuple of (cleaned_df, insights)
    """
    from io import StringIO

    # pandas >= 2.1 deprecates passing a raw JSON string to read_json
    # (removed in 3.0); wrap it in a StringIO file-like object instead.
    df = pd.read_json(StringIO(df_json), orient='records')

    if skip_cleaning:
        return df, "No cleaning performed (user skipped)."

    # Invalidate stale model-evaluation state so downstream views recompute
    # against the freshly cleaned data.
    # NOTE(review): these side effects only run on a cache MISS; on a cache
    # hit the stored result is returned and session state is left untouched.
    if "test_results_calculated" in st.session_state:
        st.session_state.test_results_calculated = False

    for key in ('test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message'):
        if key in st.session_state:
            del st.session_state[key]

    return clean_csv(df)
| |
|
| |
|
def clean_csv(df):
    """Clean *df* with an LLM-generated cleaning function and summarize it.

    The module-level ``llm`` is asked to write a ``clean_dataframe(df)``
    function from a 3-row sample of the data.  The generated code is executed
    in an isolated namespace and applied, with up to ``max_attempts``
    self-refinement rounds on failure.  If the LLM is unavailable or every
    attempt fails, ``clean_dataframe_fallback`` is used instead.

    Args:
        df: Raw pandas DataFrame to clean.

    Returns:
        Tuple of (cleaned_df, analysis_insights); ``analysis_insights`` is an
        LLM-written data-quality summary or an explanatory failure message.
    """
    # No working LLM at import time -> deterministic hardcoded cleaning.
    if llm is None:
        print("LLM initialization failed; using hardcoded cleaning function.")
        fallback_df = clean_dataframe_fallback(df)
        return fallback_df, "LLM initialization failed; using hardcoded cleaning function, so no insights were generated."

    # Escape literal braces so PromptTemplate does not treat JSON braces or
    # braces in column names as template variables.
    sample_data = df.head(3).to_json(orient='records')
    escaped_sample_data = sample_data.replace("{", "{{").replace("}", "}}")

    escaped_columns = [
        col.replace("{", "{{").replace("}", "}}") for col in df.columns
    ]
    column_names_str = ", ".join(escaped_columns)

    # Prompt asking the LLM to emit only the body of clean_dataframe(df).
    initial_prompt = PromptTemplate.from_template(f'''
    You are given the following sample data from a pandas DataFrame:
    {escaped_sample_data}

    column names are : [{column_names_str}].

    Generate a Python function named clean_dataframe(df) considering the following:


    1. Performs thorough data cleaning without performing feature engineering. Ensure all necessary cleaning steps are included.
    2. Uses assignment operations (e.g., df = df.drop(...)) and avoids inplace=True for clarity.
    3. First deeply analyze each column’s content this is the most important step , to infer its predominant data type for example if we have RS.2100 in rows remove rs and if we have (89%) remove % , if the column contains only text and no numbers then it is a text column and if it contains numbers and text then it is a mixed column and if it contains only numbers then it is a numeric column.
    4. For columns that are intended to be numeric but contain extra characters (such as '%' in percentage values, currency symbols like 'Rs.', '$', and commas), remove all non-digit characters (except for the decimal point) and convert them to a numeric type.
    5. For columns that are clearly text or categorical, preserve the content without removing digits or altering the textual information.
    6. Handles missing values appropriately: fill numeric columns with the median (or 0 if the median is not available) and non-numeric columns with 'Unknown'.
    7. For columns where more than 50% of values are strings and less than 10% are numeric, perform conservative string cleaning by removing unwanted special symbols while preserving meaningful digits.
    8. For columns whose names contain 'name', 'Name', or 'Names' (case-insensitive), convert to string type and remove extraneous numeric characters only if they are not part of the essential text.
    9. Preserves other categorical or text columns (such as Gender, City, State, Country, etc.) unless explicitly specified for removal.
    10. Handles edge cases such as completely empty columns appropriately.

    Return only the Python code for the function, with no explanations or extra formatting.

    '''
    )

    # Prompt used to self-repair generated code that raised at exec/run time.
    refine_prompt = PromptTemplate.from_template(
        "The following Python code for cleaning a DataFrame caused an error: {error}\n"
        "Original code:\n{code}\n"
        "Please correct the code to fix the error and ensure it returns a cleaned DataFrame. "
        "Return only the corrected Python code for the function, no explanations or formatting."
    )

    initial_chain = initial_prompt | llm
    refine_chain = refine_prompt | llm

    def extract_code(response):
        """Return bare Python source from an LLM response, stripping ``` fences.

        Accepts either a plain string or a LangChain message object (uses its
        ``.content``).  Mirrors the original behavior: if a fence marker is
        present but the full fenced pattern does not match, the text is
        returned unmodified (unstripped).
        """
        text = response if isinstance(response, str) else getattr(response, 'content', str(response))
        if "```" in text:
            match = re.search(r'```(?:python)?\n(.*?)\n```', text, re.DOTALL)
            return match.group(1).strip() if match else text
        return text.strip()

    try:
        cleaning_function_code = extract_code(initial_chain.invoke({}))
        print("Initial generated cleaning function code not executed yet is:\n", cleaning_function_code)

        max_attempts = 5

        for attempt in range(max_attempts):
            print(f"Attempt {attempt + 1} code:\n{cleaning_function_code}")
            try:
                # SECURITY NOTE: this exec()s LLM-generated code — acceptable
                # only because the code comes from our own prompt, not user
                # input.  Execute in an isolated namespace (not globals()) so
                # the generated function cannot clobber module state, and so a
                # stale clean_dataframe from an earlier run cannot mask a
                # generation that failed to define the function.  The modules
                # the generated code may reference are seeded explicitly.
                exec_ns = {'pd': pd, 'np': np, 're': re, 'stats': stats, 'SimpleImputer': SimpleImputer}
                exec(cleaning_function_code, exec_ns)

                clean_dataframe = exec_ns.get('clean_dataframe')
                if clean_dataframe is None:
                    raise NameError("Cleaning function not defined in generated code")

                df = clean_dataframe(df)
                print(f"Cleaning successful on attempt {attempt + 1}")
                break

            except Exception as e:
                error_message = str(e)
                print(f"Error on attempt {attempt + 1}: {error_message}")

                if attempt < max_attempts - 1:
                    # Feed the error and the failing code back for repair.
                    refined_response = refine_chain.invoke({"error": error_message, "code": cleaning_function_code})
                    cleaning_function_code = extract_code(refined_response)
                    print("Refined cleaning function code:\n", cleaning_function_code)
                else:
                    print("Failed to clean DataFrame after 5 maximum attempts")
                    df = clean_dataframe_fallback(df)

    except Exception as e:
        # Covers failures outside the retry loop (e.g. the initial generation
        # call itself raising).
        print("⚡No successful cleaning done enforcing fallback")
        df = clean_dataframe_fallback(df)

    cleaned_df = df

    # Ask the LLM for a human-readable quality summary of the cleaned data.
    insights_prompt = f"""
    Analyze this cleaned dataset:
    - Columns: {cleaned_df.columns.tolist()}
    - Sample data: {cleaned_df.head(3).to_dict()}
    - Numeric stats: {cleaned_df.describe().to_dict()}
    Provide key data quality insights and recommendations.
    """

    try:
        insights_response = llm.invoke(insights_prompt)
        analysis_insights = insights_response.content
    except Exception as e:
        # Insights are best-effort; never fail the cleaning because of them.
        analysis_insights = f"Insight generation failed: {str(e)}"

    return cleaned_df, analysis_insights
| |
|