| import gradio as gr |
| import pandas as pd |
| import duckdb |
| from datasets import load_dataset |
| from huggingface_hub import login |
| import openai |
| import os |
| from typing import Dict, List, Any |
|
|
class SALTAnalytics:
    """Analytics over the SAP SALT dataset, backed by an in-memory DuckDB.

    Workflow: call ``load_salt_dataset()`` once to pull the dataset from
    Hugging Face into a DuckDB table named ``salt_data``, then use
    ``get_predefined_insights()`` for canned reports or
    ``natural_language_query()`` for LLM-generated SQL.
    """

    def __init__(self):
        """Initialize SALT Analytics with an empty in-memory database."""
        self.con = duckdb.connect(':memory:')
        self.data_loaded = False      # flips to True after a successful load
        self.schema_info = ""         # newline-separated "column: type" pairs
        self.available_columns = []   # column names of salt_data, schema order

    def load_salt_dataset(self):
        """Load SAP SALT dataset from Hugging Face into DuckDB.

        Returns:
            A human-readable status string: a success summary listing the
            first column names, or an error message (including guidance for
            the gated-dataset authentication flow).
        """
        if self.data_loaded:
            return "Dataset already loaded!"

        try:
            hf_token = os.getenv('HF_TOKEN')

            # `token` supersedes the removed `use_auth_token` argument in
            # current `datasets` releases; passing True means "use the
            # locally cached Hugging Face credentials".
            dataset = load_dataset(
                "SAP/SALT",
                "joined_table",
                split="train",
                token=hf_token if hf_token else True,
                streaming=False
            )

            df = dataset.to_pandas()

            # Keep the demo responsive: downsample oversized splits.
            if len(df) > 100000:
                df = df.sample(n=50000, random_state=42)

            # DuckDB resolves the local pandas DataFrame `df` by name.
            self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")

            schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
            self.schema_info = "\n".join(f"{col[0]}: {col[1]}" for col in schema_result)
            self.available_columns = [col[0] for col in schema_result]

            self.data_loaded = True

            shown = "\n".join(f"• {col}" for col in self.available_columns[:20])
            more = "\n... and more" if len(self.available_columns) > 20 else ""
            return (
                f"✅ Successfully loaded {len(df)} records into DuckDB\n\n"
                f"📋 Available columns:\n{shown}{more}"
            )

        except Exception as e:
            error_msg = str(e)
            if "gated dataset" in error_msg or "authentication" in error_msg.lower():
                return (
                    f"❌ Authentication Error: {error_msg}\n\n"
                    "To fix this:\n"
                    "1. Go to https://huggingface.co/datasets/SAP/SALT\n"
                    "2. Request access to the dataset\n"
                    "3. Wait for approval\n"
                    "4. Set HF_TOKEN in your Space secrets"
                )
            return f"❌ Error loading dataset: {error_msg}"

    def _find_column(self, *keywords):
        """Return the first column whose upper-cased name contains every keyword, else None."""
        for col in self.available_columns:
            upper = col.upper()
            if all(keyword in upper for keyword in keywords):
                return col
        return None

    def get_predefined_insights(self):
        """Generate predefined analytical insights.

        Returns:
            dict mapping section title -> pandas DataFrame on success,
            or a plain status/error string.
        """
        if not self.data_loaded:
            return "Please load the dataset first"

        try:
            insights = {}

            # Overview: only reference CREATIONDATE when it actually exists,
            # so a schema variation cannot break the entire report.
            if 'CREATIONDATE' in self.available_columns:
                insights['Dataset Overview'] = self.con.execute("""
                    SELECT
                        COUNT(*) as total_records,
                        COUNT(DISTINCT CREATIONDATE) as unique_dates,
                        MIN(CREATIONDATE) as earliest_date,
                        MAX(CREATIONDATE) as latest_date
                    FROM salt_data
                """).fetchdf()
            else:
                insights['Dataset Overview'] = self.con.execute(
                    "SELECT COUNT(*) as total_records FROM salt_data"
                ).fetchdf()

            if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
                insights['Payment Terms Distribution'] = self.con.execute("""
                    SELECT CUSTOMERPAYMENTTERMS,
                           COUNT(*) as frequency,
                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                    FROM salt_data
                    WHERE CUSTOMERPAYMENTTERMS IS NOT NULL AND CUSTOMERPAYMENTTERMS != ''
                    GROUP BY CUSTOMERPAYMENTTERMS
                    ORDER BY frequency DESC
                    LIMIT 10
                """).fetchdf()

            # Column names come from DESCRIBE output, but quote the
            # interpolated identifiers anyway so unusual names cannot
            # break (or alter) the generated SQL.
            sales_office_col = self._find_column('SALES', 'OFFICE')
            if sales_office_col:
                insights['Sales Office Performance'] = self.con.execute(f"""
                    SELECT "{sales_office_col}",
                           COUNT(*) as total_orders
                    FROM salt_data
                    WHERE "{sales_office_col}" IS NOT NULL AND "{sales_office_col}" != ''
                    GROUP BY "{sales_office_col}"
                    ORDER BY total_orders DESC
                    LIMIT 10
                """).fetchdf()

            shipping_col = self._find_column('SHIPPING', 'CONDITION')
            if shipping_col:
                insights['Shipping Conditions'] = self.con.execute(f"""
                    SELECT "{shipping_col}",
                           COUNT(*) as order_count
                    FROM salt_data
                    WHERE "{shipping_col}" IS NOT NULL AND "{shipping_col}" != ''
                    GROUP BY "{shipping_col}"
                    ORDER BY order_count DESC
                    LIMIT 10
                """).fetchdf()

            if 'SALESDOCUMENTITEMCATEGORY' in self.available_columns:
                insights['Sales Document Categories'] = self.con.execute("""
                    SELECT SALESDOCUMENTITEMCATEGORY,
                           COUNT(*) as frequency,
                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                    FROM salt_data
                    WHERE SALESDOCUMENTITEMCATEGORY IS NOT NULL AND SALESDOCUMENTITEMCATEGORY != ''
                    GROUP BY SALESDOCUMENTITEMCATEGORY
                    ORDER BY frequency DESC
                    LIMIT 10
                """).fetchdf()

            insights['Available Columns Sample'] = pd.DataFrame({
                'Column Name': self.available_columns[:20],
                'Index': range(len(self.available_columns[:20]))
            })

            return insights

        except Exception as e:
            # Surface enough context to debug schema mismatches from the UI.
            return (
                f"❌ Error generating insights: {str(e)}\n\n🔍 Debug Info:\n"
                f"Data loaded: {self.data_loaded}\n"
                f"Available columns ({len(self.available_columns)}): "
                f"{', '.join(self.available_columns[:15])}...\n"
                f"Error type: {type(e).__name__}"
            )

    def clean_sql_response(self, sql_query: str) -> str:
        """Strip surrounding Markdown code fences (``` or ```sql) from an LLM reply."""
        # Strip first so fences preceded by whitespace/newlines are still found.
        sql_query = sql_query.strip()
        if sql_query.startswith("```sql"):
            sql_query = sql_query[len("```sql"):]
        elif sql_query.startswith("```"):
            sql_query = sql_query[3:]
        if sql_query.endswith("```"):
            sql_query = sql_query[:-3]
        return sql_query.strip()

    def natural_language_query(self, question: str, api_key: str):
        """Translate a question to SQL, execute it, and explain the result.

        Args:
            question: Plain-English analytics question.
            api_key: OpenAI API key used for SQL generation and explanation.

        Returns:
            Markdown string with the generated SQL, its result table and a
            short business explanation — or an error/status message.
        """
        if not self.data_loaded:
            return "Please load the dataset first"

        if not api_key:
            return "Please provide OpenAI API key"

        try:
            client = openai.OpenAI(api_key=api_key)

            columns_list = ", ".join(self.available_columns[:30])

            prompt = f"""
            You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:

            {columns_list}

            The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.

            IMPORTANT: Use only the column names I provided above. Do not assume column names that don't exist.

            Convert this question to a DuckDB SQL query: "{question}"

            Return ONLY the SQL query, no explanation. Limit results to 20 rows and use WHERE clauses to filter out NULL values.
            """

            response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )

            sql_query = self.clean_sql_response(response.choices[0].message.content.strip())

            # NOTE(review): executes model-generated SQL verbatim. Acceptable
            # for this demo because the DuckDB database is in-memory and
            # holds only the public dataset — do not reuse against live data.
            result_df = self.con.execute(sql_query).fetchdf()

            explanation_prompt = f"""
            Question: {question}
            Results: {result_df.head(10).to_string()}

            Provide a clear business explanation of these SAP ERP results in 2-3 sentences, focusing on actionable insights for sales operations.
            """

            explanation_response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": explanation_prompt}],
                temperature=0.3
            )

            explanation = explanation_response.choices[0].message.content

            fence = "```"
            return (
                f"**SQL Query:**\n{fence}sql\n{sql_query}\n{fence}\n\n"
                f"**Results:**\n{result_df.to_string(index=False)}\n\n"
                f"**Explanation:**\n{explanation}"
            )

        except Exception as e:
            return (
                f"Error: {str(e)}\n\nTry rephrasing your question. "
                f"Available columns: {', '.join(self.available_columns[:10])}..."
            )
|
|
| |
# Single shared analytics instance; all Gradio callbacks below close over it.
analytics = SALTAnalytics()
|
|
def load_dataset_interface():
    """Gradio callback: trigger the (idempotent) SALT dataset load."""
    status_message = analytics.load_salt_dataset()
    return status_message
|
|
def show_insights_interface():
    """Render the predefined insights as one Markdown document for the Insights tab.

    Returns:
        Markdown string with one section per insight, or the plain
        status/error string produced by the analytics layer.
    """
    insights = analytics.get_predefined_insights()

    # Error and "not loaded" conditions arrive as plain strings; show verbatim.
    if isinstance(insights, str):
        return insights

    sections = ["# 📊 SAP SALT Dataset Insights\n\n"]

    for title, df in insights.items():
        sections.append(f"## {title}\n\n")
        if isinstance(df, pd.DataFrame) and len(df) > 0:
            try:
                sections.append(df.to_markdown(index=False))
            except ImportError:
                # DataFrame.to_markdown needs the optional `tabulate`
                # package; fall back to a plain-text table, not a crash.
                sections.append(f"```\n{df.to_string(index=False)}\n```")
        else:
            sections.append("*No data available for this analysis*")
        sections.append("\n\n---\n\n")

    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(sections)
|
|
def qa_interface(question: str, api_key: str):
    """Gradio callback: route a non-blank question to the NL-to-SQL engine."""
    if question.strip():
        return analytics.natural_language_query(question, api_key)
    return "Please enter a question"
|
|
# Canned example questions for the AI Q&A tab; choosing one from the dropdown
# copies it into the free-text question box via the dropdown's change handler.
sample_questions = [
    "Which sales offices process the most orders?",
    "What are the most common payment terms?",
    "Show me the distribution of shipping conditions",
    "What is the date range of orders in the dataset?",
    "Which document categories are most frequent?"
]
|
|
# Gradio UI definition: four tabs (load, insights, AI Q&A, about), wired to
# the module-level `analytics` instance through the handler functions above.
with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # π SAP SALT Dataset Analytics Demo
    ## Open Source Analytics + AI for SAP ERP

    This demo uses the **authentic SAP SALT dataset** - real ERP data from sales orders, items, customers, and addresses.
    """)

    with gr.Tab("π₯ Load Dataset"):
        gr.Markdown("### Load SAP SALT Dataset from Hugging Face")

        load_btn = gr.Button("Load SALT Dataset", variant="primary")
        load_output = gr.Textbox(label="Status", lines=8)

        load_btn.click(fn=load_dataset_interface, outputs=load_output)

    with gr.Tab("π Insights"):
        gr.Markdown("### Pre-built Analytics Insights")

        insights_btn = gr.Button("Generate Insights", variant="primary")
        # Markdown component so to_markdown() tables render as tables.
        insights_output = gr.Markdown()

        insights_btn.click(fn=show_insights_interface, outputs=insights_output)

    with gr.Tab("π€ AI Q&A"):
        gr.Markdown("### Ask Questions in Natural Language")

        with gr.Row():
            with gr.Column(scale=3):
                api_key_input = gr.Textbox(
                    label="OpenAI API Key",
                    type="password",
                    placeholder="Enter your OpenAI API key"
                )

                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="e.g., Which sales offices process the most orders?",
                    lines=2
                )

                sample_dropdown = gr.Dropdown(
                    choices=sample_questions,
                    label="Or choose a sample question",
                    value=None
                )

                ask_btn = gr.Button("Get Answer", variant="primary")

            with gr.Column(scale=4):
                qa_output = gr.Markdown()

        # Selecting a sample question copies it into the question box;
        # clearing the dropdown leaves the box empty.
        sample_dropdown.change(
            fn=lambda x: x if x else "",
            inputs=sample_dropdown,
            outputs=question_input
        )

        ask_btn.click(
            fn=qa_interface,
            inputs=[question_input, api_key_input],
            outputs=qa_output
        )

    with gr.Tab("βΉοΈ About"):
        gr.Markdown("""
        ### About the SALT Dataset

        **SAP SALT** (Sales Autocompletion Linked Business Tables) contains:
        - **500,908 sales orders** from real SAP S/4HANA system
        - **2.3M sales order line items**
        - **139,611 unique customers**
        - **Data from 2018-2020** with full business context

        **Key Use Cases:**
        - Sales process automation (70-80% accuracy)
        - Customer behavior analysis
        - Shipping and logistics optimization
        - Payment terms prediction

        **Technology Stack:**
        - **DuckDB**: High-performance analytics
        - **OpenAI GPT-4**: Natural language to SQL
        - **Gradio**: Interactive interface
        - **Real ERP Data**: Authentic business scenarios

        This demonstrates how **open source tools** can unlock massive value from enterprise SAP systems at zero licensing cost.
        """)
|
|
# Entry point: launch the Gradio server when run directly (e.g. in a Space).
if __name__ == "__main__":
    demo.launch()
|
|