"""Synthetic test data generator: Groq LLM client and data-generation core."""

from datetime import datetime
import json
import os
import re
import traceback

import gradio as gr
from groq import Groq

# --- 1. API KEY ---
# NOTE: the variable name 'fristapi' is what deployments already set; it is
# kept verbatim so existing environments keep working.
api_key_coder = os.environ.get('fristapi')
if not api_key_coder:
    raise ValueError("Groq API key not found. Set fristapi environment variable.")


# --- 2. LLM CLIENT ---
class GroqLLM:
    """Minimal wrapper around the Groq chat-completions client."""

    def __init__(self, api_key, model="llama-3.3-70b-versatile", temperature=0.7):
        """Create the Groq client and remember the model settings.

        Args:
            api_key: Groq API key passed to ``Groq(api_key=...)``.
            model: Model identifier sent with every request.
            temperature: Sampling temperature sent with every request.
        """
        self.client = Groq(api_key=api_key)
        self.model = model
        self.temperature = temperature

    def invoke(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Failures are not raised: any exception is converted into a string
        starting with ``"LLM Error: "`` so UI callbacks always get text back.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=4000,
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"LLM Error: {str(e)}"


llm = GroqLLM(api_key=api_key_coder)


class SyntheticDataGenerator:
    """Generates synthetic test data in multiple formats"""

    def __init__(self, llm):
        """Store the LLM client; it only needs an ``invoke(prompt)`` method."""
        self.llm = llm

    def generate_data(self, schema_description, data_type="json", language="Python", record_count=5):
        """Generate synthetic test data based on a schema description.

        Args:
            schema_description: Schema as text, or a dict/list that is
                serialised to indented JSON before being embedded.
            data_type: Key selecting the output format (``json``, ``csv``,
                ``sql``, ``xml``, ``python_dict``, ``yaml``, ``excel_style``).
                Unknown keys fall back to ``"JSON"``.
            language: Programming context mentioned in the prompt.
            record_count: Number of records requested from the model.

        Returns:
            Whatever ``self.llm.invoke`` returns for the built prompt.
        """
        # Ensure schema_description is safely embedded
        if isinstance(schema_description, (dict, list)):
            schema_description_str = json.dumps(schema_description, indent=2)
        else:
            schema_description_str = schema_description

        # Map data types to formats
        format_map = {
            "json": "JSON (array of objects)",
            "csv": "CSV (comma-separated values with headers)",
            "sql": "SQL INSERT statements",
            "xml": "XML document",
            "python_dict": "Python list of dictionaries",
            "yaml": "YAML format",
            "excel_style": "Table format (markdown)",
        }
        output_format = format_map.get(data_type, "JSON")

        # Language tag for the fenced data block in the response.
        fence_lang = 'markdown' if data_type == 'excel_style' else data_type

        # The prompt body is kept at column 0 so the model does not receive
        # source-code indentation. The data fence is explicitly closed so the
        # generated data can be located in the response.
        prompt = f"""
You are a TEST DATA ENGINEER specializing in synthetic data generation.
Generate realistic, diverse, and comprehensive test data.

=====================
CONFIGURATION
=====================
Data Schema:
{schema_description_str}

Output Format: {output_format}
Number of Records: {record_count}
Programming Context: {language}

=====================
REQUIREMENTS
=====================
1. Generate {record_count} unique, realistic records
2. Include edge cases and boundary values
3. Ensure data variety (different types, values)
4. Make data look production-like
5. Include at least one record with null/empty values (if applicable)
6. Include at least one record with extreme values

=====================
OUTPUT STRUCTURE
=====================
Generate the response with:

## 📊 Data Generation Summary
- **Schema**: [brief description]
- **Format**: {output_format}
- **Records**: {record_count}
- **Language Context**: {language}

## 📝 Generated Test Data
```{fence_lang}
[generated data here]
```

## 🔍 Data Quality Notes
- [Special cases included]
- [Edge values]
- [Validation notes]

## 💡 Usage Example
[How to use this data in {language} tests]
"""

        # Invoke LLM
        return self.llm.invoke(prompt)

    # ============================================
    # Synthetic Data Generator: Bulk Function
    # ============================================
    def generate_bulk_data(self, schema_description, formats=None, record_counts=None):
        """Generate data in multiple formats at once.

        Args:
            schema_description: Schema forwarded to ``generate_data``.
            formats: Format keys to generate; defaults to json, csv and sql.
            record_counts: Record counts paired with ``formats`` by position.
                A single count applies to every format; if there are fewer
                counts than formats the last count is reused. ``None`` or an
                empty list selects the defaults ``[3, 5, 10]``.

        Returns:
            Dict mapping each format key to its ``generate_data`` result.
        """
        # Default formats
        if formats is None:
            formats = ["json", "csv", "sql"]

        # Default record counts (also covers an empty list, which previously
        # raised IndexError)
        if not record_counts:
            record_counts = [3, 5, 10]

        results = {}
        # Pair by position rather than formats.index(): index() returns the
        # first occurrence for duplicated formats and overruns a shorter
        # record_counts list.
        for position, format_type in enumerate(formats):
            count = record_counts[min(position, len(record_counts) - 1)]

            # Generate data using the main generator
            results[format_type] = self.generate_data(
                schema_description=schema_description,
                data_type=format_type,
                record_count=count,
            )

        return results


# Initialize the synthetic data generator
synthetic_generator = SyntheticDataGenerator(llm)

# ============================================
# Template Functions
# ============================================
def load_schema_template(template_name):
    """Return the JSON schema text for a given template name.

    Unknown names fall back to the ``custom`` template.
    """
    templates = {
        "custom": '''{
  "user_id": "uuid",
  "full_name": "name",
  "email": "email",
  "age": {"type": "int", "min": 18, "max": 99},
  "is_active": "boolean",
  "country": "country"
}''',
        "user_profile": '''{
  "user_id": "integer",
  "username": "string (3-20 chars)",
  "email": "email format",
  "age": "integer (18-80)",
  "country": "ISO country code",
  "is_premium": "boolean",
  "registration_date": "date (last 2 years)",
  "last_login": "datetime"
}''',
        "ecommerce_order": '''{
  "order_id": "string (format: ORD-XXXX)",
  "customer_name": "string",
  "email": "email",
  "items": [
    {
      "product_id": "string",
      "quantity": "integer (1-10)",
      "price": "decimal (10.99-999.99)"
    }
  ],
  "total_amount": "decimal",
  "status": "enum(pending,shipped,delivered,cancelled)",
  "order_date": "datetime"
}''',
        "api_request_log": '''{
  "request_id": "uuid",
  "endpoint": "string",
  "method": "enum(GET,POST,PUT,DELETE)",
  "status_code": "integer (200,400,401,404,500)",
  "response_time_ms": "integer (50-5000)",
  "timestamp": "datetime",
  "user_agent": "string",
  "ip_address": "ipv4"
}''',
        "contact_info": '''{
  "contact_id": "integer",
  "first_name": "string",
  "last_name": "string",
  "email": "email",
  "phone": "phone number",
  "address": {
    "street": "string",
    "city": "string",
    "state": "string",
    "zip_code": "string",
    "country": "string"
  },
  "company": "string",
  "notes": "string (optional)"
}''',
        "financial_transaction": '''{
  "transaction_id": "uuid",
  "account_number": "string (format: ****1234)",
  "transaction_type": "enum(deposit,withdrawal,transfer,payment)",
  "amount": "decimal (0.01-10000.00)",
  "currency": "enum(USD,EUR,GBP,JPY)",
  "timestamp": "datetime",
  "status": "enum(pending,completed,failed,reversed)",
  "description": "string"
}''',
        "healthcare_patient": '''{
  "patient_id": "string (format: PAT-XXXX)",
  "first_name": "string",
  "last_name": "string",
  "date_of_birth": "date (1950-2020)",
  "gender": "enum(M,F,Other)",
  "blood_type": "enum(A+,A-,B+,B-,AB+,AB-,O+,O-)",
  "allergies": "array of strings",
  "medications": "array of strings",
  "last_visit": "datetime",
  "insurance_provider": "string"
}''',
    }

    # Return the requested template, default to 'custom'
    return templates.get(template_name, templates["custom"])


def get_output_format_description(output_type):
    """Return a human-readable description for a given output format type."""
    descriptions = {
        "json": "📄 JSON format - Array of objects, ideal for API testing and data interchange",
        "csv": "📊 CSV format - Comma-separated values, perfect for data import and spreadsheet analysis",
        "sql": "🗄ī¸ SQL format - INSERT statements with CREATE TABLE, ready for database seeding",
        "xml": "📋 XML format - Structured document format for enterprise systems",
        "python_dict": "🐍 Python format - List of dictionaries, ready to use in Python tests",
        "yaml": "âš™ī¸ YAML format - Human-readable configuration format",
        "excel_style": "📑 Table format - Markdown table, easy to read and document",
    }

    # Return description if found, otherwise default message
    return descriptions.get(output_type, "📄 Standard format for test data")


# ============================================
# Gradio UI Functions
# ============================================
def _clamp_record_count(record_count):
    """Coerce the UI record count to an int in [1, 100]; default to 5 if unparsable."""
    try:
        count = int(record_count)
    except (TypeError, ValueError, OverflowError):
        return 5
    return max(1, min(count, 100))


def _extract_fenced_block(text, fence_lang):
    """Return the body of the fenced block tagged ``fence_lang``, or None.

    Falls back to the first fenced block of any language, because models do
    not always echo the requested tag (e.g. ``python`` for ``python_dict``).
    """
    tagged = re.search(
        rf'```{re.escape(fence_lang)}[ \t]*\r?\n(.*?)```',
        text,
        re.DOTALL | re.IGNORECASE,
    )
    match = tagged or re.search(r'```[^\n`]*\n(.*?)```', text, re.DOTALL)
    if match is None:
        return None
    return match.group(1).strip('\n')


def _build_code_template(language, data_format, generated_data):
    """Build the usage snippet shown next to the generated data."""
    snippet = generated_data[:500]
    # Literal embedded in the Python-style templates; marked when shortened.
    literal = snippet + '...' if len(generated_data) > 500 else generated_data

    usage_examples = {
        "json": f'''# Generated test data for {language}
# Created with Synthetic Data Generator
import json

test_data = {literal}

for record in test_data:
    # Your test logic here
    print(f"Processing: {{record}}")
    # Example validation
    assert 'user_id' in record
    assert 'email' in record
''',
        "python_dict": f'''# Generated test data for {language}
# Test data as Python dictionary/list
test_data = {literal}

for record in test_data:
    print(f"Testing with: {{record.get('name', 'N/A')}}")
''',
        "csv": f'''# Generated test data for {language}
import csv
from io import StringIO

csv_data = """{snippet}"""

reader = csv.DictReader(StringIO(csv_data))
for row in reader:
    print(f"Processing row: {{row}}")
''',
        "sql": f'''# Generated test data for {language}
import sqlite3

sql_statements = """{snippet}"""

conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
for statement in sql_statements.split(';'):
    if statement.strip():
        cursor.execute(statement)
conn.commit()
print("Database seeded successfully!")
''',
        "xml": f'''# Generated test data for {language}
import xml.etree.ElementTree as ET

xml_data = """{snippet}"""

root = ET.fromstring(xml_data)
for record in root:
    print(f"Record: {{record.tag}}")
''',
        "yaml": f'''# Generated test data for {language}
import yaml

yaml_data = """{snippet}"""

test_data = yaml.safe_load(yaml_data)
for record in test_data:
    print(f"Testing with: {{record}}")
''',
    }

    fallback = f'''# Generated test data for {language}
# Generated Data:
{snippet}
# Use this data in your tests as needed'''
    return usage_examples.get(data_format, fallback)


def _build_stats(count, language, data_format, template_name):
    """Render the Markdown statistics panel for a successful generation."""
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return f"""### 📊 Generation Statistics

| Metric | Value |
|---|---|
| Records Generated | {count} |
| Language | {language} |
| Format | {data_format.upper()} |
| Template | {template_name} |
| Status | ✅ Success |
| Generated | {generated_at} |

**Data Quality Notes:**
- ✅ Realistic test data created
- ✅ Includes edge cases and variations
- ✅ Ready for immediate use in tests
- ✅ Follows schema specifications

**Special Cases Included:**
- Boundary values
- Edge cases
- Null/empty values (where applicable)
- Extreme values
"""


def generate_synthetic_data(language, schema_json, record_count, data_format, template_name):
    """Main generation function for Gradio.

    Args:
        language: Target language shown in the prompt and code template.
        schema_json: Schema text from the UI textbox.
        record_count: Requested record count; clamped to 1..100, 5 if invalid.
        data_format: Output format key (``json``, ``csv``, ...).
        template_name: Selected template name, reported in the statistics.

    Returns:
        A 4-tuple ``(data, code_template, stats_markdown, schema_json)``
        matching the outputs wired to the Generate button. Errors are
        reported inside the tuple rather than raised.
    """
    try:
        # Validate schema
        if not schema_json or not schema_json.strip():
            return (
                "### âš ī¸ No Data Generated\n\nPlease provide a schema definition.",
                "# No data generated\nPlease provide a schema definition.",
                "### 📊 Generation Statistics\nNo data generated yet",
                schema_json,
            )

        # Parse record count
        count = _clamp_record_count(record_count)

        # Generate synthetic data
        result = synthetic_generator.generate_data(
            schema_description=schema_json,
            data_type=data_format,
            language=language,
            record_count=count,
        )
        # The model may return no content at all; normalise to text.
        if not isinstance(result, str):
            result = "" if result is None else str(result)

        # The LLM client reports failures as text with this prefix; surface
        # them as a failure instead of presenting them as generated data.
        if result.startswith("LLM Error:"):
            return (
                result,
                f"# Error generating data\n\n{result}",
                f"### ❌ Generation Failed\n\n{result}",
                schema_json,
            )

        # Extract data from result
        fence_lang = 'markdown' if data_format == 'excel_style' else data_format
        generated_data = _extract_fenced_block(result, fence_lang)

        if generated_data is None:
            # No data found in expected format, return raw result
            if len(result) > 2000:
                shown = result[:2000] + "\n\n... (truncated)"
            else:
                shown = result
            return (
                shown,
                f"# Generated Output\n\n{result}",
                "### âš ī¸ Generation completed but format may need review\n\nThe data may not be in the expected format. Please check the output above.",
                schema_json,
            )

        return (
            generated_data,
            _build_code_template(language, data_format, generated_data),
            _build_stats(count, language, data_format, template_name),
            schema_json,
        )

    except Exception as e:
        # Top-level UI boundary: report the failure in the panels.
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}\n"
        return (
            error_msg,
            f"# Error generating data\n\n{str(e)}",
            f"### ❌ Generation Failed\n\nError: {str(e)}",
            schema_json,
        )


def preview_schema(schema_json):
    """Preview and validate schema.

    Returns Markdown: a JSON preview when the text parses as JSON, otherwise
    the first 300 characters treated as a free-text schema description.
    """
    try:
        if not schema_json or not schema_json.strip():
            return "âš ī¸ No schema provided"

        # Try to parse as JSON for validation
        try:
            parsed = json.loads(schema_json)
        except json.JSONDecodeError:
            return f"""### 📝 Schema Description (Text Format)

{schema_json[:300]}

**Note:** The schema is in text format. The AI will interpret and generate appropriate test data.
"""

        structure = json.dumps(parsed, indent=2, ensure_ascii=False)[:500]
        field_count = len(parsed) if isinstance(parsed, dict) else 'Multiple records'
        return f"""### ✅ Valid JSON Schema

**Structure Preview:**

```json
{structure}
```

**Fields Detected:** {field_count}

This schema will generate realistic test data based on the field definitions.
"""
    except Exception as e:
        return f"❌ Error previewing schema: {str(e)}"


# ============================================
# Gradio UI (Standalone with Home Tab)
# ============================================
with gr.Blocks(title="Synthetic Test Data Generator") as demo:
    gr.Markdown("# 🎲 Synthetic Test Data Generator")
    gr.Markdown("### Create Realistic Test Data for Your Applications with AI")

    # ============================================
    # Tab 0: Home
    # ============================================
    with gr.Tab("🏠 Home"):
        gr.Markdown("""
        🎯 Welcome to Synthetic Test Data Generator

        ... (Ø§Ų„Ų…Ø­ØĒŲˆŲ‰)
        """)

        # Stats and Use Cases
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📊 Quick Stats")
                gr.Markdown("""
                - Formats Supported: 7
                - Max Records: 100 per generation
                - Templates: 7 pre-built
                - Languages: 4
                """)
            with gr.Column():
                gr.Markdown("### 🎯 Use Cases")
                gr.Markdown("""
                - API Testing
                - Database Seeding
                - Load Testing
                - UI Testing
                - Integration Tests
                - Demo Data Creation
                """)

        # Quick Action Button
        gr.Markdown("---")
        gr.Markdown("### 🔧 Quick Action")
        quick_generate_btn = gr.Button("🚀 Go to Generator Tab", variant="primary")
        # Client-side only: clicks the Generator tab button in the browser.
        quick_generate_btn.click(
            fn=None,
            inputs=[],
            outputs=[],
            js='() => { document.querySelector(\'[data-testid="tab-Generator"]\').click(); }'
        )

    # ============================================
    # Tab 1: Generator (Main) - must be inside the demo block
    # ============================================
    with gr.Tab("Generator") as generator_tab:
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### âš™ī¸ Configuration")

                # Language Selection
                data_language = gr.Dropdown(
                    choices=["python", "javascript", "java", "csharp"],
                    value="python",
                    label="🎯 Target Language"
                )

                # Format Selection
                data_format = gr.Dropdown(
                    choices=["json", "csv", "sql", "xml", "python_dict", "yaml", "excel_style"],
                    value="json",
                    label="📄 Output Format"
                )

                # Format description
                format_desc = gr.Markdown("Format Info: " + get_output_format_description("json"))

                # Record count
                data_count = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=10,
                    step=1,
                    label="📊 Number of Records"
                )

                gr.Markdown("### 📝 Schema Definition")

                # Template selection
                data_template = gr.Dropdown(
                    choices=[
                        "custom",
                        "user_profile",
                        "ecommerce_order",
                        "api_request_log",
                        "contact_info",
                        "financial_transaction",
                        "healthcare_patient"
                    ],
                    value="custom",
                    label="📁 Schema Template"
                )

                # Schema input (initial text mirrors the 'custom' template)
                schema_input = gr.Textbox(
                    value='''{
  "user_id": "uuid",
  "full_name": "name",
  "email": "email",
  "age": {"type": "int", "min": 18, "max": 99},
  "is_active": "boolean",
  "country": "country"
}''',
                    lines=15,
                    label="Schema Definition"
                )

                # Schema preview
                preview_btn = gr.Button("🔍 Preview Schema", size="sm")
                schema_preview = gr.Markdown("")

                # Actions
                generate_btn = gr.Button("✨ Generate Test Data", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📊 Generation Statistics")
                data_stats = gr.Markdown("No data generated yet")

                gr.Markdown("### 📝 Generated Data")
                data_output = gr.Code(label="Test Data", language="json", lines=15)

                gr.Markdown("### đŸ’ģ Ready-to-use Code Template")
                code_output = gr.Code(label="Code Template", language="python", lines=12)

        # Helper function definitions
        def update_code_language(format_type):
            """Switch the data viewer's syntax highlighting to match the format."""
            lang_map = {
                "json": "json",
                "csv": "csv",
                "sql": "sql",
                "xml": "xml",
                "python_dict": "python",
                "yaml": "yaml",
                "excel_style": "markdown"
            }
            return gr.update(language=lang_map.get(format_type, "text"))

        # Event Handlers
        data_template.change(
            fn=load_schema_template,
            inputs=[data_template],
            outputs=[schema_input]
        )

        preview_btn.click(
            fn=preview_schema,
            inputs=[schema_input],
            outputs=[schema_preview]
        )

        generate_btn.click(
            fn=generate_synthetic_data,
            inputs=[data_language, schema_input, data_count, data_format, data_template],
            outputs=[data_output, code_output, data_stats, schema_input]
        )

        data_format.change(
            fn=get_output_format_description,
            inputs=[data_format],
            outputs=[format_desc]
        )

        data_format.change(
            fn=update_code_language,
            inputs=[data_format],
            outputs=[data_output]
        )

    # ============================================
    # Tab 2: Templates Library
    # ============================================
    with gr.Tab("📚 Templates Library"):
        gr.Markdown("# 📚 Schema Templates Library")
        gr.Markdown("Browse and learn from these example schemas")

        template_selector = gr.Dropdown(
            choices=[
                "user_profile",
                "ecommerce_order",
                "api_request_log",
                "contact_info",
                "financial_transaction",
                "healthcare_patient"
            ],
            value="user_profile",
            label="Select Template to View"
        )

        template_display = gr.Code(label="Template Schema", language="json", lines=20)
        template_description = gr.Markdown("")

        def show_template_details(template_name):
            """Return ``(schema_text, description_markdown)`` for a template.

            The schema always comes from ``load_schema_template``; an unknown
            name gets the ``user_profile`` description.
            """
            descriptions = {
                "user_profile": "User Profile Schema\n\nPerfect for testing user management systems, authentication, and profile features.\n\nUse Cases:\n- User registration tests\n- Profile management\n- User search/filtering\n- Data export features",
                "ecommerce_order": "E-commerce Order Schema\n\nIdeal for testing shopping carts, checkout flows, and order management systems.\n\nUse Cases:\n- Order creation tests\n- Cart calculations\n- Inventory management\n- Order history features",
                "api_request_log": "API Request Log Schema\n\nGreat for testing logging systems, analytics, and monitoring tools.\n\nUse Cases:\n- API monitoring tests\n- Log analysis\n- Performance metrics\n- Error tracking",
                "contact_info": "Contact Information Schema\n\nUseful for CRM systems, address books, and contact management features.\n\nUse Cases:\n- Contact import/export\n- Address validation\n- Phone number formatting\n- Data enrichment tests",
                "financial_transaction": "Financial Transaction Schema\n\nEssential for banking, payment systems, and financial applications.\n\nUse Cases:\n- Payment processing tests\n- Transaction history\n- Fraud detection\n- Statement generation",
                "healthcare_patient": "Healthcare Patient Schema\n\nDesigned for healthcare systems, patient management, and medical records.\n\nUse Cases:\n- Patient registration\n- Medical records\n- Appointment scheduling\n- Insurance verification",
            }
            schema = load_schema_template(template_name)
            desc = descriptions.get(template_name, descriptions["user_profile"])
            return schema, desc

        template_selector.change(
            fn=show_template_details,
            inputs=[template_selector],
            outputs=[template_display, template_description]
        )

        # Load initial template
        initial_schema, initial_desc = show_template_details("user_profile")
        template_display.value = initial_schema
        template_description.value = initial_desc

        gr.Markdown("---")
        gr.Markdown("### 💡 Tips for Creating Custom Schemas")
        gr.Markdown("""
        - Use descriptive field names - The AI understands context (e.g., 'email' vs 'e')
        - Specify data types - Include hints like 'string', 'integer', 'decimal', 'date'
        - Add constraints - Use ranges like 'age: 18-80' or patterns like 'email format'
        - Include enums - For fields with limited options (e.g., 'status: pending,active,closed')
        - Structure nested data - Use JSON objects or arrays for complex relationships
        """)

    # ============================================
    # Tab 3: About & Help
    # ============================================
    with gr.Tab("â„šī¸ About & Help"):
        gr.Markdown("""
        â„šī¸ **About Synthetic Test Data Generator**

        **What is this tool?**

        This tool uses AI (Groq's LLM) to generate realistic, production-like test data based on your schema definitions. It helps QA engineers and developers create comprehensive test datasets quickly and efficiently.

        ... (Ø¨Ø§Ų‚ŲŠ Ø§Ų„Ų…Ø­ØĒŲˆŲ‰)
        """)

# ============================================
# Launch Application
# ============================================
if __name__ == "__main__":
    print("=" * 60)
    print("🎲 Synthetic Test Data Generator")
    print("=" * 60)
    print("✅ All components loaded successfully!")
    print("=" * 60)
    print("🚀 Launching application...")
    print("=" * 60)

    demo.launch(
        share=True,
        debug=False,
        server_name="0.0.0.0",
        server_port=7860,
        theme=gr.themes.Soft()
    )