"""CyberScraper 2077 API: FastAPI front-end for AI-powered web scraping.

Exposes a stateless scrape endpoint plus a session-based workflow that
reuses one ``WebExtractor`` (and its underlying browser) across requests.
"""

from typing import Dict, Optional
from uuid import uuid4

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Active persistent scraping sessions, keyed by UUID string.
sessions: Dict[str, WebExtractor] = {}


class ScrapeRequest(BaseModel):
    """Body for both the stateless and the session-based scrape endpoints."""

    url: str
    query: str
    model_name: Optional[str] = "alias-fast"


class SessionCreateRequest(BaseModel):
    """Body for creating a persistent scraping session."""

    model_name: Optional[str] = "alias-fast"


def _make_extractor(model_name: Optional[str]) -> WebExtractor:
    """Build a WebExtractor with the standard headless scraper configuration.

    Shared by the stateless and session endpoints so the scraper settings
    stay in one place.
    """
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5,
    )
    return WebExtractor(model_name=model_name, scraper_config=scraper_config)


async def _close_extractor(extractor: WebExtractor) -> None:
    """Best-effort release of an extractor's browser resources.

    The scraper may not expose ``close`` (duck-typed), so probe before
    calling — mirrors the defensive ``hasattr`` checks the endpoints used.
    """
    scraper = getattr(extractor, "playwright_scraper", None)
    if scraper is not None and hasattr(scraper, "close"):
        await scraper.close()


@app.get("/health")
async def health():
    """Liveness probe: confirm the API process is up."""
    return {"status": "ok", "message": "CyberScraper 2077 API is running"}


@app.get("/api-docs")
async def api_docs():
    """Comprehensive API documentation with examples."""
    return {
        "title": "CyberScraper 2077 API Documentation",
        "version": "1.0.0",
        "description": "Advanced web scraping API with session management and AI-powered content extraction",
        "base_url": "https://grazieprego-scrapling.hf.space",
        "endpoints": {
            "health": {
                "method": "GET",
                "path": "/health",
                "description": "Check if the API is running",
                "response": {
                    "status": "ok",
                    "message": "CyberScraper 2077 API is running",
                },
                "example": "curl https://grazieprego-scrapling.hf.space/health",
            },
            "scrape": {
                "method": "POST",
                "path": "/api/scrape",
                "description": "Stateless scrape request - creates a new extractor for each request",
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query/instruction",
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')",
                },
                "response": {
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content",
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
                    "python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())",
                },
            },
            "create_session": {
                "method": "POST",
                "path": "/api/session",
                "description": "Create a persistent scraping session for multiple requests",
                "request_body": {
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')",
                },
                "response": {
                    "session_id": "string - UUID of the created session",
                    "message": "string - Confirmation message",
                    "model": "string - Model used",
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
                    "python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']",
                },
            },
            "session_scrape": {
                "method": "POST",
                "path": "/api/session/{session_id}/scrape",
                "description": "Scrape using an existing session context (more efficient for multiple requests)",
                "path_parameters": {
                    "session_id": "string - UUID of the session",
                },
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query",
                    "model_name": "string (optional)",
                },
                "response": {
                    "session_id": "string - The session ID",
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content",
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
                    "python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())",
                },
            },
            "close_session": {
                "method": "DELETE",
                "path": "/api/session/{session_id}",
                "description": "Close a session and release resources",
                "path_parameters": {
                    "session_id": "string - UUID of the session to close",
                },
                "response": {
                    "message": "string - Confirmation message",
                    "session_id": "string - The closed session ID",
                },
                "example": {
                    "curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
                    "python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())",
                },
            },
        },
        "usage_guide": {
            "quick_start": [
                "1. Make a simple scrape request to /api/scrape",
                "2. For multiple requests, create a session first",
                "3. Use the session ID for subsequent requests",
                "4. Close sessions when done to free resources",
            ],
            "best_practices": [
                "Use stateless /api/scrape for one-off requests",
                "Use sessions for batch processing multiple URLs",
                "Always close sessions when finished",
                "Handle errors gracefully (500 errors may occur on complex sites)",
                "Set appropriate timeouts for slow-loading pages",
            ],
            "error_handling": {
                "404": "Session not found (for session endpoints)",
                "500": "Internal server error - check the detail message",
                "Common issues": [
                    "URL unreachable or timeout",
                    "JavaScript-heavy sites may require different approaches",
                    "Bot protection may block requests",
                ],
            },
        },
        "integration_examples": {
            "python_script": """
import requests

# Stateless scrape
response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/scrape',
    json={
        'url': 'https://example.com',
        'query': 'Extract all headings and prices'
    }
)
print("Result:", response.json())

# Session-based workflow
session_response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/session',
    json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']

try:
    # Multiple requests using the same session
    for url in ['https://example.com/page1', 'https://example.com/page2']:
        result = requests.post(
            f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
            json={'url': url, 'query': 'Extract product data'}
        )
        print(f"Scraped {url}:", result.json())
finally:
    # Always close the session
    requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
            "javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
    const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, query })
    });
    return await response.json();
}

// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
""",
        },
        "rate_limits": {
            "note": "Rate limits may apply. Please use responsibly.",
            "recommendation": "For high-volume scraping, use session-based approach and implement retry logic",
        },
    }


@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request).

    Raises:
        HTTPException(500): any extraction failure, with the underlying
            error message in ``detail``.
    """
    extractor = _make_extractor(request.model_name)
    try:
        # The extractor takes the URL and the instruction as one query string.
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)
        # csv/excel results come back as a tuple; keep the first part (content).
        if isinstance(response, tuple):
            response = response[0]
        return {
            "url": request.url,
            "query": request.query,
            "response": response,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Single cleanup path for success and failure (the original
        # duplicated the close logic on both branches).
        await _close_extractor(extractor)


@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session and return its UUID.

    Raises:
        HTTPException(500): if the extractor cannot be constructed.
    """
    session_id = str(uuid4())
    try:
        sessions[session_id] = _make_extractor(request.model_name)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}") from e
    return {"session_id": session_id, "message": "Session created", "model": request.model_name}


@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape using an existing session context.

    Raises:
        HTTPException(404): unknown ``session_id``.
        HTTPException(500): any extraction failure.
    """
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    extractor = sessions[session_id]
    try:
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)
        # csv/excel results come back as a tuple; keep the first part (content).
        if isinstance(response, tuple):
            response = response[0]
        return {
            "session_id": session_id,
            "url": request.url,
            "query": request.query,
            "response": response,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources.

    The session is removed from the registry before closing so that a
    failing close cannot leave a half-closed session available for reuse
    (the original deleted it only after a successful close).

    Raises:
        HTTPException(404): unknown ``session_id``.
    """
    extractor = sessions.pop(session_id, None)
    if extractor is None:
        raise HTTPException(status_code=404, detail="Session not found")
    await _close_extractor(extractor)
    return {"message": "Session closed", "session_id": session_id}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)