| | import os |
| | from scrapingbee import ScrapingBeeClient |
| | from logger import setup_logger |
| | import json |
| |
|
# Module-level singletons shared by every function in this scraper module.
logger = setup_logger("scraper")

# ScrapingBee HTTP client. Falls back to an empty API key when the
# SCRAPINGBEE_API_KEY environment variable is unset — requests made with an
# empty key will fail at call time rather than at import time.
client = ScrapingBeeClient(api_key=os.getenv('SCRAPINGBEE_API_KEY', ''))
| |
|
def scrape_url(url: str) -> str:
    """
    Scrape content from a URL using ScrapingBee with AI extraction.

    Errors are reported in-band: the function never raises; every failure
    path returns a human-readable message string instead.

    Args:
        url: The URL to scrape

    Returns:
        str: Extracted text content or error message
    """
    try:
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Scraping URL: %s", url)
        # Keep the try body minimal — only the network call can raise.
        response = client.get(
            url,
            params={
                # stealth_proxy + country_code reduce the chance of being
                # blocked; ai_query asks ScrapingBee to extract main text only.
                'stealth_proxy': True,
                'country_code': 'us',
                'ai_query': 'Extract the main text content from this page'
            }
        )
    except Exception as e:
        # logger.exception attaches the traceback automatically inside an
        # except block (equivalent to error(..., exc_info=True)).
        logger.exception("Error scraping URL: %s", url)
        return f"Error scraping the URL: {str(e)}"

    if response.status_code == 200:
        logger.info("Successfully scraped URL: %s", url)
        # An empty body is reported in-band, consistent with other failures.
        return response.text if response.text else "No content could be extracted from the URL"

    logger.error("Failed to scrape URL: %s, Status: %s", url, response.status_code)
    return f"Failed to download the URL. Status code: {response.status_code}"