Spaces:
Running
Running
<!doctype html>
<!-- The doctype puts browsers in standards mode; without it the page renders in quirks mode. -->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Advanced Web Scraper with Crawlee &amp; Playwright</title>
  <!-- Tailwind Play CDN: generates the utility classes at runtime (development/demo use). -->
  <script src="https://cdn.tailwindcss.com"></script>
  <!-- Font Awesome 6.4.0 supplies the "fas"/"fab" icon classes used across the page. -->
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
  /* ---------- Loading spinner ---------- */
  @keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
  }

  .loading-spinner {
    width: 30px;
    height: 30px;
    margin: 0 auto;
    border: 4px solid rgba(0, 0, 0, 0.1);
    /* Must stay after `border` so only the top edge gets the accent colour. */
    border-top: 4px solid #3b82f6;
    border-radius: 50%;
    animation: spin 1s linear infinite;
  }

  /* ---------- Result cards ---------- */
  .data-card {
    margin-bottom: 1rem;
    padding: 1rem;
    border: 1px solid #e2e8f0;
    border-radius: 0.5rem;
    background-color: #f8fafc;
  }

  /* Base chip; the *-chip modifiers below override its background colour. */
  .data-highlight {
    display: inline-block;
    margin: 0.1rem;
    padding: 0.2rem 0.4rem;
    border-radius: 0.25rem;
    background-color: #e0f7fa;
    font-family: monospace;
    word-break: break-all;
  }

  .email-chip {
    color: #0d47a1;
    background-color: #e3f2fd;
  }

  .phone-chip {
    color: #1b5e20;
    background-color: #e8f5e9;
  }

  .name-chip {
    color: #4a148c;
    background-color: #f3e5f5;
  }

  .link-chip {
    color: #e65100;
    background-color: #fff3e0;
  }

  /* Heading outline: preformatted so the tree indentation is kept. */
  .site-tree {
    padding: 1rem;
    overflow-x: auto;
    border: 1px solid #e2e8f0;
    border-radius: 0.25rem;
    background-color: #f5f5f5;
    font-family: monospace;
    white-space: pre;
  }

  /* ---------- Progress bar ---------- */
  .progress-bar {
    height: 6px;
    margin-top: 1rem;
    overflow: hidden;
    border-radius: 3px;
    background-color: #e2e8f0;
  }

  /* The script sets `width` inline; the transition animates each step. */
  .progress-fill {
    width: 0%;
    height: 100%;
    background-color: #3b82f6;
    transition: width 0.3s ease;
  }

  /* ---------- Statistics ---------- */
  .stats-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(120px, 1fr));
    gap: 1rem;
    margin-bottom: 1.5rem;
  }

  .stat-card {
    padding: 1rem;
    border-radius: 0.5rem;
    background-color: #fff;
    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
    text-align: center;
  }

  .stat-value {
    color: #3b82f6;
    font-size: 1.5rem;
    font-weight: 700;
  }

  .stat-label {
    color: #64748b;
    font-size: 0.75rem;
    letter-spacing: 0.05em;
    text-transform: uppercase;
  }
</style>
| </head> | |
| <body class="bg-gray-50 min-h-screen"> | |
| <div class="container mx-auto px-4 py-8"> | |
<!-- Page banner. The spider icon is decorative, so it is hidden from assistive technology. -->
<header class="text-center mb-12">
  <h1 class="text-4xl font-bold text-blue-600 mb-2">
    <i class="fas fa-spider mr-2" aria-hidden="true"></i> Advanced Web Scraper
  </h1>
  <p class="text-gray-600 max-w-2xl mx-auto">
    Professional web scraping with Crawlee &amp; Playwright simulation
  </p>
</header>
| <div class="bg-white rounded-xl shadow-lg p-6 max-w-4xl mx-auto mb-12"> | |
| <form id="crawlerForm" class="space-y-6"> | |
<!-- Start URLs: one URL per line; the submit handler splits this value on newlines. -->
<div>
  <label for="urls" class="block text-sm font-medium text-gray-700 mb-1">
    <i class="fas fa-link mr-1 text-blue-500" aria-hidden="true"></i> Start URLs (one per line)
  </label>
  <!-- &#10; is a line break, so the placeholder shows one example URL per line like the label says. -->
  <textarea id="urls" name="urls" rows="5"
    class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition font-mono"
    placeholder="https://example.com&#10;https://another-site.com"
    spellcheck="false" autocapitalize="off" required>https://example.com</textarea>
</div>
<!-- Crawl limits. The number inputs are required so the script never receives an empty value (NaN). -->
<div class="grid grid-cols-1 md:grid-cols-3 gap-4">
  <div>
    <label for="depth" class="block text-sm font-medium text-gray-700 mb-1">
      <i class="fas fa-layer-group mr-1 text-blue-500" aria-hidden="true"></i> Crawl Depth
    </label>
    <select id="depth" name="depth"
      class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
      <option value="1">1 (Single page only)</option>
      <option value="2" selected>2 (Page + 1 level deep)</option>
      <option value="3">3 (Page + 2 levels deep)</option>
      <option value="5">5 (Deep crawl)</option>
    </select>
  </div>
  <div>
    <label for="maxPages" class="block text-sm font-medium text-gray-700 mb-1">
      <i class="fas fa-file-alt mr-1 text-blue-500" aria-hidden="true"></i> Max Pages to Crawl
    </label>
    <input type="number" id="maxPages" name="maxPages" min="1" step="1" value="20" inputmode="numeric" required
      class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
  </div>
  <div>
    <label for="timeout" class="block text-sm font-medium text-gray-700 mb-1">
      <i class="fas fa-clock mr-1 text-blue-500" aria-hidden="true"></i> Timeout (seconds)
    </label>
    <input type="number" id="timeout" name="timeout" min="5" step="1" value="30" inputmode="numeric" required
      class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
  </div>
</div>
<!-- Proxy and user-agent choices. The script reads both values and passes them along in the crawl options. -->
<div class="grid grid-cols-1 md:grid-cols-2 gap-4">
  <div>
    <label for="proxyConfig" class="block text-sm font-medium text-gray-700 mb-1">
      <i class="fas fa-globe mr-1 text-blue-500" aria-hidden="true"></i> Proxy Configuration
    </label>
    <select id="proxyConfig" name="proxyConfig"
      class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
      <option value="none">No proxy</option>
      <option value="residential">Residential proxies</option>
      <option value="datacenter">Datacenter proxies</option>
      <option value="tor">TOR network</option>
    </select>
  </div>
  <div>
    <label for="userAgent" class="block text-sm font-medium text-gray-700 mb-1">
      <i class="fas fa-user-secret mr-1 text-blue-500" aria-hidden="true"></i> User Agent
    </label>
    <select id="userAgent" name="userAgent"
      class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
      <option value="default">Default (Chrome latest)</option>
      <option value="mobile">Mobile (iPhone)</option>
      <option value="tablet">Tablet (iPad)</option>
      <option value="legacy">Legacy (IE 11)</option>
    </select>
  </div>
</div>
<!-- Extraction toggles. role="group" plus a label announces the checkboxes as one related set
     without changing the flex layout. -->
<div class="flex flex-wrap gap-4" role="group" aria-label="Data to extract">
  <div class="flex items-center">
    <input type="checkbox" id="extractNames" name="extractNames" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
    <label for="extractNames" class="ml-2 block text-sm text-gray-700">
      Extract Names
    </label>
  </div>
  <div class="flex items-center">
    <input type="checkbox" id="extractPhones" name="extractPhones" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
    <label for="extractPhones" class="ml-2 block text-sm text-gray-700">
      Extract Phone Numbers
    </label>
  </div>
  <div class="flex items-center">
    <input type="checkbox" id="extractEmails" name="extractEmails" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
    <label for="extractEmails" class="ml-2 block text-sm text-gray-700">
      Extract Emails
    </label>
  </div>
  <div class="flex items-center">
    <input type="checkbox" id="extractLinks" name="extractLinks" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
    <label for="extractLinks" class="ml-2 block text-sm text-gray-700">
      Extract Links
    </label>
  </div>
  <div class="flex items-center">
    <input type="checkbox" id="extractStructure" name="extractStructure" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
    <label for="extractStructure" class="ml-2 block text-sm text-gray-700">
      Site Structure
    </label>
  </div>
  <div class="flex items-center">
    <input type="checkbox" id="screenshots" name="screenshots" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded">
    <label for="screenshots" class="ml-2 block text-sm text-gray-700">
      Take Screenshots
    </label>
  </div>
</div>
<!-- Submit control; the icon is decorative, the visible text is the button's accessible name. -->
<div class="pt-4">
  <button type="submit" id="searchButton" class="w-full bg-blue-600 hover:bg-blue-700 text-white font-medium py-3 px-4 rounded-lg transition flex items-center justify-center">
    <i class="fas fa-spider mr-2" aria-hidden="true"></i> Start Crawling
  </button>
</div>
| </form> | |
| </div> | |
<!-- Shown while a crawl runs. The script toggles the "hidden" class and updates #currentUrl,
     #progressFill, #currentDepth, #currentMaxPages and #currentTimeout. -->
<div id="loadingIndicator" class="hidden text-center py-8">
  <div class="loading-spinner mb-4" aria-hidden="true"></div>
  <p class="text-gray-600">Crawling websites with Playwright. This may take a few minutes...</p>
  <!-- role="status" is a polite live region: only this short line is re-announced as the URL changes. -->
  <p class="text-sm text-gray-500 mt-2" role="status">Currently processing: <span id="currentUrl" class="font-medium">example.com</span></p>
  <div class="progress-bar mt-4 max-w-md mx-auto">
    <div id="progressFill" class="progress-fill"></div>
  </div>
  <div class="mt-4 max-w-md mx-auto bg-blue-50 p-3 rounded-lg">
    <p class="text-blue-800 text-sm">
      <i class="fas fa-info-circle mr-2" aria-hidden="true"></i>
      Crawling with depth <span id="currentDepth">2</span>, max <span id="currentMaxPages">20</span> pages,
      timeout <span id="currentTimeout">30</span>s
    </p>
  </div>
</div>
<!-- Results panel: the script fills #resultCount, #statsContainer and #resultsAccordion after a crawl. -->
<div id="resultsContainer" class="hidden">
  <div class="flex justify-between items-center mb-6">
    <h2 class="text-2xl font-bold text-gray-800">
      <i class="fas fa-database mr-2 text-blue-500" aria-hidden="true"></i> Scraping Results
    </h2>
    <div class="flex items-center space-x-4">
      <div class="bg-blue-100 text-blue-800 px-3 py-1 rounded-full text-sm font-medium">
        Pages: <span id="resultCount">0</span>
      </div>
      <!-- type="button" keeps these as plain actions even if they are ever moved inside a form. -->
      <button type="button" id="exportBtn" class="bg-green-100 text-green-800 px-3 py-1 rounded-full text-sm font-medium hover:bg-green-200 transition flex items-center">
        <i class="fas fa-file-export mr-1" aria-hidden="true"></i> Export JSON
      </button>
      <button type="button" id="exportCSV" class="bg-purple-100 text-purple-800 px-3 py-1 rounded-full text-sm font-medium hover:bg-purple-200 transition flex items-center">
        <i class="fas fa-file-csv mr-1" aria-hidden="true"></i> Export CSV
      </button>
    </div>
  </div>
  <div id="statsContainer" class="stats-grid mb-6">
    <!-- Stats will be inserted here -->
  </div>
  <div id="resultsAccordion" class="space-y-6">
    <!-- Results will be inserted here by JavaScript -->
  </div>
</div>
<!-- Empty state, revealed by the script when a crawl returns no pages. The warning icon is decorative. -->
<div id="noResults" class="hidden text-center py-12">
  <div class="mx-auto w-24 h-24 bg-gray-100 rounded-full flex items-center justify-center mb-4">
    <i class="fas fa-exclamation-triangle text-3xl text-yellow-500" aria-hidden="true"></i>
  </div>
  <h3 class="text-xl font-medium text-gray-700 mb-2">No Results Found</h3>
  <p class="text-gray-500 max-w-md mx-auto">
    The crawler didn't find any matching data from the provided URLs.
    Try adjusting your search parameters.
  </p>
</div>
| </div> | |
| <script> | |
| document.addEventListener('DOMContentLoaded', function() { | |
// Look up once, by id, every element the script reads or updates.
const byId = (id) => document.getElementById(id);

// Form and top-level panels.
const crawlerForm = byId('crawlerForm');
const loadingIndicator = byId('loadingIndicator');
const resultsContainer = byId('resultsContainer');
const noResults = byId('noResults');

// Results area.
const resultsAccordion = byId('resultsAccordion');
const resultCount = byId('resultCount');

// Live progress read-outs inside the loading indicator.
const currentUrl = byId('currentUrl');
const currentDepth = byId('currentDepth');
const currentTimeout = byId('currentTimeout');
const currentMaxPages = byId('currentMaxPages');
const progressFill = byId('progressFill');

// Export buttons and the statistics grid.
const exportBtn = byId('exportBtn');
const exportCSV = byId('exportCSV');
const statsContainer = byId('statsContainer');
| // Regular expressions for data extraction | |
| const patterns = { | |
| email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, | |
| phone: /(\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b/g, | |
| name: /\b(?:Mr\.?|Mrs\.?|Ms\.?|Dr\.?)?\s?[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b/g, | |
| social: /(facebook\.com|twitter\.com|linkedin\.com|instagram\.com)\/[^\s"'>]+/g | |
| }; | |
// Simulate Crawlee + Playwright crawling with realistic data
/**
 * Simulates crawling each start URL to the requested depth and returns one
 * result object per simulated page.
 *
 * In a real implementation, this would call your backend API
 * which would use Crawlee with Playwright for headless browsing.
 *
 * Side effects: updates the `currentUrl` text and the `progressFill` width while running.
 *
 * @param {string[]} urls     Start URLs; entries that cannot be parsed are logged and skipped.
 * @param {number}   depth    Number of simulated pages generated per start URL.
 * @param {number}   maxPages Upper bound on pages across all URLs (non-finite means no bound).
 * @param {number}   timeout  Accepted for interface compatibility; not used by the simulation.
 * @param {Object}   options  Extraction flags passed to extractDataFromContent; `screenshots`
 *                            additionally enables a placeholder SVG per page.
 * @returns {Promise<Array<{url: string, title: string, depth: number, data: Object, screenshot: (string|null)}>>}
 */
async function simulateCrawleeCrawl(urls, depth, maxPages, timeout, options) {
  // Simulate realistic crawl delays (0.5s - 2s per page).
  const simulateDelay = () => new Promise(resolve =>
    setTimeout(resolve, 500 + Math.random() * 1500));

  const results = [];
  let processed = 0;

  // A NaN limit (e.g. an empty input) would make every comparison false and turn the
  // progress width into "NaN%", so treat a non-finite limit as "no limit".
  const pageLimit = Number.isFinite(maxPages) ? Math.max(0, maxPages) : Infinity;
  const totalToProcess = Math.min(urls.length * depth, pageLimit);
  const progressPercent = () =>
    (totalToProcess > 0 ? Math.min(100, (processed / totalToProcess) * 100) : 0);

  // Mock data generators keyed by registrable domain. Only generators that exist in
  // this file are listed: the table used to reference generators for github.com,
  // amazon.com and nytimes.com that were never defined, which threw a ReferenceError
  // on every crawl. Those hosts now use the generic generator until dedicated ones exist.
  const mockDataGenerators = {
    "example.com": generateExampleData,
    "wikipedia.org": generateWikipediaData
  };

  // Match the exact host or any subdomain of a known key (en.wikipedia.org -> wikipedia.org).
  const pickGenerator = (domain) => {
    const key = Object.keys(mockDataGenerators)
      .find(known => domain === known || domain.endsWith(`.${known}`));
    return key ? mockDataGenerators[key] : generateGenericData;
  };

  for (const rawUrl of urls) {
    const url = String(rawUrl).trim();
    try {
      // Strip only a leading "www."; a plain string replace would hit the first occurrence anywhere.
      const domain = new URL(url).hostname.replace(/^www\./, '');
      const generator = pickGenerator(domain);

      // Simulate crawling at different depths
      for (let d = 0; d < depth; d++) {
        if (processed >= pageLimit) break;

        // Update UI with current URL
        currentUrl.textContent = `${domain} (depth ${d+1})`;
        progressFill.style.width = `${progressPercent()}%`;
        await simulateDelay();

        // Generate mock data for this page
        const pageData = generator(url, d);
        const extractedData = extractDataFromContent(pageData.content, options);

        let screenshot = null;
        if (options.screenshots) {
          // padStart keeps the colour at six hex digits; shorter values such as
          // "#abcde" are not valid colours.
          const fill = Math.floor(Math.random() * 0x1000000).toString(16).padStart(6, '0');
          const svg = `<svg xmlns="http://www.w3.org/2000/svg" width="800" height="600" viewBox="0 0 800 600"><rect width="800" height="600" fill="#${fill}"/><text x="50%" y="50%" font-family="Arial" font-size="24" fill="white" text-anchor="middle">${domain} Screenshot</text></svg>`;
          screenshot = `data:image/svg+xml;base64,${btoa(svg)}`;
        }

        // Use "&" when the start URL already has a query string.
        const depthSuffix = d > 0 ? `${url.includes('?') ? '&' : '?'}depth=${d}` : '';

        results.push({
          url: url + depthSuffix,
          title: pageData.title,
          depth: d,
          data: extractedData,
          screenshot
        });
        processed++;
      }
    } catch (error) {
      // Best effort: a failing URL is logged and the crawl continues with the next one.
      console.error(`Error processing ${url}:`, error);
    }
    if (processed >= pageLimit) break;
  }

  progressFill.style.width = '100%';
  return results;
}

/**
 * Fallback mock page for hosts without a dedicated generator.
 *
 * @param {string} url   Page URL; its host name is used in the generated content.
 * @param {number} depth Zero-based crawl depth, used to vary the page title.
 * @returns {{title: string, content: string}} Mock page title and HTML content.
 */
function generateGenericData(url, depth) {
  let host = 'unknown-site';
  let origin = '';
  try {
    const parsed = new URL(url);
    host = parsed.hostname.replace(/^www\./, '');
    origin = parsed.origin;
  } catch {
    // Keep the placeholders when the URL cannot be parsed.
  }
  const sections = ["Home", "About", "Products", "Contact", "Blog"];
  const title = `${sections[depth % sections.length]} - ${host}`;
  const content = `
    <h1>${title}</h1>
    <p>Welcome to ${host}. Contact us at contact@${host} or call +1 (555) 010-0100</p>
    <h2>Overview</h2>
    <p>This is simulated content for page ${depth + 1} of ${host}.</p>
    <ul>
      <li><a href="${origin}/about">About</a></li>
      <li><a href="${origin}/contact">Contact</a></li>
    </ul>
  `;
  return {
    title: title,
    content: content
  };
}
// Data generators for different types of sites
/**
 * Builds the mock page served for example.com at a given crawl depth.
 *
 * @param {string} url   Page URL (not used by this generator).
 * @param {number} depth Zero-based crawl depth; selects the title and how many team members are listed.
 * @returns {{title: string, content: string}} Mock page title and HTML content.
 */
function generateExampleData(url, depth) {
  const pageTitles = [
    "Example Domain",
    "About Our Company",
    "Our Services",
    "Contact Information",
    "Customer Testimonials"
  ];
  const staff = [
    "John Smith (CEO)",
    "Sarah Johnson (CTO)",
    "Michael Brown (CFO)",
    "Emily Davis (Marketing Director)",
    "Robert Wilson (Sales Manager)"
  ];

  // The title cycles through the list; the team list grows by one person per depth level.
  const heading = pageTitles[depth % pageTitles.length];
  const teamList = staff.slice(0, depth + 2).join(', ');
  const year = new Date().getFullYear();

  const markup = `
    <h1>${heading}</h1>
    <p>Welcome to our website. Contact us at info@example.com or call +1 (555) 123-4567</p>
    <p>Our team includes ${teamList}.</p>
    <h2>Our Services</h2>
    <p>We offer great services. Email sales@example.com for inquiries.</p>
    <h3>Departments</h3>
    <ul>
    <li><a href="/about">About Us</a></li>
    <li><a href="/contact">Contact</a></li>
    <li><a href="https://facebook.com/examplecompany">Facebook</a></li>
    <li><a href="https://twitter.com/example">Twitter</a></li>
    </ul>
    <div class="footer">
    <p>Copyright © ${year} Example Inc. All rights reserved.</p>
    <p>Address: 123 Main St, Anytown, USA</p>
    </div>
  `;

  return { title: heading, content: markup };
}
/**
 * Builds the mock Wikipedia-style page for a given crawl depth.
 *
 * @param {string} url   Page URL (not used by this generator).
 * @param {number} depth Zero-based crawl depth; selects the title and the featured scientist.
 * @returns {{title: string, content: string}} Mock page title and HTML content.
 */
function generateWikipediaData(url, depth) {
  const titles = [
    "Wikipedia, the free encyclopedia",
    "Artificial Intelligence - Wikipedia",
    "Machine Learning",
    "History of Computing",
    "Notable Computer Scientists"
  ];
  // Each person is stored with their nationality. The two values used to live in
  // separate arrays of different lengths indexed with different moduli, which
  // paired people with the wrong nationality.
  const people = [
    { name: "Alan Turing", nationality: "British" },
    { name: "Grace Hopper", nationality: "American" },
    { name: "Tim Berners-Lee", nationality: "English" },
    { name: "Ada Lovelace", nationality: "English" },
    { name: "John von Neumann", nationality: "Hungarian-American" }
  ];
  const title = titles[depth % titles.length];
  const person = people[depth % people.length];
  const content = `
    <h1 id="firstHeading">${title}</h1>
    <div class="infobox">
    <p>This article is about ${title.split(' - ')[0]}</p>
    </div>
    <p>${person.name} (${person.nationality} scientist)</p>
    <h2>Contents</h2>
    <ul>
    <li><a href="#History">History</a></li>
    <li><a href="#Applications">Applications</a></li>
    <li><a href="#References">References</a></li>
    </ul>
    <h2 id="History">History</h2>
    <p>The field was founded on the assumption that human intelligence can be described precisely enough to be simulated by a machine.</p>
    <h2 id="References">References</h2>
    <ol>
    <li>Author, A. (2020). "Title". Journal. 10(2): 123–145.</li>
    <li>Researcher, B. (2019). "New discoveries". Science.</li>
    </ol>
    <div class="footer">
    <p>This page was last edited on ${new Date().toLocaleDateString()}</p>
    <p>Contact: en.wikipedia.org@wikimedia.org</p>
    </div>
  `;
  return {
    title: title,
    content: content
  };
}
| // More data generators would be here in a real implementation... | |
// Enhanced data extraction
/**
 * Extracts the requested kinds of data from a page's HTML using the shared `patterns` table.
 *
 * @param {string} content HTML source of the page; anything that is not a string is treated as empty.
 * @param {Object} options Flags: extractEmails, extractPhones, extractNames, extractLinks
 *                         (also enables social links) and extractStructure.
 * @returns {{emails: string[], phones: string[], names: string[], links: string[], social: string[], structure: string}}
 *          De-duplicated matches in first-seen order; `structure` is a newline-separated heading outline.
 */
function extractDataFromContent(content, options) {
  const data = {
    emails: [],
    phones: [],
    names: [],
    links: [],
    social: [],
    structure: ""
  };
  const html = typeof content === 'string' ? content : '';
  const flags = options || {};

  // Trim before de-duplicating so " John Smith" and "John Smith" count as one entry.
  const uniqueMatches = (regex) =>
    [...new Set((html.match(regex) || []).map(m => m.trim()).filter(Boolean))];

  // Extract emails
  if (flags.extractEmails) {
    data.emails = uniqueMatches(patterns.email);
  }
  // Extract phone numbers
  if (flags.extractPhones) {
    data.phones = uniqueMatches(patterns.phone);
  }
  // Extract names
  if (flags.extractNames) {
    data.names = uniqueMatches(patterns.name);
  }
  // Extract links
  if (flags.extractLinks) {
    data.links = uniqueMatches(/https?:\/\/[^\s"'<>]+/g);
    // Extract social media links
    data.social = uniqueMatches(patterns.social);
  }
  // Generate site structure
  if (flags.extractStructure) {
    // \1 requires the closing tag to match the opening level; [\s\S] lets a heading span lines.
    const headings = html.match(/<h([1-6])\b[^>]*>[\s\S]*?<\/h\1>/gi) || [];
    data.structure = headings.map(h => {
      const level = Number(h.match(/<h([1-6])/i)[1]);
      // Drop nested tags and collapse whitespace so each heading stays on one outline line.
      const text = h.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
      return ' '.repeat(level - 1) + `├─ H${level}: ${text}`;
    }).join('\n');
  }
  return data;
}
/**
 * Escapes a value for safe interpolation into HTML text or a quoted attribute value.
 * Extracted data comes from crawled pages and must never be inserted as raw markup.
 */
function escapeHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, ch => entities[ch]);
}

/** Builds one "data-card" holding a heading and a row of chips; returns '' when there are no items. */
function renderChipCard(icon, heading, items, renderChip) {
  if (!items || items.length === 0) return '';
  return `
    <div class="data-card">
      <h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
        <i class="fas fa-${icon} mr-2 text-blue-500"></i> ${heading} (${items.length})
      </h4>
      <div class="flex flex-wrap gap-2">
        ${items.map(renderChip).join('')}
      </div>
    </div>
  `;
}

/** Builds the chip for one social link, adding a scheme so the href is absolute. */
function renderSocialChip(link) {
  const brand = ['facebook', 'twitter', 'linkedin', 'instagram'].find(b => link.includes(b));
  const iconClass = brand ? `fab fa-${brand}` : 'fas fa-link';
  // Matches from patterns.social start at the host name; without a scheme the
  // browser would resolve the href relative to this page.
  const href = /^https?:\/\//i.test(link) ? link : `https://${link}`;
  return `
    <span class="data-highlight link-chip">
      <i class="${iconClass} mr-1"></i>
      <a href="${escapeHtml(href)}" target="_blank" rel="noopener noreferrer" class="hover:underline">${escapeHtml(link)}</a>
    </span>
  `;
}

/**
 * Renders crawl results: the statistics grid plus one card per crawled page.
 * Shows the "no results" panel instead when the list is empty or missing.
 *
 * @param {Array<{url: string, title: string, depth: number, data: Object, screenshot: (string|null)}>} results
 *        Result objects as produced by simulateCrawleeCrawl.
 */
function displayResults(results) {
  resultsAccordion.innerHTML = '';
  statsContainer.innerHTML = '';

  if (!results || results.length === 0) {
    noResults.classList.remove('hidden');
    resultsContainer.classList.add('hidden');
    return;
  }

  resultCount.textContent = results.length;

  // Calculate statistics
  const sumOf = (pick) => results.reduce((sum, r) => sum + pick(r), 0);
  const stats = [
    [results.length, 'Pages Crawled'],
    [sumOf(r => r.data.emails.length), 'Emails Found'],
    [sumOf(r => r.data.phones.length), 'Phone Numbers'],
    [sumOf(r => r.data.names.length), 'Names Found'],
    [sumOf(r => r.data.links.length), 'Links Found'],
    [(sumOf(r => r.depth) / results.length).toFixed(1), 'Avg Depth']
  ];

  // Display stats (values are numbers and fixed labels, so no escaping is needed)
  statsContainer.innerHTML = stats.map(([value, label]) => `
    <div class="stat-card">
      <div class="stat-value">${value}</div>
      <div class="stat-label">${label}</div>
    </div>
  `).join('');

  // Display detailed results
  results.forEach((result) => {
    const { emails, phones, names, social, structure } = result.data;
    const resultElement = document.createElement('div');
    resultElement.className = 'bg-white rounded-lg shadow overflow-hidden';

    const sections = [];

    // Display screenshot if available
    if (result.screenshot) {
      sections.push(`
        <div class="data-card">
          <h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
            <i class="fas fa-camera mr-2 text-blue-500"></i> Page Screenshot
          </h4>
          <img src="${escapeHtml(result.screenshot)}" alt="Page screenshot" class="border border-gray-200 rounded">
        </div>
      `);
    }

    // Each chip card renders only when its list is non-empty.
    sections.push(renderChipCard('envelope', 'Email Addresses', emails, email => `
      <span class="data-highlight email-chip">
        <i class="fas fa-envelope mr-1"></i>${escapeHtml(email)}
      </span>
    `));
    sections.push(renderChipCard('phone', 'Phone Numbers', phones, phone => `
      <span class="data-highlight phone-chip">
        <i class="fas fa-phone mr-1"></i>${escapeHtml(phone)}
      </span>
    `));
    sections.push(renderChipCard('user', 'Names', names, name => `
      <span class="data-highlight name-chip">
        <i class="fas fa-user mr-1"></i>${escapeHtml(name)}
      </span>
    `));
    sections.push(renderChipCard('share-alt', 'Social Links', social, renderSocialChip));

    // Display site structure if available
    if (structure) {
      sections.push(`
        <div class="data-card">
          <h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
            <i class="fas fa-sitemap mr-2 text-blue-500"></i> Page Structure
          </h4>
          <div class="site-tree">${escapeHtml(structure)}</div>
        </div>
      `);
    }

    resultElement.innerHTML = `
      <div class="p-4 border-b border-gray-200 bg-gray-50">
        <div class="flex justify-between items-center">
          <h3 class="font-medium text-gray-800 truncate flex items-center">
            <span class="mr-2 text-sm bg-blue-100 text-blue-800 px-2 py-0.5 rounded-full">D${escapeHtml(result.depth)}</span>
            ${escapeHtml(result.title || result.url)}
          </h3>
          <div class="flex space-x-2">
            <span class="text-xs bg-blue-100 text-blue-800 px-2 py-1 rounded-full">
              ${emails.length + phones.length + names.length} items
            </span>
          </div>
        </div>
      </div>
      <div class="p-4 space-y-6">
        ${sections.join('')}
      </div>
    `;
    resultsAccordion.appendChild(resultElement);
  });

  resultsContainer.classList.remove('hidden');
  noResults.classList.add('hidden');
}
// Form submission: read and validate the inputs, run the simulated crawl, render the results.
crawlerForm.addEventListener('submit', async function(e) {
  e.preventDefault();

  const submitButton = document.getElementById('searchButton');
  // Ignore a second submission while a crawl is still running.
  if (submitButton && submitButton.disabled) return;

  // One URL per line; trim each entry so stray spaces or "\r" never end up in the results.
  const urls = document.getElementById('urls').value
    .split(/\r?\n/)
    .map(url => url.trim())
    .filter(Boolean);
  const depth = parseInt(document.getElementById('depth').value, 10);
  const maxPages = parseInt(document.getElementById('maxPages').value, 10);
  const timeout = parseInt(document.getElementById('timeout').value, 10);
  const proxyConfig = document.getElementById('proxyConfig').value;
  const userAgent = document.getElementById('userAgent').value;
  const extractNames = document.getElementById('extractNames').checked;
  const extractPhones = document.getElementById('extractPhones').checked;
  const extractEmails = document.getElementById('extractEmails').checked;
  const extractLinks = document.getElementById('extractLinks').checked;
  const extractStructure = document.getElementById('extractStructure').checked;
  const screenshots = document.getElementById('screenshots').checked;

  if (urls.length === 0) {
    alert('Please enter at least one URL');
    return;
  }

  // Validate URLs: must parse and use http(s). new URL() alone also accepts
  // schemes such as "javascript:" or "ftp:", which a crawler cannot fetch.
  const invalidUrls = urls.filter(url => {
    try {
      const { protocol } = new URL(url);
      return protocol !== 'http:' && protocol !== 'https:';
    } catch {
      return true;
    }
  });
  if (invalidUrls.length > 0) {
    alert(`Invalid URLs detected:\n${invalidUrls.join('\n')}`);
    return;
  }

  // parseInt yields NaN for an empty or non-numeric field; stop before it reaches the crawler.
  if (![depth, maxPages, timeout].every(n => Number.isInteger(n) && n > 0)) {
    alert('Crawl depth, max pages and timeout must be positive whole numbers.');
    return;
  }

  // Show loading state
  loadingIndicator.classList.remove('hidden');
  resultsContainer.classList.add('hidden');
  noResults.classList.add('hidden');
  currentDepth.textContent = depth;
  currentTimeout.textContent = timeout;
  currentMaxPages.textContent = maxPages;
  progressFill.style.width = '0%';
  if (submitButton) submitButton.disabled = true;

  try {
    // In a real implementation, this would call your backend API
    // which would use Crawlee with Playwright for headless browsing
    const results = await simulateCrawleeCrawl(urls, depth, maxPages, timeout, {
      extractNames,
      extractPhones,
      extractEmails,
      extractLinks,
      extractStructure,
      screenshots,
      proxyConfig,
      userAgent
    });
    displayResults(results);
  } catch (error) {
    console.error('Error:', error);
    alert('An error occurred during crawling. Check console for details.');
  } finally {
    // Always restore the idle state, whether the crawl succeeded or failed.
    loadingIndicator.classList.add('hidden');
    if (submitButton) submitButton.disabled = false;
  }
});
// Export buttons are placeholders: each click only shows a notice.
// A real implementation would call your backend to generate a downloadable file.
const exportNotices = [
  [exportBtn, 'In a real implementation, this would export the extracted data as JSON'],
  [exportCSV, 'In a real implementation, this would export the extracted data as CSV']
];
for (const [button, notice] of exportNotices) {
  button.addEventListener('click', () => {
    alert(notice);
  });
}
| }); | |
| </script> | |
<!-- "Made with DeepSite" badge. rel="noopener noreferrer" stops the pages opened in a new tab
     from reaching back to this window through window.opener. -->
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" width="16" height="16" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=MarcRyan/web-crawler" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">Remix</a></p></body>
| </html> |