<!--
web-crawler / index.html
MarcRyan's picture
Add 2 files
2be27f7 verified
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Advanced Web Scraper with Crawlee & Playwright</title>
<script src="https://cdn.tailwindcss.com"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
.loading-spinner {
border: 4px solid rgba(0, 0, 0, 0.1);
border-radius: 50%;
border-top: 4px solid #3b82f6;
width: 30px;
height: 30px;
animation: spin 1s linear infinite;
margin: 0 auto;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.data-card {
background-color: #f8fafc;
border-radius: 0.5rem;
padding: 1rem;
margin-bottom: 1rem;
border: 1px solid #e2e8f0;
}
.data-highlight {
background-color: #e0f7fa;
padding: 0.2rem 0.4rem;
border-radius: 0.25rem;
font-family: monospace;
display: inline-block;
margin: 0.1rem;
word-break: break-all;
}
.email-chip {
background-color: #e3f2fd;
color: #0d47a1;
}
.phone-chip {
background-color: #e8f5e9;
color: #1b5e20;
}
.name-chip {
background-color: #f3e5f5;
color: #4a148c;
}
.link-chip {
background-color: #fff3e0;
color: #e65100;
}
.site-tree {
font-family: monospace;
white-space: pre;
overflow-x: auto;
background-color: #f5f5f5;
padding: 1rem;
border-radius: 0.25rem;
border: 1px solid #e2e8f0;
}
.progress-bar {
height: 6px;
background-color: #e2e8f0;
border-radius: 3px;
overflow: hidden;
margin-top: 1rem;
}
.progress-fill {
height: 100%;
background-color: #3b82f6;
width: 0%;
transition: width 0.3s ease;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(120px, 1fr));
gap: 1rem;
margin-bottom: 1.5rem;
}
.stat-card {
background-color: white;
border-radius: 0.5rem;
padding: 1rem;
text-align: center;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
.stat-value {
font-size: 1.5rem;
font-weight: bold;
color: #3b82f6;
}
.stat-label {
font-size: 0.75rem;
color: #64748b;
text-transform: uppercase;
letter-spacing: 0.05em;
}
</style>
</head>
<body class="bg-gray-50 min-h-screen">
<div class="container mx-auto px-4 py-8">
<header class="text-center mb-12">
<h1 class="text-4xl font-bold text-blue-600 mb-2">
<i class="fas fa-spider mr-2"></i> Advanced Web Scraper
</h1>
<p class="text-gray-600 max-w-2xl mx-auto">
Professional web scraping with Crawlee & Playwright simulation
</p>
</header>
<div class="bg-white rounded-xl shadow-lg p-6 max-w-4xl mx-auto mb-12">
<form id="crawlerForm" class="space-y-6">
<div>
<label for="urls" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-link mr-1 text-blue-500"></i> Start URLs (one per line)
</label>
<textarea id="urls" name="urls" rows="5"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition font-mono"
placeholder="https://example.com&#10;https://another-site.com" required>https://example.com</textarea>
</div>
<div class="grid grid-cols-1 md:grid-cols-3 gap-4">
<div>
<label for="depth" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-layer-group mr-1 text-blue-500"></i> Crawl Depth
</label>
<select id="depth" name="depth"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
<option value="1">1 (Single page only)</option>
<option value="2" selected>2 (Page + 1 level deep)</option>
<option value="3">3 (Page + 2 levels deep)</option>
<option value="5">5 (Deep crawl)</option>
</select>
</div>
<div>
<label for="maxPages" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-file-alt mr-1 text-blue-500"></i> Max Pages to Crawl
</label>
<input type="number" id="maxPages" name="maxPages" min="1" value="20"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
</div>
<div>
<label for="timeout" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-clock mr-1 text-blue-500"></i> Timeout (seconds)
</label>
<input type="number" id="timeout" name="timeout" min="5" value="30"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
</div>
</div>
<div class="grid grid-cols-1 md:grid-cols-2 gap-4">
<div>
<label for="proxyConfig" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-globe mr-1 text-blue-500"></i> Proxy Configuration
</label>
<select id="proxyConfig" name="proxyConfig"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
<option value="none">No proxy</option>
<option value="residential">Residential proxies</option>
<option value="datacenter">Datacenter proxies</option>
<option value="tor">TOR network</option>
</select>
</div>
<div>
<label for="userAgent" class="block text-sm font-medium text-gray-700 mb-1">
<i class="fas fa-user-secret mr-1 text-blue-500"></i> User Agent
</label>
<select id="userAgent" name="userAgent"
class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-blue-500 transition">
<option value="default">Default (Chrome latest)</option>
<option value="mobile">Mobile (iPhone)</option>
<option value="tablet">Tablet (iPad)</option>
<option value="legacy">Legacy (IE 11)</option>
</select>
</div>
</div>
<div class="flex flex-wrap gap-4">
<div class="flex items-center">
<input type="checkbox" id="extractNames" name="extractNames" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
<label for="extractNames" class="ml-2 block text-sm text-gray-700">
Extract Names
</label>
</div>
<div class="flex items-center">
<input type="checkbox" id="extractPhones" name="extractPhones" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
<label for="extractPhones" class="ml-2 block text-sm text-gray-700">
Extract Phone Numbers
</label>
</div>
<div class="flex items-center">
<input type="checkbox" id="extractEmails" name="extractEmails" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
<label for="extractEmails" class="ml-2 block text-sm text-gray-700">
Extract Emails
</label>
</div>
<div class="flex items-center">
<input type="checkbox" id="extractLinks" name="extractLinks" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
<label for="extractLinks" class="ml-2 block text-sm text-gray-700">
Extract Links
</label>
</div>
<div class="flex items-center">
<input type="checkbox" id="extractStructure" name="extractStructure" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" checked>
<label for="extractStructure" class="ml-2 block text-sm text-gray-700">
Site Structure
</label>
</div>
<div class="flex items-center">
<input type="checkbox" id="screenshots" name="screenshots" class="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded">
<label for="screenshots" class="ml-2 block text-sm text-gray-700">
Take Screenshots
</label>
</div>
</div>
<div class="pt-4">
<button type="submit" id="searchButton" class="w-full bg-blue-600 hover:bg-blue-700 text-white font-medium py-3 px-4 rounded-lg transition flex items-center justify-center">
<i class="fas fa-spider mr-2"></i> Start Crawling
</button>
</div>
</form>
</div>
<div id="loadingIndicator" class="hidden text-center py-8">
<div class="loading-spinner mb-4"></div>
<p class="text-gray-600">Crawling websites with Playwright. This may take a few minutes...</p>
<p class="text-sm text-gray-500 mt-2">Currently processing: <span id="currentUrl" class="font-medium">example.com</span></p>
<div class="progress-bar mt-4 max-w-md mx-auto">
<div id="progressFill" class="progress-fill"></div>
</div>
<div class="mt-4 max-w-md mx-auto bg-blue-50 p-3 rounded-lg">
<p class="text-blue-800 text-sm">
<i class="fas fa-info-circle mr-2"></i>
Crawling with depth <span id="currentDepth">2</span>, max <span id="currentMaxPages">20</span> pages,
timeout <span id="currentTimeout">30</span>s
</p>
</div>
</div>
<div id="resultsContainer" class="hidden">
<div class="flex justify-between items-center mb-6">
<h2 class="text-2xl font-bold text-gray-800">
<i class="fas fa-database mr-2 text-blue-500"></i> Scraping Results
</h2>
<div class="flex items-center space-x-4">
<div class="bg-blue-100 text-blue-800 px-3 py-1 rounded-full text-sm font-medium">
Pages: <span id="resultCount">0</span>
</div>
<button id="exportBtn" class="bg-green-100 text-green-800 px-3 py-1 rounded-full text-sm font-medium hover:bg-green-200 transition flex items-center">
<i class="fas fa-file-export mr-1"></i> Export JSON
</button>
<button id="exportCSV" class="bg-purple-100 text-purple-800 px-3 py-1 rounded-full text-sm font-medium hover:bg-purple-200 transition flex items-center">
<i class="fas fa-file-csv mr-1"></i> Export CSV
</button>
</div>
</div>
<div id="statsContainer" class="stats-grid mb-6">
<!-- Stats will be inserted here -->
</div>
<div id="resultsAccordion" class="space-y-6">
<!-- Results will be inserted here by JavaScript -->
</div>
</div>
<div id="noResults" class="hidden text-center py-12">
<div class="mx-auto w-24 h-24 bg-gray-100 rounded-full flex items-center justify-center mb-4">
<i class="fas fa-exclamation-triangle text-3xl text-yellow-500"></i>
</div>
<h3 class="text-xl font-medium text-gray-700 mb-2">No Results Found</h3>
<p class="text-gray-500 max-w-md mx-auto">
The crawler didn't find any matching data from the provided URLs.
Try adjusting your search parameters.
</p>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
const crawlerForm = document.getElementById('crawlerForm');
const loadingIndicator = document.getElementById('loadingIndicator');
const resultsContainer = document.getElementById('resultsContainer');
const noResults = document.getElementById('noResults');
const resultsAccordion = document.getElementById('resultsAccordion');
const resultCount = document.getElementById('resultCount');
const currentUrl = document.getElementById('currentUrl');
const currentDepth = document.getElementById('currentDepth');
const currentTimeout = document.getElementById('currentTimeout');
const currentMaxPages = document.getElementById('currentMaxPages');
const progressFill = document.getElementById('progressFill');
const exportBtn = document.getElementById('exportBtn');
const exportCSV = document.getElementById('exportCSV');
const statsContainer = document.getElementById('statsContainer');
// Regular expressions for data extraction
const patterns = {
email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g,
phone: /(\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}\b/g,
name: /\b(?:Mr\.?|Mrs\.?|Ms\.?|Dr\.?)?\s?[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b/g,
social: /(facebook\.com|twitter\.com|linkedin\.com|instagram\.com)\/[^\s"'>]+/g
};
// Simulate Crawlee + Playwright crawling with realistic data
async function simulateCrawleeCrawl(urls, depth, maxPages, timeout, options) {
// In a real implementation, this would call your backend API
// which would use Crawlee with Playwright for headless browsing
// Simulate realistic crawl delays
const simulateDelay = () => new Promise(resolve =>
setTimeout(resolve, 500 + Math.random() * 1500));
const results = [];
let processed = 0;
const totalToProcess = Math.min(urls.length * depth, maxPages);
// Generate realistic mock data for different domains
const mockDataGenerators = {
"example.com": generateExampleData,
"wikipedia.org": generateWikipediaData,
"github.com": generateGithubData,
"amazon.com": generateAmazonData,
"nytimes.com": generateNewsData
};
for (const url of urls) {
try {
const domain = new URL(url).hostname.replace('www.', '');
const generator = mockDataGenerators[domain] || generateGenericData;
// Simulate crawling at different depths
for (let d = 0; d < depth; d++) {
if (processed >= maxPages) break;
// Update UI with current URL
currentUrl.textContent = `${domain} (depth ${d+1})`;
progressFill.style.width = `${(processed / totalToProcess) * 100}%`;
await simulateDelay();
// Generate mock data for this page
const pageData = generator(url, d);
const extractedData = extractDataFromContent(pageData.content, options);
const result = {
url: url + (d > 0 ? `?depth=${d}` : ''),
title: pageData.title,
depth: d,
data: extractedData,
screenshot: options.screenshots ? `data:image/svg+xml;base64,${btoa(`<svg xmlns="http://www.w3.org/2000/svg" width="800" height="600" viewBox="0 0 800 600"><rect width="800" height="600" fill="#${Math.floor(Math.random()*16777215).toString(16)}"/><text x="50%" y="50%" font-family="Arial" font-size="24" fill="white" text-anchor="middle">${domain} Screenshot</text></svg>`)}` : null
};
results.push(result);
processed++;
}
} catch (error) {
console.error(`Error processing ${url}:`, error);
}
if (processed >= maxPages) break;
}
progressFill.style.width = '100%';
return results;
}
// Data generators for different types of sites
function generateExampleData(url, depth) {
const titles = [
"Example Domain",
"About Our Company",
"Our Services",
"Contact Information",
"Customer Testimonials"
];
const people = [
"John Smith (CEO)",
"Sarah Johnson (CTO)",
"Michael Brown (CFO)",
"Emily Davis (Marketing Director)",
"Robert Wilson (Sales Manager)"
];
const content = `
<h1>${titles[depth % titles.length]}</h1>
<p>Welcome to our website. Contact us at info@example.com or call +1 (555) 123-4567</p>
<p>Our team includes ${people.slice(0, depth + 2).join(', ')}.</p>
<h2>Our Services</h2>
<p>We offer great services. Email sales@example.com for inquiries.</p>
<h3>Departments</h3>
<ul>
<li><a href="/about">About Us</a></li>
<li><a href="/contact">Contact</a></li>
<li><a href="https://facebook.com/examplecompany">Facebook</a></li>
<li><a href="https://twitter.com/example">Twitter</a></li>
</ul>
<div class="footer">
<p>Copyright © ${new Date().getFullYear()} Example Inc. All rights reserved.</p>
<p>Address: 123 Main St, Anytown, USA</p>
</div>
`;
return {
title: titles[depth % titles.length],
content: content
};
}
function generateWikipediaData(url, depth) {
const titles = [
"Wikipedia, the free encyclopedia",
"Artificial Intelligence - Wikipedia",
"Machine Learning",
"History of Computing",
"Notable Computer Scientists"
];
const people = [
"Alan Turing",
"Grace Hopper",
"Tim Berners-Lee",
"Ada Lovelace",
"John von Neumann"
];
const content = `
<h1 id="firstHeading">${titles[depth % titles.length]}</h1>
<div class="infobox">
<p>This article is about ${titles[depth % titles.length].split(' - ')[0]}</p>
</div>
<p>${people[depth % people.length]} (${['British', 'American', 'Hungarian', 'English'][depth % 4]} scientist)</p>
<h2>Contents</h2>
<ul>
<li><a href="#History">History</a></li>
<li><a href="#Applications">Applications</a></li>
<li><a href="#References">References</a></li>
</ul>
<h2 id="History">History</h2>
<p>The field was founded on the assumption that human intelligence can be described precisely enough to be simulated by a machine.</p>
<h2 id="References">References</h2>
<ol>
<li>Author, A. (2020). "Title". Journal. 10(2): 123–145.</li>
<li>Researcher, B. (2019). "New discoveries". Science.</li>
</ol>
<div class="footer">
<p>This page was last edited on ${new Date().toLocaleDateString()}</p>
<p>Contact: en.wikipedia.org@wikimedia.org</p>
</div>
`;
return {
title: titles[depth % titles.length],
content: content
};
}
// More data generators would be here in a real implementation...
// Enhanced data extraction
function extractDataFromContent(content, options) {
const data = {
emails: [],
phones: [],
names: [],
links: [],
social: [],
structure: ""
};
// Extract emails
if (options.extractEmails) {
const emailMatches = content.match(patterns.email) || [];
data.emails = [...new Set(emailMatches)]; // Remove duplicates
}
// Extract phone numbers
if (options.extractPhones) {
const phoneMatches = content.match(patterns.phone) || [];
data.phones = [...new Set(phoneMatches)];
}
// Extract names
if (options.extractNames) {
const nameMatches = content.match(patterns.name) || [];
data.names = [...new Set(nameMatches)];
}
// Extract links
if (options.extractLinks) {
const linkMatches = content.match(/https?:\/\/[^\s"'<>]+/g) || [];
data.links = [...new Set(linkMatches)];
// Extract social media links
const socialMatches = content.match(patterns.social) || [];
data.social = [...new Set(socialMatches)];
}
// Generate site structure
if (options.extractStructure) {
const headings = content.match(/<h[1-6][^>]*>.*?<\/h[1-6]>/gi) || [];
data.structure = headings.map(h => {
const level = h.match(/<h([1-6])/i)[1];
const text = h.replace(/<[^>]+>/g, '').trim();
return ' '.repeat(level - 1) + `├─ H${level}: ${text}`;
}).join('\n');
}
return data;
}
function displayResults(results) {
resultsAccordion.innerHTML = '';
statsContainer.innerHTML = '';
if (results.length === 0) {
noResults.classList.remove('hidden');
resultsContainer.classList.add('hidden');
return;
}
resultCount.textContent = results.length;
// Calculate statistics
const totalEmails = results.reduce((sum, r) => sum + r.data.emails.length, 0);
const totalPhones = results.reduce((sum, r) => sum + r.data.phones.length, 0);
const totalNames = results.reduce((sum, r) => sum + r.data.names.length, 0);
const totalLinks = results.reduce((sum, r) => sum + r.data.links.length, 0);
const avgDepth = (results.reduce((sum, r) => sum + r.depth, 0) / results.length).toFixed(1);
// Display stats
statsContainer.innerHTML = `
<div class="stat-card">
<div class="stat-value">${results.length}</div>
<div class="stat-label">Pages Crawled</div>
</div>
<div class="stat-card">
<div class="stat-value">${totalEmails}</div>
<div class="stat-label">Emails Found</div>
</div>
<div class="stat-card">
<div class="stat-value">${totalPhones}</div>
<div class="stat-label">Phone Numbers</div>
</div>
<div class="stat-card">
<div class="stat-value">${totalNames}</div>
<div class="stat-label">Names Found</div>
</div>
<div class="stat-card">
<div class="stat-value">${totalLinks}</div>
<div class="stat-label">Links Found</div>
</div>
<div class="stat-card">
<div class="stat-value">${avgDepth}</div>
<div class="stat-label">Avg Depth</div>
</div>
`;
// Display detailed results
results.forEach((result, index) => {
const resultElement = document.createElement('div');
resultElement.className = 'bg-white rounded-lg shadow overflow-hidden';
let contentHTML = `
<div class="p-4 border-b border-gray-200 bg-gray-50">
<div class="flex justify-between items-center">
<h3 class="font-medium text-gray-800 truncate flex items-center">
<span class="mr-2 text-sm bg-blue-100 text-blue-800 px-2 py-0.5 rounded-full">D${result.depth}</span>
${result.title || result.url}
</h3>
<div class="flex space-x-2">
<span class="text-xs bg-blue-100 text-blue-800 px-2 py-1 rounded-full">
${result.data.emails.length + result.data.phones.length + result.data.names.length} items
</span>
</div>
</div>
</div>
<div class="p-4 space-y-6">
`;
// Display screenshot if available
if (result.screenshot) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-camera mr-2 text-blue-500"></i> Page Screenshot
</h4>
<img src="${result.screenshot}" alt="Page screenshot" class="border border-gray-200 rounded">
</div>
`;
}
// Display emails if any
if (result.data.emails.length > 0) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-envelope mr-2 text-blue-500"></i> Email Addresses (${result.data.emails.length})
</h4>
<div class="flex flex-wrap gap-2">
${result.data.emails.map(email => `
<span class="data-highlight email-chip">
<i class="fas fa-envelope mr-1"></i>${email}
</span>
`).join('')}
</div>
</div>
`;
}
// Display phone numbers if any
if (result.data.phones.length > 0) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-phone mr-2 text-blue-500"></i> Phone Numbers (${result.data.phones.length})
</h4>
<div class="flex flex-wrap gap-2">
${result.data.phones.map(phone => `
<span class="data-highlight phone-chip">
<i class="fas fa-phone mr-1"></i>${phone}
</span>
`).join('')}
</div>
</div>
`;
}
// Display names if any
if (result.data.names.length > 0) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-user mr-2 text-blue-500"></i> Names (${result.data.names.length})
</h4>
<div class="flex flex-wrap gap-2">
${result.data.names.map(name => `
<span class="data-highlight name-chip">
<i class="fas fa-user mr-1"></i>${name}
</span>
`).join('')}
</div>
</div>
`;
}
// Display social links if any
if (result.data.social.length > 0) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-share-alt mr-2 text-blue-500"></i> Social Links (${result.data.social.length})
</h4>
<div class="flex flex-wrap gap-2">
${result.data.social.map(link => `
<span class="data-highlight link-chip">
<i class="fab fa-${link.includes('facebook') ? 'facebook' : link.includes('twitter') ? 'twitter' : 'linkedin'} mr-1"></i>
<a href="${link}" target="_blank" class="hover:underline">${link}</a>
</span>
`).join('')}
</div>
</div>
`;
}
// Display site structure if available
if (result.data.structure) {
contentHTML += `
<div class="data-card">
<h4 class="text-lg font-medium text-gray-800 mb-3 flex items-center">
<i class="fas fa-sitemap mr-2 text-blue-500"></i> Page Structure
</h4>
<div class="site-tree">${result.data.structure}</div>
</div>
`;
}
contentHTML += `</div>`;
resultElement.innerHTML = contentHTML;
resultsAccordion.appendChild(resultElement);
});
resultsContainer.classList.remove('hidden');
noResults.classList.add('hidden');
}
crawlerForm.addEventListener('submit', async function(e) {
e.preventDefault();
const urls = document.getElementById('urls').value.trim().split('\n').filter(url => url.trim());
const depth = parseInt(document.getElementById('depth').value);
const maxPages = parseInt(document.getElementById('maxPages').value);
const timeout = parseInt(document.getElementById('timeout').value);
const proxyConfig = document.getElementById('proxyConfig').value;
const userAgent = document.getElementById('userAgent').value;
const extractNames = document.getElementById('extractNames').checked;
const extractPhones = document.getElementById('extractPhones').checked;
const extractEmails = document.getElementById('extractEmails').checked;
const extractLinks = document.getElementById('extractLinks').checked;
const extractStructure = document.getElementById('extractStructure').checked;
const screenshots = document.getElementById('screenshots').checked;
if (urls.length === 0) {
alert('Please enter at least one URL');
return;
}
// Validate URLs
const invalidUrls = urls.filter(url => {
try {
new URL(url);
return false;
} catch {
return true;
}
});
if (invalidUrls.length > 0) {
alert(`Invalid URLs detected:\n${invalidUrls.join('\n')}`);
return;
}
// Show loading state
loadingIndicator.classList.remove('hidden');
resultsContainer.classList.add('hidden');
noResults.classList.add('hidden');
currentDepth.textContent = depth;
currentTimeout.textContent = timeout;
currentMaxPages.textContent = maxPages;
progressFill.style.width = '0%';
try {
// In a real implementation, this would call your backend API
// which would use Crawlee with Playwright for headless browsing
const results = await simulateCrawleeCrawl(urls, depth, maxPages, timeout, {
extractNames,
extractPhones,
extractEmails,
extractLinks,
extractStructure,
screenshots,
proxyConfig,
userAgent
});
displayResults(results);
} catch (error) {
console.error('Error:', error);
alert('An error occurred during crawling. Check console for details.');
} finally {
loadingIndicator.classList.add('hidden');
}
});
exportBtn.addEventListener('click', function() {
alert('In a real implementation, this would export the extracted data as JSON');
// This would call your backend to generate a downloadable file
});
exportCSV.addEventListener('click', function() {
alert('In a real implementation, this would export the extracted data as CSV');
// This would call your backend to generate a downloadable file
});
});
</script>
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=MarcRyan/web-crawler" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>
</html>