| """googlesearch is a Python library for searching Google, easily.""" |
| from time import sleep |
| from bs4 import BeautifulSoup |
| from requests import get |
| from urllib.parse import unquote |
| from tools.googlesearch.useragentka import get_useragent |
| from curl_cffi import requests as curlreq |
| from tools.googlesearch.gettyimages import get_images |
|
|
def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    """Fetch one page of Google search results and return the HTTP response.

    Issues a GET to https://www.google.com/search with a randomized
    User-Agent and consent cookies (to skip the EU consent interstitial).
    Raises requests.HTTPError via raise_for_status() on non-2xx replies.
    """
    headers = {
        "User-Agent": get_useragent(),
        "Accept": "*/*",
    }
    # "num" asks for a couple of extra hits so filtering does not starve a page.
    params = {
        "q": term,
        "num": results + 2,
        "hl": lang,
        "start": start,
        "safe": safe,
        "gl": region,
    }
    # Pre-set consent cookies so Google serves results instead of the consent page.
    consent_cookies = {
        'CONSENT': 'PENDING+987',
        'SOCS': 'CAESHAgBEhIaAB',
    }
    resp = get(
        url="https://www.google.com/search",
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        cookies=consent_cookies,
    )
    resp.raise_for_status()
    return resp
|
|
|
|
class SearchResult:
    """A single search hit: the link, its heading, and its snippet."""

    def __init__(self, url, title, description):
        """Store the hit's URL, title text, and description snippet."""
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        """Debug representation mirroring the constructor fields."""
        fields = f"url={self.url}, title={self.title}, description={self.description}"
        return f"SearchResult({fields})"
|
|
|
|
def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine.

    Pages through Google HTML results for *term*, scraping result links
    (and, with advanced=True, dict entries with title/description/page_text)
    plus any images found on the results pages.

    Args:
        term: Query string.
        num_results: Stop once this many results have been collected.
        lang: "hl" language code sent to Google.
        proxy: Optional proxy URL; used for both http and https.
        advanced: If True, results are dicts and the first result's target
            page is fetched and its main text extracted.
        sleep_interval: Seconds to sleep between result pages.
        timeout: Per-request timeout in seconds.
        safe: Google SafeSearch setting ("active" by default).
        ssl_verify: Passed to requests' ``verify``.
        region: "gl" region code sent to Google.
        start_num: Offset of the first result to request.
        unique: If True, skip links that were already collected.

    Returns:
        dict with keys "results" (list of links, or dicts when advanced)
        and "images" (list of image dicts; falls back to get_images(term)
        when no images were scraped from the result pages).
    """

    # Only honor the proxy when it looks like an http(s) URL; same proxy
    # is used for both schemes.
    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None

    start = start_num
    fetched_results = 0   # total results collected across all pages
    fetched_links = set() # every link seen so far (used for de-dup when unique=True)
    results_list = []
    image_results = []

    while fetched_results < num_results:
        # NOTE(review): requested page size is num_results - start, which can
        # shrink (or go non-positive) as start grows by 10 per page — confirm
        # this matches the intended pagination behavior.
        resp = _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)

        soup = BeautifulSoup(resp.text, "html.parser")
        # "ezO2md" is the container class of an organic result in Google's
        # no-JS HTML layout.
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0  # results collected from this page only

        # Best-effort image scrape of the results page; failures are logged
        # and ignored so the main result loop still runs.
        try:
            all_images = soup.find_all("img")
            for img in all_images:
                img_src = img.get("src") or img.get("data-src")
                if img_src:
                    # Keep inline base64 images...
                    if img_src.startswith("data:image"):
                        image_results.append({
                            "src": img_src,
                            "alt": img.get("alt", ""),
                            "class": img.get("class", []),
                        })
                    # ...and absolute http(s) image URLs; skip everything else.
                    elif img_src.startswith("http"):
                        image_results.append({
                            "src": img_src,
                            "alt": img.get("alt", ""),
                            "class": img.get("class", []),
                        })
        except Exception as e:
            print(f"Error parsing images: {str(e)}")

        for result in result_block:
            link_tag = result.find("a", href=True)
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            description_tag = result.find("span", class_="FrIlee")

            # Only keep complete results (link + title + snippet all present).
            if link_tag and title_tag and description_tag:
                # hrefs look like "/url?q=<real-url>&...": strip the wrapper
                # and percent-decoding to recover the target URL.
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                # Duplicates are only skipped when unique=True; the link is
                # recorded in fetched_links either way.
                if link in fetched_links and unique:
                    continue
                fetched_links.add(link)
                title = title_tag.text if title_tag else ""
                description = description_tag.text if description_tag else ""

                # NOTE(review): the any(...) guard means only the FIRST
                # advanced result gets its page scraped — once one entry has
                # a 'page_text' key (all advanced entries do), every later
                # result gets page_text="". Looks like a deliberate
                # one-page-only scrape; confirm that is intended.
                if advanced and not any('page_text' in result for result in results_list):
                    try:
                        # curl_cffi with Chrome impersonation to get past
                        # basic bot detection on the target site.
                        page_scrape = curlreq.get(link, impersonate='chrome110')
                        page_scrape.encoding = 'utf-8'
                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")

                        # Heuristic cascade for the page's main content area,
                        # falling back to <body> when nothing matches.
                        main_content = (
                            page_soup.find(['article', 'main']) or
                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
                            page_soup.find('div', {'role': 'main'}) or
                            page_soup.body
                        )
                        if main_content:
                            # Drop non-content elements before extracting text.
                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
                                element.decompose()

                            text = main_content.get_text(separator=' ', strip=True)
                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
                            # Drop 1-char tokens (stray punctuation/bullets)
                            # and cap the extract at 3000 characters.
                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
                        else:
                            page_text = ""
                    except Exception as e:
                        # Scrape failures degrade to an empty page_text rather
                        # than aborting the search.
                        print(f"Error scraping {link}: {str(e)}")
                        page_text = ""
                else:
                    page_text = ""

                fetched_results += 1
                new_results += 1

                if advanced:
                    results_list.append({
                        "link": link,
                        "title": title,
                        "description": description,
                        "page_text": page_text,
                    })
                else:
                    results_list.append(link)

                if fetched_results >= num_results:
                    break

        # No usable results on this page: assume we've run out and stop
        # (also guards against looping forever on layout changes).
        if new_results == 0:
            break

        start += 10
        sleep(sleep_interval)

    # Fall back to the Getty images helper when the results pages yielded
    # no images at all.
    if image_results == [] :
        images = get_images(term)
        return {"results": results_list, "images": images}
    else:
        return {"results": results_list, "images": image_results}
|
|