import html
import re

import feedparser
import pandas as pd

# Precompiled once at module level instead of per entry inside the loop.
_TAG_RE = re.compile(r"<[^>]+>")   # strip HTML tags
_WS_RE = re.compile(r"\s+")        # collapse runs of whitespace

# Feeds queried when the caller does not supply any.
_DEFAULT_FEEDS = [
    "https://www.coindesk.com/arc/outboundfeeds/rss/",
    "https://cointelegraph.com/rss",
    "https://cryptopotato.com/feed/",
    "https://decrypt.co/feed",
]

# Column order of the returned DataFrame (kept identical to the original output).
_COLUMNS = ["title", "link", "published", "description", "source"]


def _entry_text(entry) -> str:
    """Return the richest text field available on a feedparser entry.

    Prefers the first non-empty ``content`` value, then ``summary``,
    then ``description``; returns "" when none is present.
    """
    if "content" in entry and entry.content:
        for part in entry.content:
            if part.get("value"):
                return part["value"]
    return entry.get("summary") or entry.get("description") or ""


def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
    """Fetch recent articles from crypto RSS feeds into a DataFrame.

    Parameters
    ----------
    urls : list, optional
        RSS feed URLs to query. Defaults to a built-in list of crypto
        news feeds when None.
    num_entries : int, optional
        Maximum number of entries to take from each feed (defaults to
        10; None is also treated as 10).

    Returns
    -------
    pd.DataFrame
        Columns: title, link, published, description, source. The
        description is the entry's HTML body stripped to plain text.
        Unlike a dict keyed by title, duplicate titles across feeds are
        all preserved. Feeds that fail to download/parse are skipped
        with a warning printed; an all-failure run yields an empty
        DataFrame that still carries the expected columns.
    """
    if urls is None:
        urls = _DEFAULT_FEEDS
    if num_entries is None:
        num_entries = 10

    rows = []
    for url in urls:
        try:
            feed = feedparser.parse(
                url,
                request_headers={
                    "User-Agent": "Mozilla/5.0 (CryptoNewsBot; +https://example.com)"
                },
            )
        # Narrowed from a bare `except: pass`: log and move on to the next
        # feed instead of silently swallowing everything (incl. SystemExit).
        except Exception as exc:
            print(f"[warn] failed to fetch {url}: {exc}")
            continue

        # feedparser sets `bozo` when the feed was malformed; it still
        # returns whatever entries it could salvage, so only warn.
        if getattr(feed, "bozo", 0):
            print("[warn] bozo_exception:", getattr(feed, "bozo_exception", None))

        for entry in feed.entries[:num_entries]:
            raw_html = _entry_text(entry)
            # Unescape entities, drop tags, then normalize whitespace.
            plain = _TAG_RE.sub(" ", html.unescape(raw_html))
            plain = _WS_RE.sub(" ", plain).strip()
            rows.append(
                {
                    "title": entry.get("title", "").strip(),
                    "link": entry.get("link", ""),
                    "published": entry.get("published", ""),
                    "description": plain,
                    "source": url,
                }
            )

    # Explicit columns keep the schema stable even when no feed succeeded.
    return pd.DataFrame(rows, columns=_COLUMNS)