import html
import re

import feedparser
import pandas as pd

# Precompiled once at module level instead of per entry inside the loop.
_TAG_RE = re.compile(r"<[^>]+>")   # strip HTML tags
_WS_RE = re.compile(r"\s+")        # collapse runs of whitespace

# Feeds queried when the caller does not supply any.
_DEFAULT_FEEDS = [
    "https://www.coindesk.com/arc/outboundfeeds/rss/",
    "https://cointelegraph.com/rss",
    "https://cryptopotato.com/feed/",
    "https://decrypt.co/feed",
]

# Column order of the returned DataFrame (kept identical to the original output).
_COLUMNS = ["title", "link", "published", "description", "source"]


def _entry_text(entry) -> str:
    """Return the richest text field available on a feedparser entry.

    Prefers the first non-empty ``content`` value, then ``summary``,
    then ``description``; returns "" when none is present.
    """
    if "content" in entry and entry.content:
        for part in entry.content:
            if part.get("value"):
                return part["value"]
    return entry.get("summary") or entry.get("description") or ""


def scrape_crypto_news(urls: list = None, num_entries: int = 10) -> pd.DataFrame:
    """Fetch recent articles from crypto RSS feeds into a DataFrame.

    Parameters
    ----------
    urls : list, optional
        RSS feed URLs to query. Defaults to a built-in list of crypto
        news feeds when None.
    num_entries : int, optional
        Maximum number of entries to take from each feed (defaults to
        10; None is also treated as 10).

    Returns
    -------
    pd.DataFrame
        Columns: title, link, published, description, source. The
        description is the entry's HTML body stripped to plain text.
        Unlike a dict keyed by title, duplicate titles across feeds are
        all preserved. Feeds that fail to download/parse are skipped
        with a warning printed; an all-failure run yields an empty
        DataFrame that still carries the expected columns.
    """
    if urls is None:
        urls = _DEFAULT_FEEDS
    if num_entries is None:
        num_entries = 10

    rows = []
    for url in urls:
        try:
            feed = feedparser.parse(
                url,
                request_headers={
                    "User-Agent": "Mozilla/5.0 (CryptoNewsBot; +https://example.com)"
                },
            )
        # Narrowed from a bare `except: pass`: log and move on to the next
        # feed instead of silently swallowing everything (incl. SystemExit).
        except Exception as exc:
            print(f"[warn] failed to fetch {url}: {exc}")
            continue

        # feedparser sets `bozo` when the feed was malformed; it still
        # returns whatever entries it could salvage, so only warn.
        if getattr(feed, "bozo", 0):
            print("[warn] bozo_exception:", getattr(feed, "bozo_exception", None))

        for entry in feed.entries[:num_entries]:
            raw_html = _entry_text(entry)
            # Unescape entities, drop tags, then normalize whitespace.
            plain = _TAG_RE.sub(" ", html.unescape(raw_html))
            plain = _WS_RE.sub(" ", plain).strip()
            rows.append(
                {
                    "title": entry.get("title", "").strip(),
                    "link": entry.get("link", ""),
                    "published": entry.get("published", ""),
                    "description": plain,
                    "source": url,
                }
            )

    # Explicit columns keep the schema stable even when no feed succeeded.
    return pd.DataFrame(rows, columns=_COLUMNS)