| from langchain.tools import tool |
|
|
| try: |
| from youtube_transcript_api import YouTubeTranscriptApi |
| except Exception: |
| YouTubeTranscriptApi = None |
|
|
| import re |
| from urllib.parse import urlparse, parse_qs |
|
|
|
|
| def _extract_video_id(url_or_id: str) -> str | None: |
| s = (url_or_id or "").strip() |
| if re.fullmatch(r"[A-Za-z0-9_-]{11}", s): |
| return s |
| u = urlparse(s) |
| |
| if u.netloc.endswith("youtu.be"): |
| vid = u.path.strip("/").split("/")[0] |
| return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None |
| |
| qs = parse_qs(u.query or "") |
| if "v" in qs: |
| vid = qs["v"][0] |
| return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None |
| |
| for pref in ("/embed/", "/shorts/", "/v/"): |
| if u.path.startswith(pref): |
| vid = u.path[len(pref):].split("/")[0] |
| return vid if re.fullmatch(r"[A-Za-z0-9_-]{11}", vid) else None |
| return None |
|
|
|
|
@tool
def extract_youtube_transcript(url: str, chars: int = 1000) -> str:
    """
    Simple YouTube transcript fetcher.

    Input:
    - url: Regular YouTube URL (or the 11-char video_id).
    - chars: Return the first `chars` characters of the transcript
      (values below zero are treated as zero).

    Output:
    - String with the transcript (trimmed to `chars`), or an error string:
      "yt_error:<reason>"
    """
    # The optional dependency failed to import at module load time.
    if YouTubeTranscriptApi is None:
        return "yt_error:missing_dependency"

    # URL parsing can raise ValueError on malformed input (e.g. unbalanced
    # IPv6 brackets); report that as a missing id instead of letting the
    # exception escape the tool.
    try:
        video_id = _extract_video_id(url)
    except ValueError:
        video_id = None
    if not video_id:
        return "yt_error:id_not_found"

    # Broad catch is deliberate: this is the tool boundary and the contract
    # is to return a "yt_error:..." string rather than raise.
    try:
        # Validate the limit first so a bad `chars` fails before any
        # network request is made.
        limit = max(0, int(chars))

        snippets = YouTubeTranscriptApi().fetch(video_id)

        pieces = []
        for snippet in snippets:
            # Snippets may be objects exposing `.text` or plain dicts.
            text = getattr(snippet, "text", None)
            if text is None and isinstance(snippet, dict):
                text = snippet.get("text")
            if not text:
                continue
            # Flatten in-snippet line breaks; skip whitespace-only text.
            cleaned = text.replace("\n", " ").strip()
            if cleaned:
                pieces.append(cleaned)

        return " ".join(pieces)[:limit]
    except Exception as e:
        return f"yt_error:{type(e).__name__}:{e}"