| import logging
|
| import re
|
| from typing import Optional, List, Dict, Any
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
class LocalSummarizer:

    """

    Singleton-style wrapper for local LLM summarization.

    Enhances extraction using robust heuristic rules and LLM generation with retry logic.

    The tokenizer/model are lazily loaded once and shared across calls via class attributes.

    """

    # Lazily-populated class-level singletons. ``_model`` has three states:
    # None (not yet loaded), a model instance (loaded), or False (load failed;
    # never retried, so repeated calls don't pay the load cost again).
    _tokenizer = None

    _model = None

    _model_name = "google/flan-t5-small"



    @classmethod

    def _load_model(cls):

        """Lazy load the model and tokenizer directly.

        On failure, sets ``cls._model = False`` as a "don't retry" sentinel and
        logs the error instead of raising.
        """

        if cls._model is None:

            try:

                from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

                import transformers

                logger.info(f"⏳ Loading summarization model ({cls._model_name})...")



                # Silence transformers' own logging during the (potentially noisy)
                # download/load, and restore the previous verbosity even on failure.
                old_verbosity = transformers.logging.get_verbosity()

                transformers.logging.set_verbosity_error()

                try:

                    cls._tokenizer = AutoTokenizer.from_pretrained(cls._model_name)

                    cls._model = AutoModelForSeq2SeqLM.from_pretrained(cls._model_name)

                finally:

                    transformers.logging.set_verbosity(old_verbosity)

                logger.info("✅ Summarization model loaded successfully")

            except Exception as e:

                logger.error(f"❌ Failed to load summarization model: {e}")

                cls._model = False



    @staticmethod

    def _strip_yaml_frontmatter(text: str) -> str:

        """Strip a YAML frontmatter block enclosed in --- from the start of *text*.

        Anchored at the beginning of the document and limited to one substitution,
        so later ``---`` fences (e.g. markdown horizontal rules) are untouched.
        """

        return re.sub(r'\A---\s*\n.*?\n---\s*\n', '', text, count=1, flags=re.DOTALL)



    @staticmethod

    def _extract_candidates(text: str) -> List[str]:

        """Collect candidate description snippets from a model-card body.

        Candidates come from (in order): description-like markdown headings,
        inline ``Description:``/``Overview:`` labels, "fine-tuned version of"
        style sentences, and finally the first substantial plain paragraph.
        """

        candidates = []



        # 1) Bodies of description-like markdown sections (up to the next heading).
        heading_matches = re.finditer(r'^#+\s*(?:\d+[\.\)]?\s*)?(Description|Model [dD]escription|Model Overview|Overview|Introduction|Summary|モデル概要|Model Details)[^\n]*\n(.*?)(?=\n#+\s|\Z)', text, flags=re.MULTILINE | re.DOTALL)

        for match in heading_matches:

            if match.group(2).strip():

                candidates.append(match.group(2).strip())



        # 2) Inline "Description:" / "Overview:" labels up to the next blank line.
        inline_matches = re.finditer(r'(?:Description:|Overview:|### Description:)\s*(.*?)(?=\n\n|\Z)', text, flags=re.DOTALL | re.IGNORECASE)

        for match in inline_matches:

            if match.group(1).strip():

                candidates.append(match.group(1).strip())



        # 3) Typical auto-generated "fine-tuned version of ..." opening sentences.
        tuned_matches = re.finditer(r'^(?:The .*model is a .*|This model is a fine-tuned version of.*|This is a fine-tuned.*)', text, flags=re.MULTILINE | re.IGNORECASE)

        for match in tuned_matches:

            candidates.append(match.group(0).strip())



        # 4) Fallback: the first substantial plain-prose paragraph
        # (skipping headings, admonitions, badge/link walls).
        html_stripped = re.sub(r'<[^>]+>', '', text)

        paragraphs = re.split(r'\n\s*\n', html_stripped)

        for p in paragraphs:

            p = p.strip()

            if not p:

                continue

            if p.startswith('#'):

                continue

            # Markdown admonitions and badge anchors are boilerplate, not prose.
            if p.startswith('[!') or p.startswith('<a href') or p.startswith('> [!'):

                continue

            # Paragraphs dominated by links are navigation/badges, not a description.
            if p.count('](') > 3 or p.count('http') > 3:

                continue

            if len(p) > 50:

                candidates.append(p)

                break



        return candidates



    @staticmethod

    def _score_candidate(text: str) -> float:

        """Heuristically score how description-like a candidate snippet is.

        Rewards descriptive length and "is a / trained on"-style phrasing;
        penalizes benchmark tables, install instructions, and list/code-heavy text.
        """

        score = 0.0

        text_lower = text.lower()



        # Reward a plausible description length.
        if 50 < len(text) < 1000:

            score += 10.0



        # Reward descriptive phrasing.
        if "is a" in text_lower or "fine-tuned version of" in text_lower or "trained on" in text_lower or "designed for" in text_lower:

            score += 20.0



        # Penalize benchmark/leaderboard sections.
        if "leaderboard" in text_lower or "benchmark" in text_lower or "results" in text_lower:

            score -= 50.0

        # Penalize installation/usage instructions.
        if "install" in text_lower or "how to run" in text_lower or "pip install" in text_lower or "read our guide" in text_lower:

            score -= 30.0



        # Penalize tables, code fences, and bullet-heavy blocks.
        if text.count('|') > 5 or text.count('```') >= 1 or text.count('\n- ') > 2 or text.count('\n* ') > 2:

            score -= 50.0



        return score



    @staticmethod

    def _clean_text(text: str) -> str:

        """Normalize a candidate snippet to plain single-line prose.

        Best-effort HTML stripping via BeautifulSoup (skipped entirely if bs4 is
        unavailable), then markdown/code/table removal, boilerplate line filtering,
        and whitespace collapsing.
        """

        try:

            # Import inside the try so a missing bs4 degrades gracefully
            # instead of raising ImportError.
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(text, "html.parser")

            for tag in soup(["style", "script"]):

                tag.decompose()

            text = soup.get_text(separator=' ')

        except Exception:

            pass



        # Remove markdown images entirely.
        text = re.sub(r'!\[.*?\]\([^)]+\)', '', text)

        # Replace markdown links with their visible label.
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

        # Remove fenced code blocks, then inline code spans.
        text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

        text = re.sub(r'`[^`]*`', '', text)

        # Remove markdown table cells and separator rows.
        text = re.sub(r'\|.*?\|', '', text)

        text = re.sub(r'(?m)^[-:| ]+$', '', text)



        # Drop known auto-generated model-card boilerplate lines.
        lines = text.split('\n')

        clean_lines = []

        for line in lines:

            line_lower = line.lower()

            if 'generated automatically' in line_lower and 'model card' in line_lower:

                continue

            if 'completed by the model author' in line_lower:

                continue

            if 'model cards for model reporting' in line_lower:

                continue

            clean_lines.append(line)

        text = '\n'.join(clean_lines)



        # Collapse all whitespace runs to single spaces.
        text = re.sub(r'\s+', ' ', text).strip()



        return text



    @classmethod

    def _generate(cls, prompt: str, max_output_chars: int) -> Optional[str]:

        """Run the local model on *prompt* and return a trimmed summary.

        Returns None if the model is unavailable or generation fails.
        """

        if cls._model is None:

            cls._load_model()

        # Covers both the False "load failed" sentinel and a missing tokenizer.
        if not cls._model or not cls._tokenizer:

            return None



        try:

            inputs = cls._tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

            generate_kwargs = {

                "max_length": 128,

                "min_length": 15,

                # Deterministic beam search; penalize repetition common in small models.
                "do_sample": False,

                "num_beams": 4,

                "early_stopping": True,

                "repetition_penalty": 2.0

            }

            summary_ids = cls._model.generate(inputs["input_ids"], **generate_kwargs)

            summary = cls._tokenizer.decode(summary_ids[0], skip_special_tokens=True)



            summary = summary.strip()



            # Some prompts make the model echo an "Output:" prefix — strip it.
            if summary.lower().startswith("output:"):

                summary = re.sub(r'^Output:\s*', '', summary, flags=re.IGNORECASE)



            # Truncate with an ellipsis to respect the caller's character budget.
            if len(summary) > max_output_chars:

                return summary[:max_output_chars-3] + "..."

            return summary

        except Exception as e:

            logger.warning(f"⚠️ Generation failed: {e}")

            return None



    @staticmethod

    def _is_valid_summary(summary: str, model_id: str) -> bool:

        """Reject degenerate summaries: too short, name-echoes, markup remnants,

        install instructions, or metadata dumps."""

        if not summary or len(summary) < 15:

            return False



        summary_lower = summary.lower()

        model_name = model_id.split('/')[-1].lower()



        # The model sometimes just parrots the model name back.
        if summary_lower == model_name or summary_lower == f"{model_name} model":

            return False



        # Leftover markdown/HTML markers mean cleaning failed upstream.
        if '#' in summary or '<' in summary or '>' in summary or '*' in summary:

            return False



        # Usage/installation text is not a description.
        if summary_lower.startswith("to install") or summary_lower.startswith("how to") or "pip install" in summary_lower:

            return False



        # YAML/metadata fragments are not prose.
        if "- type:" in summary_lower or "number of parameters:" in summary_lower:

            return False



        return True



    @classmethod

    def summarize(cls, text: str, max_output_chars: int = 332, model_id: str = "") -> Optional[str]:

        """

        Robustly extract and summarize a model description.

        Pipeline: strip YAML frontmatter → extract/score candidate snippets →
        clean the best one → LLM summarization (two prompt attempts, validated)
        → fall back to the first sentences of the cleaned extract.

        Returns None for empty input or when nothing usable survives cleaning.

        """

        if not text or not text.strip():

            return None



        text_without_yaml = cls._strip_yaml_frontmatter(text)



        candidates = cls._extract_candidates(text_without_yaml)

        if not candidates:

            # No structured candidate found — fall back to the document head.
            candidates = [text_without_yaml[:1000]]



        scored_candidates = [(c, cls._score_candidate(c)) for c in candidates]

        best_candidate = max(scored_candidates, key=lambda x: x[1])[0]



        cleaned_text = cls._clean_text(best_candidate)

        if not cleaned_text.strip():

            return None



        # Keep the prompt short: the first few sentences carry the description.
        sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)

        short_text = " ".join(sentences[:3])



        prompt1 = f"In one sentence, explain what this AI model is designed to do based on this description:\n\n{short_text}"

        summary = cls._generate(prompt1, max_output_chars)

        if summary and cls._is_valid_summary(summary, model_id):

            return summary



        # Retry once with the full cleaned text and a stricter instruction.
        logger.info("⚠️ First summary invalid, retrying with stricter prompt.")

        prompt2 = f"Summarize the main purpose of this AI model in one complete sentence:\n\n{cleaned_text}"

        summary2 = cls._generate(prompt2, max_output_chars)

        if summary2 and cls._is_valid_summary(summary2, model_id):

            return summary2



        # Last resort: hand back the first two extracted sentences, truncated.
        logger.info("⚠️ Both LLM summaries invalid, falling back to cleaned extracted text.")

        fallback_summary = " ".join(sentences[:2])

        if len(fallback_summary) > max_output_chars:

            return fallback_summary[:max_output_chars-3] + "..."

        return fallback_summary
|
|
|