Spaces:
Sleeping
Sleeping
| """ | |
| LLM-based note generation module. | |
| Uses Google Gemini to generate structured study notes from transcripts. | |
| """ | |
| from typing import Dict, List, Optional | |
| import google.generativeai as genai | |
| from src.utils.logger import setup_logger | |
| from src.utils.config import settings | |
| logger = setup_logger(__name__) | |
class NoteGenerator:
    """Generates structured study notes using LLM.

    Wraps a Gemini ``GenerativeModel`` and offers helpers to turn transcript
    text (as a whole or segment by segment) into Markdown notes, to summarize
    existing notes, and to prepend a metadata header to finished notes.
    """

    # System prompt for note generation
    SYSTEM_PROMPT = """You are an expert educational note-taker. Your task is to convert video transcripts into clear, structured study notes.
Follow these guidelines:
1. Create a clear hierarchical structure with section titles
2. Use bullet points for key information
3. Highlight important concepts and definitions
4. Extract key terms and explain them
5. Be concise but comprehensive
6. Focus on educational content, skip irrelevant parts
7. Use proper Markdown formatting
Format the output as follows:
# [Main Topic/Title]
## [Section 1 Title]
- Key point 1
- Key point 2
- Sub-point if needed
- **Important term**: Definition or explanation
## [Section 2 Title]
...
## Key Concepts
- **Concept 1**: Explanation
- **Concept 2**: Explanation
"""

    def __init__(self, api_key: Optional[str] = None, model_name: str = "gemini-2.5-flash"):
        """
        Initialize the note generator.

        Args:
            api_key: Google Gemini API key. When omitted (or otherwise falsy),
                ``settings.google_api_key`` is used instead.
            model_name: Gemini model to use.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = model_name

        # Configure Gemini
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(model_name)
        logger.info(f"Initialized NoteGenerator with model: {model_name}")

    def generate_notes_from_segment(self, segment_text: str) -> str:
        """
        Generate notes from a single transcript segment.

        This method never raises for generation failures: any exception is
        logged and turned into a Markdown "## Error" section so that one bad
        segment does not abort a multi-segment run.

        Args:
            segment_text: Text segment to process.

        Returns:
            Generated notes in Markdown format (stripped), or an error
            section describing the failure.
        """
        try:
            prompt = f"{self.SYSTEM_PROMPT}\n\nTranscript:\n{segment_text}\n\nGenerate structured study notes:"
            logger.debug(f"Generating notes for segment ({len(segment_text)} chars)")
            response = self.model.generate_content(prompt)
            notes = response.text
            logger.debug(f"Generated {len(notes)} characters of notes")
            return notes.strip()
        except Exception as e:
            # Deliberate best-effort: report the failure inline instead of raising.
            logger.error(f"Failed to generate notes: {e}")
            return f"## Error\nFailed to generate notes for this segment: {str(e)}"

    def generate_notes_from_segments(self, segments: List[Dict]) -> str:
        """
        Generate notes from multiple transcript segments.

        Each segment is a dict read via its ``'text'`` key and, optionally,
        its ``'start'`` key (seconds). Segments with empty or missing text are
        skipped entirely.

        Args:
            segments: List of transcript segments.

        Returns:
            Combined notes in Markdown format, with the pieces joined by
            blank lines.
        """
        all_notes = []
        logger.info(f"Generating notes from {len(segments)} segments")

        for i, segment in enumerate(segments, 1):
            logger.info(f"Processing segment {i}/{len(segments)}")
            segment_text = segment.get('text', '')
            if not segment_text:
                continue

            # Add timestamp if available
            if 'start' in segment:
                timestamp = self._format_timestamp(segment['start'])
                all_notes.append(f"\n---\n**Timestamp: {timestamp}**\n")

            # Generate notes for this segment
            notes = self.generate_notes_from_segment(segment_text)
            all_notes.append(notes)

        # Combine all notes
        combined_notes = "\n\n".join(all_notes)
        logger.info(f"Generated total of {len(combined_notes)} characters")
        return combined_notes

    def generate_notes_from_full_transcript(
        self,
        transcript_text: str,
        video_title: str = "Educational Video"
    ) -> str:
        """
        Generate notes from full transcript (for shorter videos).

        Args:
            transcript_text: Full transcript text.
            video_title: Title of the video; included in the prompt and used
                as the top-level heading of the returned notes.

        Returns:
            Generated notes in Markdown format, prefixed with a
            ``# <video_title>`` heading.

        Raises:
            RuntimeError: If generation fails for any reason. The original
                exception is attached as ``__cause__``.
        """
        try:
            prompt = (
                f"{self.SYSTEM_PROMPT}\n"
                f"Video Title: {video_title}\n"
                "Transcript:\n"
                f"{transcript_text}\n"
                "Generate comprehensive structured study notes:"
            )
            logger.info(f"Generating notes from full transcript ({len(transcript_text)} chars)")
            response = self.model.generate_content(prompt)
            notes = response.text

            # Add header with video title
            final_notes = f"# {video_title}\n\n{notes.strip()}"
            logger.info(f"Generated {len(final_notes)} characters of notes")
            return final_notes
        except Exception as e:
            logger.error(f"Failed to generate notes from full transcript: {e}")
            # Chain the cause so the underlying traceback is preserved.
            raise RuntimeError(f"Note generation failed: {str(e)}") from e

    def generate_summary(self, notes: str) -> str:
        """
        Generate a brief summary of the notes.

        Failures are logged and reported through a fixed fallback string
        rather than an exception.

        Args:
            notes: Generated study notes.

        Returns:
            Brief summary, or ``"Summary generation failed."`` on error.
        """
        try:
            prompt = (
                "Provide a brief 2-3 sentence summary of these study notes:\n"
                f"{notes}\n"
                "Summary:"
            )
            response = self.model.generate_content(prompt)
            summary = response.text.strip()
            return summary
        except Exception as e:
            logger.error(f"Failed to generate summary: {e}")
            return "Summary generation failed."

    @staticmethod
    def _format_timestamp(seconds: float) -> str:
        """Format seconds into MM:SS or HH:MM:SS.

        Declared as a static method because it takes no ``self`` but is
        invoked as ``self._format_timestamp(...)``; without the decorator the
        instance would be passed as an extra positional argument and the call
        would raise ``TypeError``.

        Args:
            seconds: Offset or duration in seconds; fractional parts are
                truncated.

        Returns:
            ``HH:MM:SS`` when at least one full hour is present, otherwise
            ``MM:SS``. All fields are zero-padded to two digits.
        """
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        return f"{minutes:02d}:{secs:02d}"

    def format_final_notes(
        self,
        notes: str,
        video_title: str,
        video_url: str,
        duration: int
    ) -> str:
        """
        Format final notes with metadata.

        Args:
            notes: Generated notes.
            video_title: Video title, used as the top-level heading.
            video_url: Original YouTube URL, rendered as a Markdown link.
            duration: Video duration in seconds.

        Returns:
            Formatted notes with metadata header (title, source link,
            duration, generator tag) followed by ``notes`` unchanged.
        """
        duration_str = self._format_timestamp(duration)
        header = (
            f"# {video_title}\n"
            "---\n"
            f"**Source:** [{video_url}]({video_url})\n"
            f"**Duration:** {duration_str}\n"
            "**Generated:** AI Study Notes\n"
            "---\n"
        )
        return header + notes