tarinmoy commited on
Commit
93e8e20
·
verified ·
1 Parent(s): 406faad

Upload 20 files

Browse files
Files changed (20) hide show
  1. .env +20 -0
  2. .env.example +27 -0
  3. .gitignore +7 -0
  4. README.md +26 -6
  5. README_LOCAL.md +94 -0
  6. agent.py +328 -0
  7. app.py +642 -0
  8. asset_checker.py +198 -0
  9. debug_assembly.py +51 -0
  10. main.py +192 -0
  11. media_fetcher.py +206 -0
  12. packages.txt +1 -0
  13. requirements.txt +25 -0
  14. test_hindi.py +21 -0
  15. test_moviepy.py +7 -0
  16. test_moviepy_safe.py +15 -0
  17. test_pexels.py +35 -0
  18. test_repair.py +22 -0
  19. video_assembler.py +806 -0
  20. voice_generator.py +259 -0
.env ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Autonomous Short-Form Video Engine — Environment Variables
3
+ # ============================================================
4
+
5
+ # ── AI Brain (Nemotron-3 Super via OpenRouter) ─────────────
6
+ # SECURITY: a live key was committed on this line — revoke it at https://openrouter.ai/keys and keep the real value out of version control.
+ OPENROUTER_API_KEY=your_openrouter_api_key_here
7
+
8
+ # ── Asset Check (Nemotron-2 VL via NVIDIA NIM) ────────────
9
+ # Currently skipping VL QA (use checkbox in UI) or get key from build.nvidia.com
10
+ NVIDIA_API_KEY=your_nvidia_api_key_here
11
+
12
+ # ── Media (Pexels) ───────────────────────────────────────
13
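+ # SECURITY: a live key was committed on this line — regenerate it at https://www.pexels.com/api/ and keep the real value out of version control.
+ PEXELS_API_KEY=your_pexels_api_key_here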
14
+
15
+ # ── Voiceover (Edge TTS - Free & No Key Needed) ──────────
16
+ # We are using Edge-TTS because you don't have a Google Cloud key.
17
+ USE_EDGE_TTS=true
18
+
19
+ # ── Optional: Background music volume (0.0 to 1.0) ───────
20
+ BGM_VOLUME=0.08
.env.example ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Autonomous Short-Form Video Engine — Environment Variables
3
+ # ============================================================
4
+ # 1. Copy this file to .env
5
+ # 2. Fill in your actual API keys below
6
+ # 3. Never commit .env to git
7
+
8
+ # ── AI Brain (Nemotron-3 Super via OpenRouter) ─────────────
9
+ # Get from: https://openrouter.ai/keys (free, no credit card)
10
+ OPENROUTER_API_KEY=your_openrouter_api_key_here
11
+
12
+ # ── Asset Check (Nemotron-2 VL via NVIDIA NIM) ────────────
13
+ # Get from: https://build.nvidia.com (free credits)
14
+ NVIDIA_API_KEY=your_nvidia_api_key_here
15
+
16
+ # ── Media (Pexels) ───────────────────────────────────────
17
+ # Get from: https://www.pexels.com/api/ (free)
18
+ PEXELS_API_KEY=your_pexels_api_key_here
19
+
20
+ # ── Voiceover (Google Cloud TTS) ─────────────────────────
21
+ # 1. Create a GCP project, enable Cloud Text-to-Speech API
22
+ # 2. Create a service account, download the JSON key
23
+ # 3. Set the path to that JSON file here:
24
+ GOOGLE_APPLICATION_CREDENTIALS=path/to/your-gcp-key.json
25
+
26
+ # ── Optional: Background music volume (0.0 to 1.0) ───────
27
+ BGM_VOLUME=0.08
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ assets/
3
+ output/
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ .DS_Store
README.md CHANGED
@@ -1,13 +1,33 @@
1
  ---
2
- title: AutoShorts Engine
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AutoShorts - Autonomous Video Engine
3
+ emoji: 🎬
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
+ # 🎬 AutoShorts: Autonomous Video Engine
14
+
15
+ This is the official deployment of **AutoShorts**, an elite autonomous engine for viral short-form content.
16
+
17
+ ## 🚀 Deployment Instructions
18
+
19
+ To run this Space, you must add the following **Secret Environment Variables**:
20
+
21
+ 1. `OPENROUTER_API_KEY`: Your OpenRouter API key for the AI brain.
22
+ 2. `PEXELS_API_KEY`: Your PEXELS API key for fetching video clips.
23
+ 3. `GOOGLE_APPLICATION_CREDENTIALS_JSON`: (Optional) Your Google Cloud Service Account JSON for premium TTS.
24
+ 4. `USE_EDGE_TTS`: Set to `true` if you want to use the free Edge TTS instead of Google Cloud.
25
+
26
+ ## 🛠 Features
27
+ - **AI Brain**: Powered by Llama 3.3, Gemma 3 and other free-tier models via OpenRouter.
28
+ - **Visuals**: Dynamic portrait clips from Pexels.
29
+ - **Voice**: Humanized text-to-speech.
30
+ - **Assembly**: Professional video rendering with MoviePy.
31
+
32
+ ---
33
+ Developed with ❤️ by the AutoShorts Team.
README_LOCAL.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎬 Autonomous Short-Form Video Engine
2
+
3
+ > AI-powered pipeline that transforms a niche into a production-ready short-form video in minutes.
4
+
5
+ **Powered by:** OpenRouter LLMs · edge-tts · Pexels API · MoviePy · Gradio
6
+
7
+ ---
8
+
9
+ ## ⚡ Quick Start
10
+
11
+ ### 1. Install Python Dependencies
12
+ ```bash
13
+ pip install -r requirements.txt
14
+ ```
15
+
16
+ ### 2. Install FFmpeg (required for MoviePy)
17
+ Download from https://ffmpeg.org/download.html → add to your system PATH.
18
+
19
+ ### 3. Configure API Keys
20
+ ```bash
21
+ copy .env.example .env
22
+ # Then edit .env with your keys:
23
+ # OPENROUTER_API_KEY=... → https://openrouter.ai/keys
24
+ # PEXELS_API_KEY=... → https://www.pexels.com/api/
25
+ ```
26
+
27
+ ### 4. Launch the Web UI
28
+ ```bash
29
+ python app.py
30
+ # Open: http://localhost:7860
31
+ ```
32
+
33
+ ### 4b. Or use the CLI
34
+ ```bash
35
+ # Full video generation
36
+ python main.py --niche "AI Tools"
37
+
38
+ # With specific topic
39
+ python main.py --niche "Motivation" --topic "Why 99% fail at their goals"
40
+
41
+ # JSON only (no rendering)
42
+ python main.py --niche "Wealth" --dry-run
43
+ ```
44
+
45
+ ---
46
+
47
+ ## 🏗 Architecture
48
+
49
+ ```
50
+ Niche Input
51
+
52
+ agent.py ← OpenRouter LLM (Script + Scenes + SEO in JSON)
53
+
54
+ voice_generator.py ← edge-tts (free neural TTS per scene)
55
+ media_fetcher.py ← Pexels API (portrait video/image per scene)
56
+
57
+ video_assembler.py ← MoviePy (9:16 MP4 with captions + audio)
58
+
59
+ output/ folder ← Final MP4 + metadata JSON
60
+ ```
61
+
62
+ ---
63
+
64
+ ## 📁 Project Structure
65
+
66
+ | File | Purpose |
67
+ |---|---|
68
+ | `agent.py` | OpenRouter LLM brain — generates strict JSON |
69
+ | `voice_generator.py` | Neural TTS via edge-tts (calm/energetic/monotone) |
70
+ | `media_fetcher.py` | Pexels video/image fetcher with local caching |
71
+ | `video_assembler.py` | MoviePy assembly — captions, 9:16 crop, audio sync |
72
+ | `main.py` | CLI orchestrator |
73
+ | `app.py` | Gradio web UI |
74
+ | `requirements.txt` | Python dependencies |
75
+ | `.env.example` | API key template |
76
+
77
+ ---
78
+
79
+ ## 🎯 Supported Niches
80
+
81
+ - AI Tools
82
+ - Motivation
83
+ - Wealth & Finance
84
+ - Mind-Blowing Facts
85
+ - Productivity
86
+ - Crypto & Web3
87
+
88
+ ---
89
+
90
+ ## 📤 Output
91
+
92
+ Each run produces:
93
+ - `output/<title>_<timestamp>.mp4` — 1080×1920 portrait video (Shorts/Reels ready)
94
+ - `output/metadata_<timestamp>.json` — Full AI-generated content + SEO metadata
agent.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ agent.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — AI Brain
5
+ Uses free-tier LLMs via OpenRouter (see MODELS for the fallback order) to transform
6
+ a niche topic into a production-ready structured JSON package.
7
+ ─────────────────────────────────────────────────────────────
8
+ """
9
+
10
+ import os
11
+ import re
12
+ import json
13
+ import time
14
+ import logging
15
+ from openai import OpenAI
16
+ from dotenv import load_dotenv
17
+
18
+ load_dotenv()
19
+ logger = logging.getLogger(__name__)
20
+
21
# ── OpenRouter client setup ───────────────────────────────
# Read once at import time; the generation functions in this module raise
# EnvironmentError when this value is empty.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")

# Fallback models (VERIFIED ALIVE for free tier)
# Order matters: when rotating through models, earlier entries are tried first.
MODELS = [
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-small-3.1-24b-instruct:free",
    "google/gemma-3-27b-it:free",
    "minimax/minimax-m2.5:free",
    "nvidia/nemotron-3-super-120b-a12b:free",
]
DEFAULT_MODEL = MODELS[0]

# OpenAI SDK client pointed at the OpenRouter base URL; the extra headers are
# sent with every request made through this client.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
    default_headers={
        "HTTP-Referer": "https://automate-ai.local",
        "X-Title": "Automate AI Video Engine",
    },
)

# ── Language Support ──────────────────────────────────────
# Maps a language name to the description interpolated into the prompt
# templates; callers fall back to "English" for names missing from this map.
LANGUAGE_MAP = {
    "English": "English (Standard)",
    "Hindi": "Hindi (written in Devanagari script)",
    "Hinglish": "Hinglish (a natural mix of Hindi and English, written in Latin/Roman script)",
}
49
+
50
+ # ── System Prompt ─────────────────────────────────────────
51
+ SYSTEM_PROMPT = """You are a short-form video scriptwriter.
52
+ Transform the topic into a production-ready JSON package in {language_desc}.
53
+
54
+ Output ONLY valid JSON. No markdown, no filler.
55
+
56
+ Schema:
57
+ {{
58
+ "niche": "string",
59
+ "hook": {{ "text": "string (≤15 words in {language_desc})", "duration_seconds": 3 }},
60
+ "scenes": [
61
+ {{
62
+ "scene_number": 1,
63
+ "type": "hook | meat | cta",
64
+ "script_text": "vocal script in {language_desc}",
65
+ "on_screen_text": "short caption in {language_desc}",
66
+ "duration_seconds": 5,
67
+ "pexels_keywords": ["English keywords"],
68
+ "visual_description": "English description"
69
+ }}
70
+ ],
71
+ "voiceover_settings": {{ "mood": "energetic", "gender_preference": "male" }},
72
+ "seo": {{ "title": "Title in {language_desc}", "description": "...", "hashtags": [], "keywords": [] }},
73
+ "total_duration_seconds": 60
74
+ }}
75
+ """
76
+
77
+ USER_TEMPLATE = "Create a short-form video JSON for: {niche} (Style: {style}, Lang: {language_desc})."
78
+
79
+ SCRIPT_TO_JSON_PROMPT = """You are a script-to-video parser.
80
+ Convert the provided raw text script into a scene-by-scene package.
81
+ Target Language for text/captions: {language_desc}
82
+
83
+ Provide:
84
+ 1. script_text: Exact portion in {language_desc}.
85
+ 2. on_screen_text: Small caption in {language_desc}.
86
+ 3. pexels_keywords: Search terms (ALWAYS English).
87
+ 4. visual_description: Visuals (ALWAYS English).
88
+
89
+ JSON ONLY matching:
90
+ {{
91
+ "niche": "Custom Script",
92
+ "scenes": [
93
+ {{
94
+ "scene_number": 1,
95
+ "script_text": "string",
96
+ "on_screen_text": "string",
97
+ "duration_seconds": 5,
98
+ "pexels_keywords": ["kw1"],
99
+ "visual_description": "English description"
100
+ }}
101
+ ],
102
+ "voiceover_settings": {{"mood": "energetic", "gender_preference": "male"}},
103
+ "seo": {{"title": "Title in {language_desc}", "description": "...", "hashtags": [], "keywords": []}},
104
+ "total_duration_seconds": 60
105
+ }}
106
+
107
+ SCRIPT:
108
+ {script}
109
+ """
110
+
111
+
112
def generate_video_package(niche: str, style: str = "engaging and educational", language: str = "English", model: str = None) -> dict:
    """
    Generate the full video JSON package for a niche via OpenRouter.

    The requested ``model`` (if any) is tried first, followed by the remaining
    entries of ``MODELS``. The queue is cycled twice, so every model gets up
    to two attempts before the call gives up.

    Args:
        niche: Topic to build the video around.
        style: Tone hint interpolated into the user prompt.
        language: Key of ``LANGUAGE_MAP``; unknown values fall back to "English"
            in the prompt, but the given value is stored on the result as-is.
        model: Optional model id to try before the default pool.

    Returns:
        The parsed and schema-validated package, with a ``"language"`` key added.

    Raises:
        EnvironmentError: If ``OPENROUTER_API_KEY`` is not set.
        ValueError: If every attempt failed; the last failure is chained as
            the cause.
    """
    if not OPENROUTER_API_KEY:
        raise EnvironmentError("OPENROUTER_API_KEY is not set.")

    # Model rotation logic: start with requested, then try the pool
    model_queue = MODELS[:]
    if model:
        if model in model_queue:
            model_queue.remove(model)
        model_queue.insert(0, model)

    lang_desc = LANGUAGE_MAP.get(language, "English")
    sys_prompt = SYSTEM_PROMPT.format(language_desc=lang_desc)
    user_prompt = USER_TEMPLATE.format(niche=niche, style=style, language_desc=lang_desc)
    max_attempts = len(model_queue) * 2  # Try each model twice if needed
    last_error = None

    for attempt in range(max_attempts):
        active_model = model_queue[attempt % len(model_queue)]
        logger.info(f"[Agent] Attempt {attempt+1} — using {active_model}...")

        try:
            response = client.chat.completions.create(
                model=active_model,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.7,
                max_tokens=2500,
            )
            content = response.choices[0].message.content
            if not content:
                raise ValueError("Model returned empty content")
            raw = content.strip()

            # Strip markdown code fences if model wraps in ```json ... ```
            if raw.startswith("```"):
                raw = raw.split("```")[1]
                if raw.startswith("json"):
                    raw = raw[4:]
                raw = raw.strip()

            data = _robust_json_parse(raw)
            data["language"] = language  # Store for downstream use
            # Validate before announcing success so the log never claims a
            # good result for a package that is about to be rejected.
            _validate_schema(data)
            logger.info(f"[Agent] ✅ JSON generated ({language}) and repaired successfully.")
            return data

        except Exception as e:
            # Deliberately broad: any failure (API, parsing, validation)
            # simply moves on to the next model in the rotation.
            last_error = e
            logger.warning(f"[Agent] Model {active_model} failed: {e}")
            # No point in waiting when there is no further attempt to make.
            if attempt < max_attempts - 1:
                time.sleep(1.5)  # Short wait before next model

    raise ValueError(
        f"CRITICAL: All AI models failed or rate-limited. Last error: {last_error}"
    ) from last_error
171
+
172
+
173
def parse_script_into_video_package(script: str, language: str = "English", model: str = None) -> dict:
    """
    Take a raw user script and use AI to parse it into scene-by-scene JSON.

    When ``model`` is given only that model is used; otherwise the ``MODELS``
    pool is cycled. Either way the queue is traversed twice.

    Args:
        script: The raw script text to split into scenes.
        language: Key of ``LANGUAGE_MAP``; unknown values fall back to "English"
            in the prompt, but the given value is stored on the result as-is.
        model: Optional model id; when set, no other model is tried.

    Returns:
        The parsed and schema-validated package, with a ``"language"`` key added.

    Raises:
        EnvironmentError: If ``OPENROUTER_API_KEY`` is not set.
        ValueError: If every attempt failed; the last failure is chained as
            the cause.
    """
    if not OPENROUTER_API_KEY:
        raise EnvironmentError("OPENROUTER_API_KEY is not set.")

    model_queue = [model] if model else MODELS[:]
    lang_desc = LANGUAGE_MAP.get(language, "English")
    prompt = SCRIPT_TO_JSON_PROMPT.format(script=script, language_desc=lang_desc)
    max_attempts = len(model_queue) * 2
    last_error = None

    for attempt in range(max_attempts):
        active_model = model_queue[attempt % len(model_queue)]
        logger.info(f"[Agent] Attempt {attempt+1} — Parsing with {active_model}...")
        try:
            response = client.chat.completions.create(
                model=active_model,
                messages=[
                    {"role": "system", "content": "You are a specialized script parser. Output ONLY JSON."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                max_tokens=2500,
            )
            content = response.choices[0].message.content
            if not content:
                raise ValueError("Model returned empty content")
            raw = content.strip()

            # Strip markdown code fences if the model wraps its answer in them.
            if raw.startswith("```"):
                raw = raw.split("```")[1]
                if raw.startswith("json"):
                    raw = raw[4:]
                raw = raw.strip()

            data = _robust_json_parse(raw)
            data["language"] = language
            _validate_schema(data)
            logger.info(f"[Agent] ✅ Script parsed ({language}) and repaired.")
            return data
        except Exception as e:
            # Deliberately broad: any failure leads to a retry. Only the log
            # level/message differs between decode errors and everything else.
            if isinstance(e, json.JSONDecodeError):
                logger.warning(f"[Agent] Script parse failed on attempt {attempt+1}: {e}")
            else:
                logger.error(f"[Agent] Script parse API error: {e}")
            last_error = e
            # Exponential back-off for the first three failures only, and
            # never after the final attempt (nothing is left to wait for).
            if attempt < 3 and attempt < max_attempts - 1:
                time.sleep(2 ** attempt)

    raise ValueError(
        f"Failed to parse script into JSON. Last error: {last_error}"
    ) from last_error
226
+
227
+
228
def _robust_json_parse(raw: str) -> dict:
    """
    Extract a JSON object from model output, repairing it if it is truncated.

    Strategy:
      1. Decode the first JSON value starting at the first ``{``; anything
         after it (closing remarks, stray braces) is ignored.
      2. If that fails, take the text from the first ``{`` to the last ``}``
         (or to the end when no closing brace follows) and run it through
         ``_repair_json`` before decoding again.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{`` or cannot be decoded even
            after the repair attempt.
    """
    start_idx = raw.find('{')
    if start_idx == -1:
        raise ValueError("No JSON object found in response")

    # 1. Happy path: a complete object, possibly followed by trailing text.
    try:
        parsed, _ = json.JSONDecoder().raw_decode(raw, start_idx)
    except json.JSONDecodeError:
        parsed = None
    if parsed is not None:
        return parsed

    # 2. Attempt Auto-Repair for truncated JSON
    end_idx = raw.rfind('}')
    if end_idx > start_idx:
        candidate = raw[start_idx:end_idx + 1]
    else:
        # No closing brace after the opening one: keep everything from '{'.
        candidate = raw[start_idx:]

    repaired = _repair_json(candidate)
    try:
        return json.loads(repaired)
    except json.JSONDecodeError as e:
        logger.error(f"[Agent] JSON Repair failed: {e}\nRaw start: {candidate[:100]}...\nRaw end: {candidate[-100:]}")
        raise ValueError(f"Failed to parse or repair JSON: {e}") from e
255
+
256
def _repair_json(raw: str) -> str:
    """
    Best-effort repair of truncated JSON text.

    A single left-to-right scan tracks whether the text ends inside a string
    and which brackets/braces are still open. The repair then:
      * closes an unterminated string (dropping a dangling backslash first),
      * removes a trailing comma left by the truncation,
      * appends the missing closers in the correct (innermost-first) order.

    Text that is already well-formed is returned unchanged apart from
    surrounding whitespace. The result is not guaranteed to be valid JSON
    (e.g. a key with no value cannot be repaired); callers must still parse it.

    Args:
        raw: Possibly truncated JSON text.

    Returns:
        The repaired text.
    """
    raw = raw.strip()

    stack = []          # closers still owed, innermost last
    in_string = False
    escape = False      # True when the previous in-string char was a backslash

    for char in raw:
        if in_string:
            if escape:
                escape = False          # this char is escaped; consume it
            elif char == '\\':
                escape = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            stack.append('}')
        elif char == '[':
            stack.append(']')
        elif char == '}' or char == ']':
            # Only pop on a matching closer; mismatches are left for the parser.
            if stack and stack[-1] == char:
                stack.pop()

    if in_string:
        # Decide from the scan, not from counting quotes: escaped quotes
        # (\") inside strings would otherwise throw the parity off.
        if escape:
            raw = raw[:-1]  # a lone trailing backslash would escape our quote
        raw += '"'
    elif raw.endswith(','):
        # A comma directly before a closer is invalid JSON.
        raw = raw[:-1].rstrip()

    # Close everything in reverse order
    return raw + "".join(reversed(stack))
295
+
296
+
297
def _validate_schema(data: dict) -> None:
    """
    Basic schema validation of the agent output.

    Checks that the required top-level keys exist, that ``scenes`` is a
    non-empty list of objects each carrying the required scene keys, and that
    ``seo`` is an object carrying the required SEO keys.

    Args:
        data: Decoded agent output.

    Raises:
        KeyError: If a required top-level, scene, or SEO key is missing.
        ValueError: If ``data``, a scene, or ``seo`` is not a JSON object, or
            if ``scenes`` is not a non-empty list.
    """
    # Guard the container types explicitly: `key in "some string"` is a
    # substring test and `key in 5` raises TypeError, both of which would
    # produce misleading diagnostics.
    if not isinstance(data, dict):
        raise ValueError("Agent output must be a JSON object")

    required_top = ["niche", "scenes", "voiceover_settings", "seo", "total_duration_seconds"]
    for key in required_top:
        if key not in data:
            raise KeyError(f"Missing required key in agent output: '{key}'")

    scenes = data["scenes"]
    if not isinstance(scenes, list) or len(scenes) == 0:
        raise ValueError("'scenes' must be a non-empty list")

    scene_required = ["scene_number", "script_text", "on_screen_text",
                      "duration_seconds", "pexels_keywords"]
    for position, scene in enumerate(scenes, start=1):
        if not isinstance(scene, dict):
            raise ValueError(f"Scene {position} must be a JSON object")
        for key in scene_required:
            if key not in scene:
                raise KeyError(f"Scene {position} missing required key: '{key}'")

    seo = data["seo"]
    if not isinstance(seo, dict):
        raise ValueError("'seo' must be a JSON object")
    seo_required = ["title", "description", "hashtags", "keywords"]
    for key in seo_required:
        if key not in seo:
            raise KeyError(f"Missing SEO key: '{key}'")
318
+
319
+
320
+ # ── CLI Quick Test ────────────────────────────────────────
321
if __name__ == "__main__":
    import sys

    # The first command-line argument, when present, overrides the demo niche.
    niche = "AI Productivity Tools"
    if len(sys.argv) > 1:
        niche = sys.argv[1]

    print(f"\n🧠 Generating video package for: '{niche}'\n")
    result = generate_video_package(niche)

    # Dump the whole package, then a short summary.
    print(json.dumps(result, indent=2))
    scene_total = len(result['scenes'])
    print(f"\n✅ Total scenes: {scene_total}")
    print(f"⏱ Total duration: {result['total_duration_seconds']}s")
app.py ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — Gradio Web UI
5
+ Premium dark-mode interface with live status logs,
6
+ embedded video player, SEO panel, and download button.
7
+
8
+ Run: python app.py → http://localhost:7860
9
+ ─────────────────────────────────────────────────────────────
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import traceback
15
+ from pathlib import Path
16
+ from datetime import datetime
17
+
18
+ import gradio as gr
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+ logging.basicConfig(level=logging.INFO)
23
+
24
+ # ── Niche presets ─────────────────────────────────────────
25
+ NICHE_PRESETS = [
26
+ "AI Productivity Tools",
27
+ "Morning Routine Hacks",
28
+ "Passive Income Ideas",
29
+ "Fitness Motivation",
30
+ "Crypto & Web3 Trends",
31
+ "Mental Health Tips",
32
+ "Travel on a Budget",
33
+ "Cooking Life Hacks",
34
+ "Self-Improvement",
35
+ "Tech Gadgets Review",
36
+ ]
37
+
38
+ STYLE_OPTIONS = [
39
+ "engaging and educational",
40
+ "hype and energetic",
41
+ "calm and inspirational",
42
+ "funny and relatable",
43
+ "informative and authoritative",
44
+ ]
45
+
46
+ EDITING_STYLES = ["motion_graphics", "montage", "documentary", "social_media"]
47
+
48
+ EDITING_STYLE_META = {
49
+ "motion_graphics": {
50
+ "label": "🎨 Motion Graphics",
51
+ "desc": "Kinetic text animations, slow zoom, cinematic grade, pill captions",
52
+ "color": "#7c3aed",
53
+ },
54
+ "montage": {
55
+ "label": "⚡ Montage",
56
+ "desc": "Fast cuts, zoom punches, speed ramps, flash transitions, impact font",
57
+ "color": "#dc2626",
58
+ },
59
+ "documentary": {
60
+ "label": "🎥 Documentary",
61
+ "desc": "Ken Burns, crossfades, lower-thirds, slow burn, cold grade",
62
+ "color": "#0369a1",
63
+ },
64
+ "social_media": {
65
+ "label": "📲 Social Media",
66
+ "desc": "Karaoke captions, glitch hook, zoom punch, TikTok/Reels style",
67
+ "color": "#db2777",
68
+ },
69
+ }
70
+
71
+ # ── Voice Model Meta ─────────────────────────────────────
72
+ from voice_generator import PREMIUM_VOICES
73
+ VOICE_CHOICES = [(v[2], k) for k, v in PREMIUM_VOICES.items()]
74
+ VOICE_CHOICES.insert(0, ("Auto-Select (Recommended)", "auto"))
75
+
76
+ # ── Custom CSS ────────────────────────────────────────────
77
+ CUSTOM_CSS = """
78
+ /* ─── Base & backdrop ─── */
79
+ .gradio-container {
80
+ background: radial-gradient(circle at top right, #1a1a2e, #0a0a0f) !important;
81
+ min-height: 100vh;
82
+ font-family: 'Outfit', 'Inter', system-ui, sans-serif !important;
83
+ color: #f8fafc !important;
84
+ }
85
+
86
+ /* ─── Ultra Premium Header ─── */
87
+ #hero-header {
88
+ background: rgba(255, 255, 255, 0.02);
89
+ backdrop-filter: blur(20px);
90
+ border-radius: 28px;
91
+ padding: 60px 40px;
92
+ margin-bottom: 32px;
93
+ border: 1px solid rgba(168, 85, 247, 0.25);
94
+ box-shadow: 0 20px 50px rgba(0, 0, 0, 0.4), inset 0 0 20px rgba(168, 85, 247, 0.05);
95
+ position: relative;
96
+ overflow: hidden;
97
+ }
98
+
99
+ #hero-header::before {
100
+ content: '';
101
+ position: absolute;
102
+ top: -50%;
103
+ left: -20%;
104
+ width: 140%;
105
+ height: 140%;
106
+ background: radial-gradient(circle, rgba(124, 58, 237, 0.1) 0%, transparent 60%);
107
+ pointer-events: none;
108
+ }
109
+
110
+ #hero-header h1 {
111
+ font-size: 4rem !important;
112
+ font-weight: 900 !important;
113
+ background: linear-gradient(135deg, #fff 0%, #a78bfa 50%, #6366f1 100%);
114
+ -webkit-background-clip: text;
115
+ -webkit-text-fill-color: transparent;
116
+ margin-bottom: 12px !important;
117
+ letter-spacing: -2.5px !important;
118
+ line-height: 1 !important;
119
+ }
120
+
121
+ #hero-header p {
122
+ color: #94a3b8 !important;
123
+ font-size: 1.25rem !important;
124
+ max-width: 650px;
125
+ line-height: 1.6 !important;
126
+ margin-bottom: 24px !important;
127
+ }
128
+
129
+ .badge-premium {
130
+ display: inline-flex;
131
+ align-items: center;
132
+ background: linear-gradient(90deg, #a855f7, #6366f1);
133
+ color: white;
134
+ padding: 4px 12px;
135
+ border-radius: 100px;
136
+ font-size: 0.7rem;
137
+ font-weight: 700;
138
+ text-transform: uppercase;
139
+ letter-spacing: 1px;
140
+ margin-bottom: 20px;
141
+ box-shadow: 0 4px 12px rgba(168, 85, 247, 0.4);
142
+ }
143
+
144
+ .powered-by {
145
+ display: flex;
146
+ gap: 12px;
147
+ align-items: center;
148
+ font-size: 0.85rem !important;
149
+ color: #64748b !important;
150
+ }
151
+ .powered-by span {
152
+ width: 6px;
153
+ height: 6px;
154
+ background: #475569;
155
+ border-radius: 50%;
156
+ }
157
+ /* ─── Login Page Custom Styling ─── */
158
+ #login-container {
159
+ max-width: 1000px;
160
+ margin: 10vh auto;
161
+ padding: 60px 40px;
162
+ background: rgba(255, 255, 255, 0.02);
163
+ backdrop-filter: blur(25px);
164
+ border-radius: 32px;
165
+ border: 1px solid rgba(255, 255, 255, 0.1);
166
+ text-align: center;
167
+ box-shadow: 0 30px 60px rgba(0,0,0,0.5);
168
+ }
169
+
170
+ #login-container h1 {
171
+ font-size: 3rem !important;
172
+ font-weight: 900 !important;
173
+ margin-bottom: 20px !important;
174
+ background: linear-gradient(135deg, #fff 0%, #a78bfa 100%);
175
+ -webkit-background-clip: text;
176
+ -webkit-text-fill-color: transparent;
177
+ }
178
+
179
+ #login-container p {
180
+ color: #94a3b8 !important;
181
+ font-size: 1.2rem !important;
182
+ margin-bottom: 40px !important;
183
+ }
184
+
185
+ #google-btn {
186
+ display: inline-flex;
187
+ align-items: center;
188
+ justify-content: center;
189
+ background: white !important;
190
+ color: #1f2937 !important;
191
+ padding: 14px 28px !important;
192
+ border-radius: 12px !important;
193
+ font-weight: 700 !important;
194
+ font-size: 1.1rem !important;
195
+ cursor: pointer;
196
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
197
+ box-shadow: 0 4px 20px rgba(0,0,0,0.2);
198
+ border: none !important;
199
+ width: auto !important;
200
+ margin: 0 auto;
201
+ }
202
+
203
+ #google-btn:hover {
204
+ transform: translateY(-4px) scale(1.02);
205
+ box-shadow: 0 12px 30px rgba(168, 85, 247, 0.3);
206
+ }
207
+
208
+ #google-btn svg {
209
+ margin-right: 12px;
210
+ width: 24px;
211
+ height: 24px;
212
+ }
213
+
214
+ /* ─── Panel cards (Glassmorphism) ─── */
215
+ .panel-box, .gr-form, .gr-box {
216
+ background: rgba(255, 255, 255, 0.03) !important;
217
+ border: 1px solid rgba(255, 255, 255, 0.08) !important;
218
+ border-radius: 20px !important;
219
+ backdrop-filter: blur(12px) saturate(180%);
220
+ box-shadow: 0 8px 32px 0 rgba(0, 0, 0, 0.37);
221
+ }
222
+
223
+ /* ─── Generate button (Premium Gradient + Animation) ─── */
224
+ #gen-btn {
225
+ background: linear-gradient(135deg, #a855f7 0%, #6366f1 100%) !important;
226
+ border: none !important;
227
+ border-radius: 14px !important;
228
+ font-size: 18px !important;
229
+ font-weight: 800 !important;
230
+ color: white !important;
231
+ padding: 16px !important;
232
+ cursor: pointer !important;
233
+ transition: all 0.4s cubic-bezier(0.175, 0.885, 0.32, 1.275) !important;
234
+ box-shadow: 0 10px 25px -5px rgba(168, 85, 247, 0.4) !important;
235
+ text-transform: uppercase;
236
+ letter-spacing: 1px;
237
+ }
238
+ #gen-btn:hover {
239
+ transform: scale(1.02) translateY(-3px) !important;
240
+ box-shadow: 0 20px 35px -5px rgba(168, 85, 247, 0.6) !important;
241
+ filter: brightness(1.1);
242
+ }
243
+ #gen-btn:active {
244
+ transform: scale(0.98);
245
+ }
246
+
247
+ /* ─── Header Typography ─── */
248
+ #hero h1 {
249
+ font-size: 3.2rem !important;
250
+ font-weight: 900 !important;
251
+ background: linear-gradient(to right, #ffffff, #a78bfa, #818cf8);
252
+ -webkit-background-clip: text !important;
253
+ -webkit-text-fill-color: transparent !important;
254
+ letter-spacing: -1px;
255
+ margin-bottom: 8px;
256
+ }
257
+ #hero p {
258
+ color: #94a3b8 !important;
259
+ font-size: 1.1rem;
260
+ max-width: 600px;
261
+ margin: 0 auto;
262
+ }
263
+
264
+ /* ─── Log box (Terminal Aesthetic) ─── */
265
+ #log-box textarea {
266
+ background: rgba(0, 0, 0, 0.4) !important;
267
+ color: #34d399 !important;
268
+ font-family: 'Fira Code', 'JetBrains Mono', monospace !important;
269
+ border: 1px solid rgba(52, 211, 153, 0.2) !important;
270
+ border-radius: 12px !important;
271
+ line-height: 1.6;
272
+ }
273
+
274
+ /* ─── Inputs & Dropdowns ─── */
275
+ .gr-dropdown, .gr-textbox, .gr-radio {
276
+ background: rgba(255, 255, 255, 0.05) !important;
277
+ border: 1px solid rgba(255, 255, 255, 0.1) !important;
278
+ border-radius: 12px !important;
279
+ transition: border-color 0.3s ease;
280
+ }
281
+ .gr-dropdown:focus-within, .gr-textbox:focus-within {
282
+ border-color: #a78bfa !important;
283
+ }
284
+
285
+ /* ─── Video Output ─── */
286
+ video {
287
+ border: 4px solid rgba(167, 139, 250, 0.1) !important;
288
+ border-radius: 24px !important;
289
+ box-shadow: 0 0 60px rgba(124, 58, 237, 0.25) !important;
290
+ background: #000;
291
+ }
292
+
293
+ /* ─── Status Badge ─── */
294
+ #status-box {
295
+ background: rgba(167, 139, 250, 0.1) !important;
296
+ border: 1px solid rgba(167, 139, 250, 0.2) !important;
297
+ color: #c4b5fd !important;
298
+ font-weight: 700;
299
+ }
300
+ """
301
+
302
+
303
def _log(msg: str, log_state: list) -> str:
    """
    Record ``msg`` in ``log_state`` and return the whole log as one string.

    The message is prefixed with the current local time as ``[HH:MM:SS]`` and
    appended to ``log_state`` in place; the return value is every entry
    recorded so far, joined by newlines.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    log_state.append(f"[{stamp}] {msg}")
    return "\n".join(log_state)
308
+
309
+
310
def generate_video(
    input_mode: str,
    language: str,
    brain_model: str,
    niche_preset: str,
    niche_custom: str,
    full_script: str,
    voice_model: str,
    style: str,
    editing_style: str,
    dry_run: bool,
    skip_check: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Main generation function called by Gradio.

    This is a generator: after every pipeline phase it yields a 4-tuple
    ``(log_text, video_path_or_None, seo_markdown, status_text)`` so the UI
    can refresh the log box, video player, SEO panel and status box.

    Args:
        input_mode: "Topic" to generate a script from a niche; any other
            value treats ``full_script`` as the script to parse.
        language: Language forwarded to the script agent.
        brain_model: Model identifier forwarded to the script agent.
        niche_preset: Niche picked from the preset dropdown.
        niche_custom: Free-text niche; takes priority over the preset.
        full_script: User-supplied script (used when not in Topic mode).
        voice_model: Voice id, or "auto" to let the voice generator choose.
        style: Script tone descriptor forwarded to the agent.
        editing_style: Editing style key; unknown keys fall back to
            "social_media" at assembly time.
        dry_run: Stop after script generation and show a JSON preview.
        skip_check: Forwarded to the asset checker to bypass the VL check.
        progress: Gradio progress tracker.
    """
    log_state = []

    def log(msg):
        return _log(msg, log_state)

    # Guard against None so an untouched/cleared textbox cannot crash .strip()
    custom_topic = (niche_custom or "").strip()
    script_text = (full_script or "").strip()

    # ── Resolve Input ─────────────────────────────────────
    if input_mode == "Topic":
        niche = custom_topic or niche_preset
        if not niche:
            yield log("❌ Please select or enter a niche topic."), None, "", "❌ No niche entered"
            return
        script_input = None
    else:
        if not script_text:
            yield log("❌ Please enter a full script."), None, "", "❌ Script missing"
            return
        niche = "Custom Script"
        script_input = script_text

    editing_label = EDITING_STYLE_META.get(editing_style, {}).get("label", editing_style)
    yield log(f"🚀 Starting — mode: {input_mode} | lang: {language} | niche: '{niche}'"), None, "", "⏳ Starting..."

    # Kept outside the try so the error handlers can preserve SEO metadata
    # that was already generated instead of blanking the panel.
    seo_md = ""

    try:
        # ── Phase 1: AI Logic ─────────────────────────────
        if script_input:
            yield log(f"🧠 Phase 1 — Parsing script using {brain_model}..."), None, "", "🧠 Parsing..."
            progress(0.05, desc="Parsing script...")
            from agent import parse_script_into_video_package
            video_json = parse_script_into_video_package(script_input, language=language, model=brain_model)
        else:
            yield log(f"🧠 Phase 1 — Generating script using {brain_model}..."), None, "", "🧠 Generating..."
            progress(0.05, desc="AI Brain thinking...")
            from agent import generate_video_package
            video_json = generate_video_package(niche=niche, style=style, language=language, model=brain_model)

        scenes_count = len(video_json.get("scenes", []))
        total_dur = video_json.get("total_duration_seconds", 0)
        # "seo" may be present but null in the model output
        seo_md = _build_seo_md(video_json.get("seo") or {})

        yield (
            log(f"✅ Script ready — {scenes_count} scenes, ~{total_dur}s"),
            None,
            seo_md,
            "🧠 Script generated",
        )

        if dry_run:
            # ensure_ascii=False keeps Hindi/Hinglish text readable in the preview
            json_preview = json.dumps(video_json, indent=2, ensure_ascii=False)
            yield (
                log("🏁 Dry-run complete — JSON preview below\n\n```json\n" + json_preview[:800] + "\n...```"),
                None,
                seo_md,
                "✅ Dry-run complete",
            )
            return

        # ── Phase 2: Voiceover ──────────────────────────
        yield log("🎙 Phase 2 — Generating voiceovers..."), None, seo_md, "🎙 Generating voice..."
        progress(0.25, desc="Generating voiceovers...")

        from voice_generator import generate_voiceovers
        voice_id = None if voice_model == "auto" else voice_model
        audio_results = generate_voiceovers(video_json, voice_id=voice_id)
        total_audio = sum(r["duration"] for r in audio_results)
        yield (
            log(f"✅ {len(audio_results)} audio files — {total_audio:.1f}s total"),
            None,
            seo_md,
            "🎙 Voiceovers done",
        )

        # ── Phase 3: Media Fetching ────────────────────
        yield log("🎥 Phase 3 — Fetching media from Pexels..."), None, seo_md, "🎥 Fetching media..."
        progress(0.45, desc="Downloading clips...")

        from media_fetcher import fetch_all_media
        media_results = fetch_all_media(video_json)
        ok_media = sum(1 for m in media_results if m.get("path"))
        yield (
            log(f"✅ {ok_media}/{len(media_results)} media assets downloaded"),
            None,
            seo_md,
            "🎥 Media fetched",
        )

        # ── Phase 4: Asset QA ──────────────────────────
        yield log("🔍 Phase 4 — Running Nemotron-2 VL quality check..."), None, seo_md, "🔍 QA check..."
        progress(0.60, desc="Vision quality check...")

        from asset_checker import check_all_assets
        media_results = check_all_assets(media_results, video_json, skip_check=skip_check)
        approved = sum(1 for m in media_results if m.get("approved"))
        yield (
            log(f"✅ {approved}/{len(media_results)} assets passed quality gate"),
            None,
            seo_md,
            "🔍 QA done",
        )

        # ── Phase 5: Assemble ──────────────────────────
        yield log(f"🎬 Phase 5 — Assembling ({editing_label})..."), None, seo_md, "🎬 Assembling..."
        progress(0.75, desc="Rendering video...")

        from video_assembler import assemble_video, EDITING_STYLES as _ES
        safe_style = editing_style if editing_style in _ES else "social_media"
        video_path = assemble_video(
            video_json, audio_results, media_results,
            editing_style=safe_style,
        )

        progress(1.0, desc="Done!")
        yield (
            log(f"🏆 Video ready! → {video_path}"),
            video_path,
            seo_md,
            "✅ Done!",
        )

    except EnvironmentError as e:
        yield log(f"⚠️ Configuration Error: {e}"), None, seo_md, f"⚠️ Config: {str(e)[:40]}"
    except Exception as e:
        # UI boundary: surface every failure in the log instead of crashing the app
        error_msg = str(e)
        if "429" in error_msg:
            status_summary = "⚠️ Rate Limit (429)"
            # The tip must point at something the Brain Model dropdown actually offers
            user_advice = "\n\n💡 TIP: OpenRouter is busy! Try switching the 'Brain Model' to another model in the dropdown."
        else:
            status_summary = "❌ Failed"
            user_advice = ""

        tb = traceback.format_exc()
        yield (
            log(f"❌ Error: {error_msg}{user_advice}\n\nTraceback:\n{tb}"),
            None,
            seo_md,
            status_summary,
        )
461
+
462
+
463
def _build_seo_md(seo: dict) -> str:
    """Render the SEO metadata dict as a Markdown panel.

    Args:
        seo: Mapping that may contain "title", "description", "hashtags"
            and "keywords". Values come from LLM output, so any of them may
            be missing, ``None``, or a bare string instead of a list.

    Returns:
        Markdown text, or an empty string when ``seo`` is empty or not a dict.
    """
    if not seo or not isinstance(seo, dict):
        return ""

    def _as_list(value) -> list:
        # None -> [], "tag" -> ["tag"] (never split a string into characters)
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        return [str(item) for item in value]

    title = seo.get("title") or ""
    desc = seo.get("description") or ""
    hashtags = " ".join(_as_list(seo.get("hashtags")))
    keywords = ", ".join(_as_list(seo.get("keywords")))
    return f"""### 📊 SEO Metadata

**Title:** {title}

**Description:**
{desc}

**Hashtags:** {hashtags}

**Keywords:** `{keywords}`
"""
481
+
482
+
483
+ # ── Build UI ──────────────────────────────────────────────
484
+ with gr.Blocks(
485
+ title="AutoShorts — Autonomous Video Engine",
486
+ ) as demo:
487
+
488
+ logged_in = gr.State(False)
489
+
490
+ def on_login():
491
+ return gr.update(visible=False), gr.update(visible=True), True
492
+
493
+ # ── Login Page ───────────────────────────────────────
494
+ with gr.Column(elem_id="login-container", visible=True) as login_view:
495
+ gr.HTML("""
496
+ <div>
497
+ <div style="font-size: 4rem; margin-bottom: 20px;">🎬</div>
498
+ <h1>Generate short videos just by giving the topic and enter</h1>
499
+ <p>Welcome to <strong>AutoShorts</strong>. The elite autonomous engine for viral content.</p>
500
+ </div>
501
+ """)
502
+
503
+ login_btn = gr.Button(
504
+ "Continue with Google",
505
+ elem_id="google-btn",
506
+ icon="https://www.google.com/favicon.ico" # Fallback icon
507
+ )
508
+
509
+ gr.HTML("""
510
+ <div style="margin-top: 32px; color: #475569; font-size: 0.85rem;">
511
+ By continuing, you agree to AutoShorts' Terms of Service and Data Policy.
512
+ </div>
513
+ """)
514
+
515
+ # ── Main App Container ───────────────────────────────
516
+ with gr.Column(visible=False) as main_app_view:
517
+ gr.HTML("""
518
+ <div>
519
+ <div class="badge-premium">v2.0 • Ultra-Performance Edition</div>
520
+ <h1>AutoShorts</h1>
521
+ <p>The elite autonomous engine for viral short-form content. Transform any niche into high-retention videos in seconds.</p>
522
+ <div class="powered-by">
523
+ <strong>Llama 3.3</strong> <span></span>
524
+ <strong>Google TTS</strong> <span></span>
525
+ <strong>Pexels</strong> <span></span>
526
+ <strong>MoviePy 2.x</strong>
527
+ </div>
528
+ </div>
529
+ """)
530
+
531
+ with gr.Row(equal_height=False):
532
+ # ── Left Panel: Controls ──────────────────────────
533
+ with gr.Column(scale=1, min_width=320):
534
+ gr.HTML('<div style="color:#a78bfa; font-weight:700; font-size:16px; margin-bottom:12px;">⚡ Configuration</div>')
535
+
536
+ with gr.Row():
537
+ input_toggle = gr.Radio(choices=["Topic", "Full Script"], value="Topic", label="Input Mode", interactive=True)
538
+ language_sel = gr.Dropdown(choices=["English", "Hindi", "Hinglish"], value="English", label="Language", interactive=True)
539
+
540
+ brain_model_sel = gr.Dropdown(
541
+ choices=[
542
+ ("Llama 3.3 70B (High Logic)", "meta-llama/llama-3.3-70b-instruct:free"),
543
+ ("Mistral Small 3.1", "mistralai/mistral-small-3.1-24b-instruct:free"),
544
+ ("Gemma 3 27B", "google/gemma-3-27b-it:free"),
545
+ ("MiniMax M2.5 (Fastest)", "minimax/minimax-m2.5:free"),
546
+ ("Nemotron-3 Super", "nvidia/nemotron-3-super-120b-a12b:free")
547
+ ],
548
+ value="meta-llama/llama-3.3-70b-instruct:free",
549
+ label="Brain Model (AI Engine)",
550
+ interactive=True,
551
+ )
552
+
553
+ with gr.Column(visible=True) as topic_container:
554
+ niche_preset = gr.Dropdown(choices=NICHE_PRESETS, label="Select a Niche", value=NICHE_PRESETS[0], interactive=True)
555
+ niche_custom = gr.Textbox(placeholder='Or type your own: "Mindset Shifts"', label="Custom Topic")
556
+ style_select = gr.Dropdown(choices=STYLE_OPTIONS, label="Script Tone / Vibe", value=STYLE_OPTIONS[0], interactive=True)
557
+
558
+ with gr.Column(visible=False) as script_container:
559
+ full_script_input = gr.Textbox(placeholder="Paste your full video script here...", label="Full Video Script", lines=8)
560
+
561
+ gr.HTML('<div style="color:#a78bfa; font-weight:700; font-size:13px; margin:14px 0 8px;">🎙 Voice Selection</div>')
562
+ voice_select = gr.Dropdown(choices=VOICE_CHOICES, value="auto", label="Voice Model (Humanized)", interactive=True)
563
+
564
+ gr.HTML('<div style="color:#a78bfa; font-weight:700; font-size:13px; margin:14px 0 8px;">🎬 Editing Style</div>')
565
+ editing_style_radio = gr.Radio(
566
+ choices=[
567
+ (f"🎨 Motion Graphics", "motion_graphics"),
568
+ (f"⚡ Montage", "montage"),
569
+ (f"🎥 Documentary", "documentary"),
570
+ (f"📲 Social Media", "social_media"),
571
+ ],
572
+ value="social_media",
573
+ label="",
574
+ interactive=True,
575
+ )
576
+
577
+ with gr.Row():
578
+ dry_run_chk = gr.Checkbox(label="🧪 Dry-run (JSON only)", value=False)
579
+ skip_check_chk = gr.Checkbox(label="⚡ Skip VL QA", value=False)
580
+
581
+ gen_btn = gr.Button("🚀 Generate Video", elem_id="gen-btn", variant="primary")
582
+ status_box = gr.Textbox(value="Idle", label="Status", interactive=False, elem_id="status-box")
583
+ seo_output = gr.Markdown(value="*SEO metadata will appear here.*", elem_id="seo-panel", label="SEO Metadata")
584
+
585
+ # ── Right Panel: Output ───────────────────────────
586
+ with gr.Column(scale=2, min_width=500):
587
+ gr.HTML('<div style="color:#a78bfa; font-weight:700; font-size:16px; margin-bottom:8px;">📺 Output</div>')
588
+ video_out = gr.Video(label="Final Video (9:16)", height=600, interactive=False)
589
+ gr.HTML('<div style="color:#64748b; font-size:13px; margin-top:16px;">📋 Live Log</div>')
590
+ log_out = gr.Textbox(label="", lines=14, interactive=False, max_lines=20, elem_id="log-box")
591
+
592
+ # ── Footer ────────────────────────────────────────────
593
+ gr.HTML("""<div style="text-align:center; padding:20px; color:#334155; font-size:12px; margin-top:16px;">AutoShorts · Powered by OpenRouter · Pexels · MoviePy</div>""")
594
+
595
+ # ── UI Logic ──────────────────────────────────────────
596
+ def filter_voices_by_lang(lang):
597
+ options = [("Auto-Select (Recommended)", "auto")]
598
+ for k, v in PREMIUM_VOICES.items():
599
+ is_hindi = "hindi" in k
600
+ if lang in ["Hindi", "Hinglish"]:
601
+ if is_hindi: options.append((v[2], k))
602
+ else:
603
+ if not is_hindi: options.append((v[2], k))
604
+ return gr.update(choices=options, value="auto")
605
+
606
+ language_sel.change(fn=filter_voices_by_lang, inputs=[language_sel], outputs=[voice_select])
607
+
608
+ def toggle_input_mode(mode):
609
+ if mode == "Topic": return gr.update(visible=True), gr.update(visible=False)
610
+ return gr.update(visible=False), gr.update(visible=True)
611
+
612
+ input_toggle.change(fn=toggle_input_mode, inputs=[input_toggle], outputs=[topic_container, script_container])
613
+
614
+ gen_btn.click(
615
+ fn=generate_video,
616
+ inputs=[input_toggle, language_sel, brain_model_sel, niche_preset, niche_custom, full_script_input, voice_select, style_select, editing_style_radio, dry_run_chk, skip_check_chk],
617
+ outputs=[log_out, video_out, seo_output, status_box],
618
+ )
619
+
620
+ # ── Login Logic ───────────────────────────────────────
621
+ login_btn.click(
622
+ on_login,
623
+ outputs=[login_view, main_app_view, logged_in]
624
+ )
625
+
626
+
627
+ if __name__ == "__main__":
628
+ print("\n[INFO] Launching Autonomous Video Engine UI...")
629
+ print(" URL: http://localhost:7860\n")
630
+ demo.launch(
631
+ server_name="0.0.0.0",
632
+ server_port=7860,
633
+ share=False,
634
+ show_error=True,
635
+ css=CUSTOM_CSS,
636
+ theme=gr.themes.Base(
637
+ primary_hue="violet",
638
+ secondary_hue="indigo",
639
+ neutral_hue="slate",
640
+ font=gr.themes.GoogleFont("Inter"),
641
+ )
642
+ )
asset_checker.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ asset_checker.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — Visual Quality Gate
5
+ Uses Nemotron-2 VL (vision-language model) via NVIDIA NIM
6
+ to score each downloaded media asset for quality & relevance.
7
+ Assets scoring < 6/10 are rejected and trigger a re-fetch.
8
+ ─────────────────────────────────────────────────────────────
9
+ """
10
+
11
+ import os
12
+ import json
13
+ import base64
14
+ import logging
15
+ from pathlib import Path
16
+ from openai import OpenAI
17
+ from PIL import Image
18
+ from io import BytesIO
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+ logger = logging.getLogger(__name__)
23
+
24
+ NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY", "")
25
+ SCORE_THRESHOLD = 6.0 # assets below this score are rejected
26
+ MAX_REFETCH_ATTEMPTS = 2
27
+
28
+ # NVIDIA NIM endpoint for Nemotron-2 VL
29
+ nvidia_client = OpenAI(
30
+ base_url="https://integrate.api.nvidia.com/v1",
31
+ api_key=NVIDIA_API_KEY,
32
+ )
33
+ VL_MODEL = "nvidia/nemotron-4-340b-instruct" # vision-capable endpoint
34
+
35
+
36
def _extract_frame(media_path: str) -> str:
    """
    Extract a thumbnail from a video file or load an image, shrink it,
    and return it as a base64-encoded JPEG string.

    Args:
        media_path: Path to a video (.mp4/.mov/.avi/.webm) or an image file.

    Returns:
        Base64 text of the JPEG thumbnail, or "" when the media could not
        be read (the failure is logged as a warning).
    """
    path = Path(media_path)
    if path.suffix.lower() in (".mp4", ".mov", ".avi", ".webm"):
        try:
            from moviepy import VideoFileClip
            clip = VideoFileClip(str(path))
            try:
                # Grab frame at 20% into the video (avoids black intro frames)
                frame = clip.get_frame(clip.duration * 0.2)
            finally:
                # Always release the reader, even if get_frame fails
                clip.close()
            img = Image.fromarray(frame)
        except Exception as e:
            logger.warning(f"[Checker] Could not extract video frame: {e}")
            return ""
    else:
        try:
            # convert() loads the pixels into a new image, so the file handle
            # can be closed as soon as the with-block exits
            with Image.open(path) as opened:
                img = opened.convert("RGB")
        except Exception as e:
            logger.warning(f"[Checker] Could not open image: {e}")
            return ""

    # Resize to max 512px on longest side for API efficiency
    img.thumbnail((512, 512), Image.LANCZOS)
    if img.mode != "RGB":
        img = img.convert("RGB")
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=85)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
68
+
69
+
70
+ def _build_check_prompt(topic: str) -> str:
71
+ return (
72
+ f"You are a strict video quality reviewer. Look at this image (a frame from a short-form video clip). "
73
+ f"Rate it on two criteria for the topic: '{topic}'.\n\n"
74
+ f"1. Visual Quality (lighting, sharpness, professional look): 1-10\n"
75
+ f"2. Topic Relevance (does it visually match '{topic}'?): 1-10\n\n"
76
+ f"Reply ONLY with valid JSON in this format:\n"
77
+ f'{{ "quality_score": 7, "relevance_score": 8, "overall": 7.5, "reject": false, "reason": "brief reason" }}'
78
+ )
79
+
80
+
81
def _parse_vl_reply(raw: str) -> dict:
    """Extract the JSON object from a VL reply, ignoring fences or prose around it."""
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON object found in VL reply", raw, 0)
    return json.loads(raw[start:end + 1])


def _overall_score(result: dict) -> float:
    """Return 'overall', else the mean of the two sub-scores, else a neutral 7.0."""
    try:
        return float(result["overall"])
    except (KeyError, TypeError, ValueError):
        pass
    try:
        return (float(result["quality_score"]) + float(result["relevance_score"])) / 2
    except (KeyError, TypeError, ValueError):
        return 7.0


def check_asset(media_path: str, topic: str, skip_check: bool = False) -> dict:
    """
    Run the vision-language quality check on a downloaded media asset.

    The check is best-effort: any failure to read the media, reach the API
    or parse the reply auto-approves the asset rather than blocking the
    pipeline.

    Args:
        media_path: Path to the downloaded video/image file
        topic: The scene topic/keyword for relevance scoring
        skip_check: If True, skip the VL check and approve automatically

    Returns:
        Dict: {"approved": bool, "overall": float, "reason": str}
    """
    if skip_check:
        # Explicit skip: no warning needed, the caller asked for it
        return {"approved": True, "overall": 10.0, "reason": "Check skipped"}
    if not NVIDIA_API_KEY:
        logger.warning("[Checker] NVIDIA_API_KEY not set — auto-approving all assets.")
        return {"approved": True, "overall": 10.0, "reason": "Check skipped"}

    frame_b64 = _extract_frame(media_path)
    if not frame_b64:
        return {"approved": True, "overall": 7.0, "reason": "Could not extract frame — auto-approved"}

    prompt = _build_check_prompt(topic)
    image_url = f"data:image/jpeg;base64,{frame_b64}"

    try:
        response = nvidia_client.chat.completions.create(
            model=VL_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
            temperature=0.1,
            max_tokens=200,
        )
        # content may be None; treat it as an unparseable (empty) reply
        raw = (response.choices[0].message.content or "").strip()

        result = _parse_vl_reply(raw)
        overall = _overall_score(result)
        approved = overall >= SCORE_THRESHOLD
        logger.info(
            f"[Checker] {Path(media_path).name} → score: {overall:.1f}/10 "
            f"({'✅ approved' if approved else '❌ rejected'})"
        )
        return {
            "approved": approved,
            "overall": overall,
            "reason": result.get("reason", ""),
        }

    except json.JSONDecodeError as e:
        logger.warning(f"[Checker] JSON parse error from VL response: {e} — auto-approving")
        return {"approved": True, "overall": 7.0, "reason": "Parse error — auto-approved"}

    except Exception as e:
        # Deliberate best-effort: an API outage must not stop video generation
        logger.warning(f"[Checker] VL API error: {e} — auto-approving")
        return {"approved": True, "overall": 7.0, "reason": f"API error: {str(e)[:60]}"}
149
+
150
+
151
+ def check_all_assets(media_results: list[dict], video_json: dict,
152
+ skip_check: bool = False) -> list[dict]:
153
+ """
154
+ Quality-check all fetched media assets. Marks each with approved status.
155
+
156
+ Args:
157
+ media_results: Output of media_fetcher.fetch_all_media()
158
+ video_json: Original video package JSON (for scene keywords as topic)
159
+ skip_check: Skip VL check entirely
160
+
161
+ Returns:
162
+ Same list with 'approved', 'score', 'check_reason' added to each item
163
+ """
164
+ scene_map = {s["scene_number"]: s for s in video_json["scenes"]}
165
+
166
+ for item in media_results:
167
+ if not item.get("path"):
168
+ item.update({"approved": False, "score": 0.0, "check_reason": "No file"})
169
+ continue
170
+
171
+ scene = scene_map.get(item["scene_number"], {})
172
+ topic = scene.get("visual_description", "") or ", ".join(
173
+ scene.get("pexels_keywords", ["video"])
174
+ )
175
+
176
+ result = check_asset(item["path"], topic, skip_check=skip_check)
177
+ item["approved"] = result["approved"]
178
+ item["score"] = result["overall"]
179
+ item["check_reason"] = result["reason"]
180
+
181
+ approved_count = sum(1 for m in media_results if m.get("approved"))
182
+ logger.info(f"[Checker] ✅ {approved_count}/{len(media_results)} assets passed QA.")
183
+ return media_results
184
+
185
+
186
+ # ── CLI Test ──────────────────────────────────────────────
187
+ if __name__ == "__main__":
188
+ import argparse
189
+ parser = argparse.ArgumentParser()
190
+ parser.add_argument("--image", required=True, help="Path to image or video file")
191
+ parser.add_argument("--topic", default="AI technology future", help="Topic for relevance check")
192
+ args = parser.parse_args()
193
+
194
+ logging.basicConfig(level=logging.INFO)
195
+ print(f"\n🔍 Checking asset: {args.image}")
196
+ print(f" Topic: '{args.topic}'\n")
197
+ result = check_asset(args.image, args.topic)
198
+ print(json.dumps(result, indent=2))
debug_assembly.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from pathlib import Path
4
+ from video_assembler import assemble_video
5
+
6
+ logging.basicConfig(level=logging.INFO)
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Load the JSON
10
+ json_path = "output/ai_tools_20260322_095831.json"
11
+ with open(json_path, "r") as f:
12
+ video_json = json.load(f)
13
+
14
+ # Mock audio results (first 3 scenes only for speed)
15
+ audio_results = [
16
+ {"scene_number": 1, "path": "assets/audio/scene_01.mp3", "duration": 4.6},
17
+ {"scene_number": 2, "path": "assets/audio/scene_02.mp3", "duration": 3.1},
18
+ {"scene_number": 3, "path": "assets/audio/scene_03.mp3", "duration": 3.5},
19
+ ]
20
+
21
+ # Mock media results (first 3 scenes)
22
+ # I'll find existing clips in assets/clips
23
+ clips_dir = Path("assets/clips")
24
+ media_results = []
25
+ for i in range(1, 4):
26
+ found = list(clips_dir.glob(f"scene_{i:02d}_*.mp4"))
27
+ if found:
28
+ media_results.append({
29
+ "scene_number": i,
30
+ "path": str(found[0]),
31
+ "type": "video",
32
+ "approved": True
33
+ })
34
+
35
+ # Trim JSON to 3 scenes
36
+ video_json["scenes"] = video_json["scenes"][:3]
37
+
38
+ logger.info("Starting standalone assembly test...")
39
+ try:
40
+ output_path = assemble_video(
41
+ video_json=video_json,
42
+ audio_results=audio_results,
43
+ media_results=media_results,
44
+ editing_style="social_media",
45
+ output_filename="assembly_verify_test.mp4"
46
+ )
47
+ print(f"\nSUCCESS: Video assembled at {output_path}")
48
+ except Exception as e:
49
+ import traceback
50
+ print(f"\nFAILURE: Assembly failed!")
51
+ traceback.print_exc()
main.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — CLI Orchestrator
5
+ Wires all layers together: Agent → Voice → Media → QA → Assemble
6
+
7
+ Usage:
8
+ python main.py --niche "AI Tools"
9
+ python main.py --niche "Motivation" --dry-run
10
+ python main.py --niche "Fitness" --skip-check
11
+ python main.py --niche "Fitness" --editing-style montage
12
+ python main.py --niche "Finance" --editing-style documentary
13
+ ─────────────────────────────────────────────────────────────"""
14
+
15
+ import argparse
16
+ import json
17
+ import logging
18
+ import sys
19
+ import time
20
+ from pathlib import Path
21
+ from datetime import datetime
22
+
23
+ # ── Logging & Encoding setup ──────────────────────────────
24
+ import sys
25
+ if sys.stdout.encoding != 'utf-8':
26
+ try:
27
+ sys.stdout.reconfigure(encoding='utf-8')
28
+ except AttributeError:
29
+ # Fallback for older Python versions
30
+ import codecs
31
+ sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
32
+
33
+ logging.basicConfig(
34
+ level=logging.INFO,
35
+ format="%(asctime)s | %(levelname)-8s | %(message)s",
36
+ datefmt="%H:%M:%S",
37
+ )
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ def banner():
42
+ print("""
43
+ +----------------------------------------------------------+
44
+ | * A U T O S H O R T S * |
45
+ | Powered by Llama 3.3 + MoviePy + Pexels |
46
+ +----------------------------------------------------------+
47
+ """)
48
+
49
+
50
+ EDITING_STYLES = ["motion_graphics", "montage", "documentary", "social_media"]
51
+
52
+
53
def run_pipeline(
    niche: str,
    style: str = "engaging and educational",
    editing_style: str = "social_media",
    dry_run: bool = False,
    skip_check: bool = False,
) -> dict:
    """
    Full pipeline execution: script → voiceover → media → QA → assembly.

    Args:
        niche: Topic niche used to generate the script.
        style: Script tone descriptor passed to the agent.
        editing_style: One of EDITING_STYLES; unknown values fall back to
            "social_media" with a warning.
        dry_run: Stop after the script JSON has been generated and saved.
        skip_check: Forwarded to the asset checker to bypass the VL check.

    Returns:
        dict with keys: json_data, audio_results, media_results, video_path
        (only json_data is present after a dry run)
    """
    import re  # local import: only needed for the output filename slug

    if editing_style not in EDITING_STYLES:
        logger.warning(f"Unknown editing style '{editing_style}' — defaulting to social_media")
        editing_style = "social_media"

    result = {}
    start = time.time()

    # ── Phase 1: AI Brain ─────────────────────────────────
    print(f"\n{'═'*55}")
    print(f" 🧠 Phase 1 — Generating video script via Nemotron-3")
    print(f" 🎬 Editing Style: {editing_style.replace('_', ' ').title()}")
    print(f"{'═'*55}")

    from agent import generate_video_package
    video_json = generate_video_package(niche=niche, style=style)
    result["json_data"] = video_json

    # Save JSON. The niche is free text, so strip path separators and other
    # characters that are unsafe in filenames ("AI/ML Tools" -> "ai_ml_tools").
    Path("output").mkdir(exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    slug = re.sub(r"[^\w\-]+", "_", niche.lower()).strip("_") or "video"
    json_path = Path("output") / f"{slug}_{ts}.json"
    # ensure_ascii=False keeps non-English scripts readable in the saved file
    json_path.write_text(json.dumps(video_json, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\n 📋 Script saved → {json_path}")

    # Preview
    print(f"\n Niche : {video_json.get('niche')}")
    print(f" Scenes : {len(video_json.get('scenes', []))}")
    print(f" Duration : {video_json.get('total_duration_seconds')}s")
    print(f" SEO Title : {video_json.get('seo', {}).get('title', '')}")

    if dry_run:
        print("\n 🏁 Dry-run mode — stopping after JSON generation.\n")
        print(json.dumps(video_json, indent=2, ensure_ascii=False))
        return result

    # ── Phase 2: Voiceover ────────────────────────────────
    print(f"\n{'═'*55}")
    print(f" 🎙 Phase 2 — Generating voiceovers (Google TTS)")
    print(f"{'═'*55}")

    from voice_generator import generate_voiceovers
    audio_results = generate_voiceovers(video_json)
    result["audio_results"] = audio_results

    total_audio = sum(r["duration"] for r in audio_results)
    print(f"\n ✅ {len(audio_results)} audio files generated ({total_audio:.1f}s total)")

    # ── Phase 3: Media Fetching ───────────────────────────
    print(f"\n{'═'*55}")
    print(f" 🎥 Phase 3 — Fetching media from Pexels")
    print(f"{'═'*55}")

    from media_fetcher import fetch_all_media
    media_results = fetch_all_media(video_json)

    # ── Phase 4: Asset QA ─────────────────────────────────
    print(f"\n{'═'*55}")
    print(f" 🔍 Phase 4 — Running Nemotron-2 VL asset check")
    print(f"{'═'*55}")

    from asset_checker import check_all_assets
    media_results = check_all_assets(media_results, video_json, skip_check=skip_check)
    result["media_results"] = media_results

    approved = sum(1 for m in media_results if m.get("approved"))
    print(f"\n ✅ {approved}/{len(media_results)} assets passed QA")

    # ── Phase 5: Video Assembly ───────────────────────────
    print(f"\n{'═'*55}")
    print(f" 🎬 Phase 5 — Assembling final video (MoviePy)")
    print(f"{'═'*55}")

    from video_assembler import assemble_video
    video_path = assemble_video(
        video_json, audio_results, media_results,
        editing_style=editing_style,
    )
    result["video_path"] = video_path

    elapsed = time.time() - start
    # str() so the summary also works when the assembler returns a Path
    print(f"""
╔══════════════════════════════════════════════════════════╗
║ 🏆 DONE!                                                 ║
╠══════════════════════════════════════════════════════════╣
║ 📹 Video : {str(video_path)[:46]:<46} ║
║ 📋 JSON  : {str(json_path)[:46]:<46} ║
║ ⏱ Time  : {elapsed:.1f}s
╚══════════════════════════════════════════════════════════╝
""")
    return result
156
+
157
+
158
+ # ── Main ──────────────────────────────────────────────────
159
+ if __name__ == "__main__":
160
+ banner()
161
+ parser = argparse.ArgumentParser(
162
+ description="Autonomous Short-Form Video Engine"
163
+ )
164
+ parser.add_argument("--niche", required=True, help='Topic niche, e.g. "AI Tools"')
165
+ parser.add_argument("--style", default="engaging and educational",
166
+ help="AI script tone/style descriptor")
167
+ parser.add_argument(
168
+ "--editing-style",
169
+ choices=EDITING_STYLES,
170
+ default="social_media",
171
+ help="Video editing style (default: social_media)",
172
+ )
173
+ parser.add_argument("--dry-run", action="store_true",
174
+ help="Generate JSON only — no media/voice/video")
175
+ parser.add_argument("--skip-check", action="store_true",
176
+ help="Skip Nemotron-2 VL asset quality check")
177
+ args = parser.parse_args()
178
+
179
+ try:
180
+ run_pipeline(
181
+ niche=args.niche,
182
+ style=args.style,
183
+ editing_style=args.editing_style,
184
+ dry_run=args.dry_run,
185
+ skip_check=args.skip_check,
186
+ )
187
+ except KeyboardInterrupt:
188
+ print("\n\n⚠️ Interrupted by user.")
189
+ sys.exit(0)
190
+ except Exception as e:
191
+ logger.error(f"Pipeline failed: {e}", exc_info=True)
192
+ sys.exit(1)
media_fetcher.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ media_fetcher.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — Media Layer
5
+ Downloads portrait video clips from Pexels API per scene.
6
+ Falls back to images if no video is found.
7
+ Caches downloads to avoid re-fetching.
8
+ ─────────────────────────────────────────────────────────────
9
+ """
10
+
11
+ import os
12
+ import re
13
+ import hashlib
14
+ import logging
15
+ import requests
16
+ from pathlib import Path
17
+ from dotenv import load_dotenv
18
+ from tqdm import tqdm
19
+
20
+ load_dotenv()
21
+ logger = logging.getLogger(__name__)
22
+
23
+ PEXELS_API_KEY = os.getenv("PEXELS_API_KEY", "")
24
+ CLIPS_DIR = Path("assets/clips")
25
+ CLIPS_DIR.mkdir(parents=True, exist_ok=True)
26
+
27
+ PEXELS_VIDEOS_URL = "https://api.pexels.com/videos/search"
28
+ PEXELS_PHOTOS_URL = "https://api.pexels.com/v1/search"
29
+
30
+ HEADERS = {"Authorization": PEXELS_API_KEY}
31
+
32
+ # Target orientation: portrait (9:16)
33
+ MIN_PORTRAIT_RATIO = 0.45 # width/height < this → portrait
34
+ PREFERRED_MIN_QUALITY = 720 # minimum height in pixels
35
+
36
+
37
def _cache_key(keywords: list[str]) -> str:
    """Return a short, order-insensitive, case-insensitive cache id for a keyword list.

    The key is the first 10 hex characters of the MD5 of the sorted keywords
    joined with "_" and lower-cased. MD5 is used purely as a filename
    fingerprint, so it is flagged ``usedforsecurity=False`` to keep working
    on FIPS-restricted Python builds; the digest itself is unchanged.
    """
    joined = "_".join(sorted(keywords)).lower()
    return hashlib.md5(joined.encode(), usedforsecurity=False).hexdigest()[:10]
40
+
41
+
42
+ def _clean_filename(name: str) -> str:
43
+ return re.sub(r"[^\w\-.]", "_", name)
44
+
45
+
46
def _download_file(url: str, dest: Path, desc: str = "") -> bool:
    """Stream-download a file with progress bar.

    Args:
        url: Remote file URL.
        dest: Local path to write to.
        desc: Label shown on the progress bar.

    Returns:
        True on success. On any failure the error is logged, a partially
        written ``dest`` is removed, and False is returned.
    """
    try:
        # The with-block releases the streamed connection on success and on error
        with requests.get(url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0))
            with open(dest, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True,
                desc=desc, ncols=70, leave=False
            ) as bar:
                for chunk in resp.iter_content(chunk_size=8192):
                    if not chunk:
                        continue  # skip empty chunks
                    f.write(chunk)
                    bar.update(len(chunk))
        return True
    except Exception as e:
        logger.error(f"[Media] Download failed for {url}: {e}")
        # Never leave a truncated file behind: the cache lookup would reuse it
        dest.unlink(missing_ok=True)
        return False
64
+
65
+
66
+ def _fetch_video(keywords: list[str], scene_num: int) -> dict | None:
67
+ """Search Pexels for a portrait video clip matching keywords."""
68
+ if not PEXELS_API_KEY:
69
+ raise EnvironmentError("PEXELS_API_KEY not set.")
70
+
71
+ cache_key = _cache_key(keywords)
72
+ cached = list(CLIPS_DIR.glob(f"scene_{scene_num:02d}_{cache_key}.*"))
73
+ if cached:
74
+ logger.info(f"[Media] Scene {scene_num} cache hit → {cached[0].name}")
75
+ return {"path": str(cached[0]), "type": "video" if cached[0].suffix == ".mp4" else "image"}
76
+
77
+ # Try with up to 3 keywords, then fallback
78
+ search_keywords = keywords[:3]
79
+ while len(search_keywords) > 0:
80
+ query = " ".join(search_keywords)
81
+ params = {
82
+ "query": query,
83
+ "per_page": 15,
84
+ "orientation": "portrait",
85
+ "size": "medium",
86
+ }
87
+
88
+ try:
89
+ resp = requests.get(PEXELS_VIDEOS_URL, headers=HEADERS, params=params, timeout=15)
90
+ resp.raise_for_status()
91
+ data = resp.json()
92
+ videos = data.get("videos", [])
93
+ if videos:
94
+ # Process videos... (rest of the logic)
95
+ for video in videos:
96
+ w, h = video.get("width", 0), video.get("height", 0)
97
+ if w == 0 or h == 0: continue
98
+ ratio = w / h
99
+ if ratio > MIN_PORTRAIT_RATIO * 2: continue
100
+
101
+ files = video.get("video_files", [])
102
+ files_sorted = sorted(
103
+ [f for f in files if f.get("quality") in ("hd", "sd")],
104
+ key=lambda x: x.get("height", 0),
105
+ reverse=True,
106
+ )
107
+ if not files_sorted: continue
108
+
109
+ chosen = files_sorted[0]
110
+ if chosen.get("height", 0) < PREFERRED_MIN_QUALITY: continue
111
+
112
+ url = chosen["link"]
113
+ dest = CLIPS_DIR / f"scene_{scene_num:02d}_{cache_key}.mp4"
114
+ logger.info(f"[Media] Scene {scene_num} — downloading video: {video['id']}.mp4")
115
+ if _download_file(url, dest, desc=f"Scene {scene_num}"):
116
+ return {"path": str(dest), "type": "video"}
117
+
118
+ # If no videos found for this query, try fewer keywords
119
+ search_keywords.pop()
120
+ except Exception as e:
121
+ logger.warning(f"[Media] Pexels video search failed for '{query}': {e}")
122
+ search_keywords.pop()
123
+
124
+ logger.warning(f"[Media] No good portrait video found for scene {scene_num}. Trying image fallback...")
125
+ return _fetch_image(keywords, scene_num, cache_key)
126
+
127
+
128
def _fetch_image(keywords: list[str], scene_num: int, cache_key: str | None = None) -> dict | None:
    """Fallback: fetch a portrait image from Pexels.

    Args:
        keywords: Scene keywords; the first three form the search query.
        scene_num: Scene number, used in the destination filename.
        cache_key: Precomputed cache id; derived from ``keywords`` when omitted.

    Returns:
        {"path": str, "type": "image"} for the first photo that downloads
        successfully, or None when the search fails or nothing is usable.
    """
    if cache_key is None:
        cache_key = _cache_key(keywords)

    query = " ".join(keywords[:3])
    params = {
        "query": query,
        "per_page": 10,
        "orientation": "portrait",
    }

    try:
        resp = requests.get(PEXELS_PHOTOS_URL, headers=HEADERS, params=params, timeout=15)
        resp.raise_for_status()
        photos = resp.json().get("photos", [])
    except Exception as e:
        logger.error(f"[Media] Pexels image fallback failed: {e}")
        return None

    dest = CLIPS_DIR / f"scene_{scene_num:02d}_{cache_key}.jpg"
    for photo in photos:
        # Tolerate records without a "src" mapping instead of raising KeyError
        sources = photo.get("src") or {}
        url = sources.get("large2x") or sources.get("large")
        if not url:
            continue
        logger.info(f"[Media] Scene {scene_num} — downloading image fallback")
        if _download_file(url, dest, desc=f"Scene {scene_num} img"):
            return {"path": str(dest), "type": "image"}

    return None
158
+
159
+
160
+ def fetch_all_media(video_json: dict) -> list[dict]:
161
+ """
162
+ Fetch media (video/image) for every scene in the video JSON.
163
+
164
+ Args:
165
+ video_json: Parsed dict from agent.generate_video_package()
166
+
167
+ Returns:
168
+ List of dicts per scene: [{"scene_number": 1, "path": "...", "type": "video"}, ...]
169
+ """
170
+ if not PEXELS_API_KEY:
171
+ raise EnvironmentError("PEXELS_API_KEY not set in .env")
172
+
173
+ results = []
174
+ for scene in video_json["scenes"]:
175
+ num = scene["scene_number"]
176
+ keywords = scene.get("pexels_keywords", ["technology", "future"])
177
+ logger.info(f"[Media] Fetching scene {num} — keywords: {keywords}")
178
+ media = _fetch_video(keywords, num)
179
+ if media:
180
+ media["scene_number"] = num
181
+ results.append(media)
182
+ else:
183
+ logger.error(f"[Media] Could not fetch any media for scene {num}!")
184
+ results.append({"scene_number": num, "path": None, "type": "none"})
185
+
186
+ ok = sum(1 for r in results if r["path"])
187
+ logger.info(f"[Media] ✅ Fetched {ok}/{len(results)} media assets.")
188
+ return results
189
+
190
+
191
+ # ── CLI Test ──────────────────────────────────────────────
192
+ if __name__ == "__main__":
193
+ import argparse
194
+ parser = argparse.ArgumentParser()
195
+ parser.add_argument("--keywords", nargs="+", default=["artificial intelligence", "technology", "future"])
196
+ parser.add_argument("--test", action="store_true")
197
+ args = parser.parse_args()
198
+
199
+ if args.test or args.keywords:
200
+ logging.basicConfig(level=logging.INFO)
201
+ print(f"🎥 Fetching media for keywords: {args.keywords}")
202
+ result = _fetch_video(args.keywords, scene_num=99)
203
+ if result:
204
+ print(f" ✅ {result['type'].upper()}: {result['path']}")
205
+ else:
206
+ print(" ❌ No media found.")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Autonomous Short-Form Video Engine — Python Requirements
3
+ # ============================================================
4
+
5
+ # AI / LLM (Nemotron-3 via OpenRouter, Nemotron-2 VL via NVIDIA)
6
+ openai>=1.0.0 # Used as OpenRouter-compatible client
7
+
8
+ # Text-to-Speech (Google Cloud TTS free tier)
9
+ google-cloud-texttospeech>=2.14.0
10
+
11
+ # Alternative TTS (edge-tts, no auth needed — uncomment to use)
12
+ # edge-tts>=6.1.9
13
+
14
+ # Media fetching & processing
15
+ requests>=2.31.0
16
+ Pillow>=10.0.0
17
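+ moviepy>=2.0.0 # code uses the MoviePy 2.x API (from moviepy import ..., with_duration, subclipped, image_transform)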
18
+
19
+ # Web UI
20
+ gradio>=4.0.0
21
+
22
+ # Utilities
23
+ python-dotenv>=1.0.0
24
+ mutagen>=1.47.0 # Audio duration detection
25
+ tqdm>=4.66.0 # Progress bars
test_hindi.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from voice_generator import generate_voiceovers
2
+ import json
3
+
4
def test_hindi_voice():
    """Smoke-test: generate a voiceover for one Hindi scene and print each result."""
    hindi_scene = {
        "scene_number": 1,
        "script_text": "नमस्ते, क्या आप तैयार हैं?",
    }
    mock_json = {
        "language": "Hindi",
        "voiceover_settings": {"mood": "energetic", "gender_preference": "female"},
        "scenes": [hindi_scene],
    }
    print("Testing Hindi voiceover generation...")
    for res in generate_voiceovers(mock_json):
        print(f"Scene {res['scene_number']}: {res['path']} ({res['duration']}s)")


if __name__ == "__main__":
    test_hindi_voice()
test_moviepy.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def _check_moviepy_imports():
    """Report whether the MoviePy 2.x top-level names can be imported."""
    try:
        from moviepy import VideoFileClip, AudioFileClip, concatenate_videoclips, CompositeVideoClip
        print("✅ MoviePy 2.x imports successful!")
    except ImportError as err:
        print(f"❌ ImportError: {err}")
    except Exception as err:
        print(f"❌ Exception: {err}")


_check_moviepy_imports()
test_moviepy_safe.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import codecs

# Re-wrap stdout in a UTF-8 writer so the report below prints on consoles
# whose default encoding is not UTF-8.
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())


def _report_moviepy_imports():
    """Print whether the MoviePy 2.x root-package imports work, and the version."""
    print("Testing MoviePy 2.x Imports...")
    try:
        # In MoviePy 2.0+, many classes are moved to the root package
        import moviepy
        from moviepy import (
            VideoFileClip,
            AudioFileClip,
            ImageClip,
            ColorClip,
            VideoClip,
            CompositeVideoClip,
            concatenate_videoclips,
        )
        print("SUCCESS: MoviePy 2.x imports working!")
        print(f"MoviePy Version: {moviepy.__version__}")
    except ImportError as err:
        print(f"IMPORT ERROR: {err}")
    except Exception as err:
        print(f"EXCEPTION: {err}")


_report_moviepy_imports()
test_pexels.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys

import requests
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("PEXELS_API_KEY")


def test_search(query):
    """Run one Pexels portrait-video search for `query` and print the outcome.

    Prints the HTTP status, the number of videos returned and, when there is
    at least one, the first video's URL and dimensions. Any exception from the
    request is printed rather than raised, since this is a manual diagnostic.
    """
    url = "https://api.pexels.com/videos/search"
    headers = {"Authorization": api_key}
    params = {"query": query, "per_page": 1, "orientation": "portrait"}

    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        print(f"Search '{query}': {response.status_code}")
        if response.status_code == 200:
            data = response.json()
            videos = data.get("videos", [])
            print(f" Found {len(videos)} videos.")
            if videos:
                v = videos[0]
                print(f" Example: {v.get('url')} ({v.get('width')}x{v.get('height')})")
        else:
            print(f" Error: {response.text}")
    except Exception as e:
        print(f" Exception: {e}")


def main():
    """Check that the API key is configured, then run two sample searches."""
    if not api_key:
        print("Error: PEXELS_API_KEY NOT FOUND IN .ENV")
        # sys.exit, not the interactive-only `exit` helper injected by `site`.
        sys.exit(1)

    print(f"Testing Pexels API with key: {api_key[:8]}...")
    test_search("technology")
    test_search("serene landscape")


# Guarded so that merely importing this module (for example during test
# collection, since the file is named test_*.py) performs no network I/O and
# cannot terminate the interpreter.
if __name__ == "__main__":
    main()
test_repair.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agent import _robust_json_parse, _repair_json
2
+ import json
3
+
4
def test_repair():
    """Print how agent._repair_json / _robust_json_parse handle truncated JSON."""
    # Case 1: truncated in the middle of a string nested inside an array.
    truncated = '{"niche": "AI Tools", "scenes": [{"scene_number": 1, "text": "Hello world'
    print(f"Original: {truncated}")
    print(f"Repaired: {_repair_json(truncated)}")

    parsed = _robust_json_parse(truncated)
    print(f"Parsed JSON niche: {parsed['niche']}")

    # Test complex truncation
    complex_trunc = '{"seo": {"title": "AI'
    print(f"\nOriginal: {complex_trunc}")
    print(f"Repaired: {_repair_json(complex_trunc)}")
    parsed_complex = _robust_json_parse(complex_trunc)
    print(f"Parsed JSON SEO title: {parsed_complex['seo']['title']}")


if __name__ == "__main__":
    test_repair()
video_assembler.py ADDED
@@ -0,0 +1,806 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ video_assembler.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — Modern Editing Layer
5
+
6
+ Supported editing styles:
7
+ • motion_graphics — Animated titles, kinetic text, zoom punches,
8
+ color grading, lower thirds, particles
9
+ • montage — Fast cuts, beat-sync, speed ramps, whip pans,
10
+ flash transitions, adrenaline pacing
11
+ • documentary — Slow cross-fades, b-roll breathing, Ken Burns,
12
+ subtitles like a real documentary
13
+ • social_media — Zoom punch, caption pop-on, emoji overlays,
14
+ bold captions, hook-first social hooks
15
+
16
+ Each style wraps the same base infrastructure but applies a different
17
+ visual treatment to every scene.
18
+ ─────────────────────────────────────────────────────────────
19
+ """
20
+
21
+ import os
22
+ import math
23
+ import logging
24
+ import random
25
+ from pathlib import Path
26
+ from datetime import datetime
27
+ from typing import Callable
28
+
29
+ import numpy as np
30
+ from PIL import Image, ImageDraw, ImageFont, ImageFilter
31
+ from dotenv import load_dotenv
32
+
33
+ load_dotenv()
34
+ logger = logging.getLogger(__name__)
35
+
36
+ OUTPUT_DIR = Path("output")
37
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
38
+
39
+ TARGET_W = 1080
40
+ TARGET_H = 1920
41
+ TARGET_FPS = 30
42
+ BGM_VOLUME = float(os.getenv("BGM_VOLUME", "0.08"))
43
+
44
+ EDITING_STYLES = ["motion_graphics", "montage", "documentary", "social_media"]
45
+
46
+ # ─────────────────────────────────────────────────────────────
47
+ # FONT UTILITIES
48
+ # ─────────────────────────────────────────────────────────────
49
+
50
# Resolved fonts keyed by (size, bold). The caption renderers call
# _load_font() once per rendered frame, so without this cache every frame
# re-probes the filesystem and re-parses the TTF file.
_FONT_CACHE: dict = {}


def _load_font(size: int = 52, bold: bool = True):
    """Return a PIL font of the given pixel size, preferring known system fonts.

    Args:
        size: Font size passed to ImageFont.truetype.
        bold: When True only candidates flagged as bold are considered;
            when False any existing candidate is accepted.

    Returns:
        The first candidate that exists and loads; otherwise "arial.ttf"
        resolved through PIL's own font search; otherwise PIL's built-in
        default bitmap font. The result is cached per (size, bold).
    """
    cache_key = (size, bold)
    if cache_key in _FONT_CACHE:
        return _FONT_CACHE[cache_key]

    candidates = [
        # Windows Hindi/Devanagari candidates
        ("C:/Windows/Fonts/Nirmala.ttf", True),
        ("C:/Windows/Fonts/Mangal.ttf", False),
        # Windows Latin candidates
        ("C:/Windows/Fonts/arialbd.ttf", True),
        ("C:/Windows/Fonts/calibrib.ttf", True),
        ("C:/Windows/Fonts/verdanab.ttf", True),
        ("C:/Windows/Fonts/arial.ttf", False),
        # Linux candidates
        ("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", True),
        ("/usr/share/fonts/truetype/noto/NotoSans-Bold.ttf", True),
        ("/usr/share/fonts/truetype/noto/NotoSansDevanagari-Bold.ttf", True),
    ]

    font = None
    for fp, is_bold in candidates:
        if Path(fp).exists() and (is_bold == bold or not bold):
            try:
                font = ImageFont.truetype(fp, size)
                break
            except Exception:
                # Best effort: an unreadable/corrupt font just moves on to the next.
                continue

    if font is None:
        try:
            # Fallback to a system-registered font if arial is there
            font = ImageFont.truetype("arial.ttf", size)
        except Exception:
            font = ImageFont.load_default()

    _FONT_CACHE[cache_key] = font
    return font
76
+
77
+
78
+ def _wrap_text(text: str, max_chars: int = 20) -> list[str]:
79
+ words = text.split()
80
+ lines = []
81
+ current = ""
82
+ for word in words:
83
+ if len(current) + len(word) + 1 <= max_chars:
84
+ current = f"{current} {word}".strip()
85
+ else:
86
+ if current:
87
+ lines.append(current)
88
+ current = word
89
+ if current:
90
+ lines.append(current)
91
+ return lines
92
+
93
+
94
+ # ─────────────────────────────────────────────────────────────
95
+ # BASE CLIP PREPARATION
96
+ # ─────────────────────────────────────────────────────────────
97
+
98
def _prepare_base_clip(media_path: str, target_dur: float,
                       w=TARGET_W, h=TARGET_H, fps=TARGET_FPS):
    """Load video/image, resize+crop to 9:16, loop/trim to target_dur.

    Args:
        media_path: Path to a video or image file. Empty, None, or a path
            that is not an existing file yields a solid dark placeholder.
        target_dur: Duration in seconds of the returned clip.
        w, h: Output frame size in pixels.
        fps: Output frame rate.

    Returns:
        A MoviePy clip of size (w, h), duration target_dur and the given fps.
    """
    from moviepy import VideoFileClip, ImageClip, ColorClip, concatenate_videoclips

    # Path("") is "." and therefore *exists*, so an empty media_path must be
    # rejected explicitly; is_file() additionally rejects directories.
    path = Path(media_path) if media_path else None
    if path is None or not path.is_file():
        return ColorClip(size=(w, h), color=(5, 5, 15)).with_duration(target_dur).with_fps(fps)

    if path.suffix.lower() in (".mp4", ".mov", ".avi", ".webm"):
        clip = VideoFileClip(str(path))
        if clip.duration < target_dur:
            # Source is too short: loop it enough times to cover the scene.
            reps = math.ceil(target_dur / clip.duration)
            clip = concatenate_videoclips([clip] * reps)
        clip = clip.subclipped(0, target_dur)
    else:
        clip = ImageClip(str(path)).with_duration(target_dur)

    # Resize + center-crop to 9:16
    cr = clip.w / clip.h
    tr = w / h
    if cr > tr:
        # Source is wider than the target: match height, trim the sides.
        clip = clip.resized(height=h)
        ex = clip.w - w
        clip = clip.cropped(x1=ex // 2, x2=ex // 2 + w)
    else:
        # Source is taller (or equal): match width, trim top and bottom.
        clip = clip.resized(width=w)
        ex = clip.h - h
        clip = clip.cropped(y1=ex // 2, y2=ex // 2 + h)

    return clip.with_fps(fps)
130
+
131
+
132
+ # ─────────────────────────────────────────────────────────────
133
+ # EFFECT PRIMITIVES (each returns a numpy RGBA array or a clip)
134
+ # ─────────────────────────────────────────────────────────────
135
+
136
+ # ── Color grading (LUT-like color correction) ─────────────
137
def _apply_color_grade(clip, style: str = "cinematic"):
    """Apply a per-channel gain color grade to every frame via image_transform.

    `style` is one of "cinematic", "warm" or "cold"; any other value falls
    back to "cinematic". Each channel is multiplied by its gain, truncated
    to int and clamped to 0..255.
    """
    # (R, G, B) multipliers for each look.
    gains_by_style = {
        "cinematic": (1.08, 0.97, 1.12),
        "warm": (1.15, 1.02, 0.88),
        "cold": (0.90, 1.00, 1.18),
    }
    gains = gains_by_style.get(style, gains_by_style["cinematic"])

    def scale_band(band, gain):
        return band.point(lambda v: max(0, min(255, int(v * gain))))

    def grade(frame):
        bands = Image.fromarray(frame).convert("RGB").split()
        graded = tuple(scale_band(band, gain) for band, gain in zip(bands, gains))
        return np.array(Image.merge("RGB", graded))

    return clip.image_transform(grade)
167
+
168
+
169
+ # ── Ken Burns (pan + zoom for static images) ─────────────
170
def _ken_burns(clip, zoom_in: bool = True, direction: str = "left"):
    """Slowly zoom + pan across a clip for a cinematic feel.

    The frame is scaled between 1.00x and 1.08x over the clip's duration
    (growing when `zoom_in`, shrinking otherwise) and cropped back to the
    original size. `direction` "left"/"right" slides the crop window
    horizontally; any other value keeps it centered.
    """
    from moviepy import VideoClip

    total = clip.duration
    out_w, out_h = clip.w, clip.h

    def render(t):
        frac = t / total
        scale = (1.0 + 0.08 * frac) if zoom_in else (1.08 - 0.08 * frac)

        scaled_w = int(out_w * scale)
        scaled_h = int(out_h * scale)
        enlarged = Image.fromarray(clip.get_frame(t)).resize(
            (scaled_w, scaled_h), Image.Resampling.LANCZOS
        )

        # Horizontal slack available for panning.
        slack_x = scaled_w - out_w
        if direction == "left":
            left = int(slack_x * frac)
        elif direction == "right":
            left = int(slack_x * (1 - frac))
        else:
            left = slack_x // 2
        top = (scaled_h - out_h) // 2

        return np.array(enlarged.crop((left, top, left + out_w, top + out_h)))

    return VideoClip(render, duration=total).with_fps(clip.fps)
201
+
202
+
203
+ # ── Zoom Punch (single frame zoom-in burst) ──────────────
204
def _zoom_punch(clip, at_time: float = 0.0, zoom_factor: float = 1.15, hold: float = 0.12):
    """Apply a short zoom burst starting at `at_time` and lasting `hold` seconds.

    Inside the window the frame is scaled along a half sine wave
    (1.0 → zoom_factor → 1.0) and center-cropped back to the clip size;
    outside the window frames pass through untouched.
    """
    from moviepy import VideoClip

    out_w, out_h = clip.w, clip.h
    total = clip.duration

    def render(t):
        frame = clip.get_frame(t)
        if not (at_time <= t <= at_time + hold):
            return frame

        phase = (t - at_time) / hold
        scale = 1.0 + (zoom_factor - 1.0) * math.sin(phase * math.pi)
        scaled_w, scaled_h = int(out_w * scale), int(out_h * scale)
        enlarged = Image.fromarray(frame).resize((scaled_w, scaled_h), Image.Resampling.LANCZOS)
        left = (scaled_w - out_w) // 2
        top = (scaled_h - out_h) // 2
        return np.array(enlarged.crop((left, top, left + out_w, top + out_h)))

    return VideoClip(render, duration=total).with_fps(clip.fps)
224
+
225
+
226
+ # ── Flash Transition (white flash between scenes) ────────
227
def _make_flash_overlay(w: int, h: int, dur: float = 0.18, fps: int = 30) -> "VideoClip":
    """Create a flash that fades out — used at scene start for montage.

    Args:
        w, h: Frame size in pixels.
        dur: Length of the flash in seconds.
        fps: Frame rate of the returned clip.

    Returns:
        A VideoClip of solid gray-level RGB frames whose brightness starts
        at 255 (white) and decays to 0 following 1 - sqrt(t / dur).
    """
    from moviepy import VideoClip

    def make_frame(t):
        # Clamp the progress so a zero-length flash or an out-of-range t
        # cannot divide by zero or take the root of a negative number.
        progress = min(max(t / dur, 0.0), 1.0) if dur > 0 else 1.0
        alpha = max(0.0, 1.0 - progress ** 0.5)
        return np.full((h, w, 3), int(255 * alpha), dtype=np.uint8)

    return VideoClip(make_frame, duration=dur).with_fps(fps)
242
+
243
+
244
+ # ── Speed Ramp (slow → fast for montage) ─────────────────
245
+ def _speed_ramp(clip, ramp_duration: float = 1.5, start_speed: float = 0.4, end_speed: float = 1.6):
246
+ """Gradually change playback speed within the clip."""
247
+ dur = clip.duration
248
+
249
+ def time_map(t):
250
+ if t <= ramp_duration:
251
+ progress = t / ramp_duration
252
+ speed = start_speed + (end_speed - start_speed) * progress
253
+ else:
254
+ speed = end_speed
255
+ return min(t * speed, dur - 0.01)
256
+
257
+ return clip.time_transform(time_map, apply_to=["video", "mask"])
258
+
259
+
260
+ # ── Glitch / Chromatic Aberration (social media hook) ────
261
+ def _chromatic_aberration(clip, intensity: int = 6):
262
+ """RGB channel shift for a glitch effect on the hook scene."""
263
+ def effect(frame):
264
+ img = Image.fromarray(frame).convert("RGB")
265
+ r, g, b = img.split()
266
+ r_shifted = ImageDraw.Draw(Image.new("L", img.size, 0))
267
+ r_arr = np.array(r)
268
+ g_arr = np.array(g)
269
+ b_arr = np.array(b)
270
+ h_, w_ = r_arr.shape
271
+ r_out = np.roll(r_arr, intensity, axis=1)
272
+ b_out = np.roll(b_arr, -intensity, axis=1)
273
+ combined = np.stack([r_out, g_arr, b_out], axis=2).astype(np.uint8)
274
+ return combined
275
+
276
+ return clip.image_transform(effect)
277
+
278
+
279
+ # ── Vignette overlay ─────────────────────────────────────
280
+ def _make_vignette(w: int, h: int, strength: float = 0.6) -> np.ndarray:
281
+ """Generate a vignette RGBA overlay (dark corners)."""
282
+ img = Image.new("RGBA", (w, h), (0, 0, 0, 0))
283
+ draw = ImageDraw.Draw(img)
284
+ cx, cy = w // 2, h // 2
285
+ for r in range(max(w, h), 0, -5):
286
+ alpha = int(strength * 255 * (1 - r / max(w, h)) ** 2.5)
287
+ if alpha <= 0:
288
+ continue
289
+ draw.ellipse(
290
+ [(cx - r, cy - r), (cx + r, cy + r)],
291
+ fill=(0, 0, 0, 0),
292
+ outline=(0, 0, 0, alpha),
293
+ )
294
+ return np.array(img)
295
+
296
+
297
+ # ─────────────────────────────────────────────────────────────
298
+ # TEXT OVERLAY ENGINES (per style)
299
+ # ─────────────────────────────────────────────────────────────
300
+
301
def _caption_motion_graphics(text: str, w: int, h: int, t: float, dur: float) -> np.ndarray:
    """
    MOTION GRAPHICS caption frame at time `t` of a `dur`-second scene.

    - Uppercase text wrapped at 18 characters, centered near the bottom
    - Slides up 120px while fading in over the first 0.4s
    - Fades out over the last 0.3s
    - Semi-transparent rounded pill behind each line, cyan-blue drop shadow

    Returns an RGBA array of shape (h, w, 4).
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)
    font = _load_font(size=64)

    rows = _wrap_text(text.upper(), max_chars=18)
    row_height = 76
    pill_pad = 20
    top = h - len(rows) * row_height - 200

    # Entrance progress (0 → 1 over 0.4s) drives both the slide and the fade-in.
    entrance = min(1.0, t / 0.4)
    slide_offset = int((1 - entrance) * 120)
    exit_fade = max(0.0, min(1.0, (dur - t) / 0.3))
    alpha = int(255 * entrance * exit_fade)

    for row_idx, row in enumerate(rows):
        left, upper, right, lower = pen.textbbox((0, 0), row, font=font)
        text_w = right - left
        text_h = lower - upper
        x = (w - text_w) // 2
        y = top + row_idx * row_height + slide_offset

        # Pill background
        pen.rounded_rectangle(
            [(x - pill_pad, y - 8), (x + text_w + pill_pad, y + text_h + 8)],
            radius=12,
            fill=(0, 0, 0, int(alpha * 0.72)),
        )
        # Offset shadow in cyan-blue, then the white main text on top.
        pen.text((x + 3, y + 3), row, font=font, fill=(20, 180, 255, int(alpha * 0.4)))
        pen.text((x, y), row, font=font, fill=(255, 255, 255, alpha))

    return np.array(canvas)
345
+
346
+
347
def _caption_montage(text: str, w: int, h: int, t: float, dur: float) -> np.ndarray:
    """
    MONTAGE caption frame at time `t`.

    - Large uppercase text wrapped at 15 characters, vertically centered
    - Pops in by ramping opacity over the first 0.15s
    - Yellow fill with a heavy black outline, no background

    `dur` is accepted for signature parity with the other caption engines
    and is not used. Returns an RGBA array of shape (h, w, 4).
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)
    font = _load_font(size=80)

    rows = _wrap_text(text.upper(), max_chars=15)
    row_height = 94
    top = h // 2 - (len(rows) * row_height) // 2  # center screen for montage

    alpha = int(255 * min(1.0, t / 0.15))

    # Outline offsets: odd steps from -5..5 on both axes, corners trimmed
    # so the stroke is roughly round.
    outline_offsets = [
        (dx, dy)
        for dx in range(-5, 6, 2)
        for dy in range(-5, 6, 2)
        if abs(dx) + abs(dy) <= 6
    ]

    for row_idx, row in enumerate(rows):
        box = pen.textbbox((0, 0), row, font=font)
        x = (w - (box[2] - box[0])) // 2
        y = top + row_idx * row_height

        # Outline first, so the fill is drawn over it.
        for dx, dy in outline_offsets:
            pen.text((x + dx, y + dy), row, font=font, fill=(0, 0, 0, alpha))

        # Yellow main text
        pen.text((x, y), row, font=font, fill=(255, 230, 0, alpha))

    return np.array(canvas)
387
+
388
+
389
def _caption_documentary(text: str, w: int, h: int, t: float, dur: float) -> np.ndarray:
    """
    DOCUMENTARY caption frame at time `t` of a `dur`-second scene.

    - Full-width dark lower-third band with an electric-blue accent bar
    - Left-aligned white text wrapped at 26 characters, soft drop shadow
    - Fades in over 0.6s and out over the last 0.5s

    Returns an RGBA array of shape (h, w, 4).
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)
    font = _load_font(size=54)

    rows = _wrap_text(text, max_chars=26)
    row_height = 64

    # Slow fade
    opacity = min(1.0, t / 0.6) * max(0.0, min(1.0, (dur - t) / 0.5))
    alpha = int(255 * opacity)

    band_height = len(rows) * row_height + 40
    band_top = h - band_height - 140
    band_bottom = band_top + band_height

    # Dark band (full width strip)
    pen.rectangle([(0, band_top - 12), (w, band_bottom)], fill=(0, 0, 0, int(alpha * 0.78)))
    # Accent bar (left edge, electric blue)
    pen.rectangle([(0, band_top - 12), (8, band_bottom)], fill=(0, 180, 255, alpha))

    text_x = 36
    for row_idx, row in enumerate(rows):
        text_y = band_top + 16 + row_idx * row_height
        pen.text((text_x + 2, text_y + 2), row, font=font, fill=(0, 0, 0, int(alpha * 0.5)))
        pen.text((text_x, text_y), row, font=font, fill=(255, 255, 255, alpha))

    return np.array(canvas)
429
+
430
+
431
def _caption_social_media(text: str, w: int, h: int, t: float, dur: float) -> np.ndarray:
    """
    SOCIAL MEDIA caption frame at time `t` of a `dur`-second scene.

    - Big bottom-centered caption wrapped at 16 characters
    - Opacity pops in over the first 0.2s
    - Karaoke approximation: the line containing the "current" word
      (scene progress mapped onto the word index) is drawn pink, all
      other lines white
    - Black rounded pill backdrop and 8-direction black outline

    Returns an RGBA array of shape (h, w, 4).
    """
    canvas = Image.new("RGBA", (w, h), (0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)
    font = _load_font(size=72)

    rows = _wrap_text(text, max_chars=16)
    row_height = 88
    pill_pad = 22
    top = h - (len(rows) * row_height) - 160

    # Pop-in burst: 0→0.2s
    alpha = int(255 * min(1.0, t / 0.2))

    # Karaoke: which word the scene progress currently points at.
    word_count = max(1, len(text.split()))
    active_word = int((t / max(dur, 0.01)) * word_count)

    outline_offsets = [(-4, 0), (4, 0), (0, -4), (0, 4), (-3, -3), (3, 3), (-3, 3), (3, -3)]

    words_before = 0  # words on the lines drawn so far
    for row_idx, row in enumerate(rows):
        box = pen.textbbox((0, 0), row, font=font)
        text_w = box[2] - box[0]
        x = (w - text_w) // 2
        y = top + row_idx * row_height

        # Black pill backdrop
        pen.rounded_rectangle(
            [(x - pill_pad, y - 10), (x + text_w + pill_pad, y + 80)],
            radius=16,
            fill=(0, 0, 0, int(alpha * 0.65)),
        )

        # Outline
        for dx, dy in outline_offsets:
            pen.text((x + dx, y + dy), row, font=font, fill=(0, 0, 0, alpha))

        # Highlight the whole line when it contains the active word.
        words_here = len(row.split())
        if words_before <= active_word < words_before + words_here:
            fill = (255, 50, 200, alpha)
        else:
            fill = (255, 255, 255, alpha)
        pen.text((x, y), row, font=font, fill=fill)
        words_before += words_here

    return np.array(canvas)
483
+
484
+
485
+ # ─────────────────────────────────────────────────────────────
486
+ # STYLE-SPECIFIC SCENE PROCESSORS
487
+ # ─────────────────────────────────────────────────────────────
488
+
489
def _build_motion_graphics_scene(base_clip, scene: dict, dur: float, fps: int):
    """
    Build one Motion Graphics scene.

    Layers, bottom to top:
      1. base clip with the cinematic grade and a slow 1.00x → 1.06x zoom
      2. animated slide-up caption from scene["on_screen_text"]
      3. static vignette (strength 0.55)
    """
    from moviepy import VideoClip, CompositeVideoClip, ImageClip

    frame_w, frame_h = base_clip.w, base_clip.h
    graded = _apply_color_grade(base_clip, "cinematic")

    def render_zoom(t):
        # Scale up around the center, then crop back to the frame size.
        scale = 1.0 + 0.06 * (t / dur)
        scaled_w, scaled_h = int(frame_w * scale), int(frame_h * scale)
        enlarged = Image.fromarray(graded.get_frame(t)).resize(
            (scaled_w, scaled_h), Image.Resampling.LANCZOS
        )
        left = (scaled_w - frame_w) // 2
        top = (scaled_h - frame_h) // 2
        return np.array(enlarged.crop((left, top, left + frame_w, top + frame_h)))

    zoom_layer = VideoClip(render_zoom, duration=dur).with_fps(fps)

    caption_text = scene.get("on_screen_text", "")

    def render_caption(t):
        return _caption_motion_graphics(caption_text, frame_w, frame_h, t, dur)

    caption_layer = VideoClip(render_caption, duration=dur).with_fps(fps)

    vignette_layer = (
        ImageClip(_make_vignette(frame_w, frame_h, strength=0.55))
        .with_duration(dur)
        .with_fps(fps)
    )

    return CompositeVideoClip([zoom_layer, caption_layer, vignette_layer])
529
+
530
+
531
def _build_montage_scene(base_clip, scene: dict, dur: float, fps: int, scene_idx: int):
    """
    Build one Montage scene.

    - Warm color grade
    - Zoom punch at t=0 on every other scene (even `scene_idx`)
    - Speed ramp on scenes longer than 4s
    - White flash blended over the first moments of the scene
    - Bold impact caption from scene["on_screen_text"]

    Args:
        base_clip: Prepared 9:16 clip for the scene.
        scene: Scene dict from the video JSON.
        dur: Scene duration in seconds.
        fps: Output frame rate.
        scene_idx: Zero-based position of the scene in the video.

    Returns:
        A CompositeVideoClip of the footage, the flash and the caption.
    """
    from moviepy import VideoClip, ColorClip, CompositeVideoClip

    w, h = base_clip.w, base_clip.h

    # Warm color grade
    graded = _apply_color_grade(base_clip, "warm")

    # Zoom punch at t=0 for every other scene
    if scene_idx % 2 == 0:
        punched = _zoom_punch(graded, at_time=0.0, zoom_factor=1.18, hold=0.10)
    else:
        punched = graded

    # Speed ramp for longer scenes (best effort: keep the un-ramped clip on failure)
    if dur > 4.0:
        try:
            punched = _speed_ramp(punched)
        except Exception as exc:
            logger.warning(f"[Assembler] Speed ramp skipped: {exc}")

    layers = [punched]

    # Flash overlay (first 0.18s of scene, or 15% of very short scenes).
    # The flash must be *blended* over the footage: an RGB layer without a
    # mask is fully opaque and would hide the video for as long as it
    # plays. So the flash brightness is used as the opacity mask of a white
    # clip that only exists for flash_dur seconds.
    flash_dur = min(0.18, dur * 0.15)
    if flash_dur > 0:
        flash = _make_flash_overlay(w, h, dur=flash_dur, fps=fps)

        def flash_mask_frame(t):
            # Mask frames are 2-D floats in [0, 1].
            level = flash.get_frame(min(t, flash_dur))[:, :, 0]
            return level.astype(float) / 255.0

        flash_mask = VideoClip(flash_mask_frame, is_mask=True, duration=flash_dur).with_fps(fps)
        flash_layer = (
            ColorClip(size=(w, h), color=(255, 255, 255))
            .with_duration(flash_dur)
            .with_fps(fps)
            .with_mask(flash_mask)
        )
        layers.append(flash_layer)

    # Bold caption
    caption_text = scene.get("on_screen_text", "")

    def caption_frame(t):
        return _caption_montage(caption_text, w, h, t, dur)

    layers.append(VideoClip(caption_frame, duration=dur).with_fps(fps))

    return CompositeVideoClip(layers)
581
+
582
+
583
def _build_documentary_scene(base_clip, scene: dict, dur: float, fps: int):
    """
    Build one Documentary scene.

    Layers, bottom to top:
      1. base clip, with Ken Burns applied when it has no `reader`
         attribute (treated as a still image) or the scene exceeds 8s,
         then the cold grade
      2. lower-third caption from scene["on_screen_text"]
      3. static vignette (strength 0.4)

    The pan direction is picked at random per call.
    """
    from moviepy import VideoClip, CompositeVideoClip, ImageClip

    frame_w, frame_h = base_clip.w, base_clip.h

    # Clips without a `reader` attribute are treated as static images.
    looks_static = not hasattr(base_clip, "reader")
    if looks_static or dur > 8:
        pan = random.choice(["left", "right", "center"])
        # NOTE(review): zoom-in is chosen only when dur is an exact even
        # number of seconds — confirm this is the intended alternation.
        footage = _ken_burns(base_clip, zoom_in=(dur % 2 == 0), direction=pan)
    else:
        footage = base_clip

    graded = _apply_color_grade(footage, "cold")

    caption_text = scene.get("on_screen_text", "")

    def render_caption(t):
        return _caption_documentary(caption_text, frame_w, frame_h, t, dur)

    caption_layer = VideoClip(render_caption, duration=dur).with_fps(fps)

    vignette_layer = (
        ImageClip(_make_vignette(frame_w, frame_h, strength=0.4))
        .with_duration(dur)
        .with_fps(fps)
    )

    return CompositeVideoClip([graded, caption_layer, vignette_layer])
619
+
620
+
621
def _build_social_media_scene(base_clip, scene: dict, dur: float, fps: int, is_hook: bool):
    """
    Build one Social Media (TikTok/Reels style) scene.

    - Warm grade on every scene
    - Chromatic-aberration glitch added on the hook scene
    - Zoom punch at t=0 on every scene
    - Karaoke-style pop caption from scene["on_screen_text"]
    """
    from moviepy import VideoClip, CompositeVideoClip

    frame_w, frame_h = base_clip.w, base_clip.h

    graded = _apply_color_grade(base_clip, "warm")
    if is_hook:
        # Glitch on hook
        graded = _chromatic_aberration(graded, intensity=7)

    # Social media always punches in at the very start of the scene.
    footage = _zoom_punch(graded, at_time=0.0, zoom_factor=1.12, hold=0.08)

    caption_text = scene.get("on_screen_text", "")

    def render_caption(t):
        return _caption_social_media(caption_text, frame_w, frame_h, t, dur)

    caption_layer = VideoClip(render_caption, duration=dur).with_fps(fps)

    return CompositeVideoClip([footage, caption_layer])
652
+
653
+
654
+ # ─────────────────────────────────────────────────────────────
655
+ # TRANSITION HELPERS
656
+ # ─────────────────────────────────────────────────────────────
657
+
658
def _crossfade_concat(clips: list, crossfade_dur: float = 0.5):
    """Concatenate clips with a crossfade dissolve between them.

    Each clip after the first starts `crossfade_dur` seconds before the
    previous one ends and fades in over that overlap, so the total
    duration is the sum of the clip durations minus the overlaps. The
    overlap is capped at half of either neighbouring clip so very short
    scenes still work. Audio tracks overlap for the same interval.

    Args:
        clips: MoviePy clips in playback order.
        crossfade_dur: Requested dissolve length in seconds; <= 0 disables it.

    Returns:
        A single clip. Falls back to a plain hard-cut concatenation when
        there is nothing to dissolve or the composite cannot be built.
    """
    from moviepy import CompositeVideoClip, concatenate_videoclips, vfx

    if len(clips) <= 1 or crossfade_dur <= 0:
        return concatenate_videoclips(clips, method="compose")

    try:
        # Build ONE composite with explicit start times instead of nesting
        # a new composite per transition.
        layers = [clips[0].with_start(0)]
        timeline_end = clips[0].duration
        previous = clips[0]
        for clip in clips[1:]:
            overlap = min(crossfade_dur, previous.duration / 2, clip.duration / 2)
            start = timeline_end - overlap
            # CrossFadeIn fades the clip's mask in, revealing it over the
            # tail of the clip underneath.
            faded = clip.with_effects([vfx.CrossFadeIn(overlap)]) if overlap > 0 else clip
            layers.append(faded.with_start(start))
            timeline_end = start + clip.duration
            previous = clip
        return CompositeVideoClip(layers).with_duration(timeline_end)
    except Exception as exc:
        logger.warning(f"[Assembler] Crossfade unavailable ({exc}), using hard cuts")
        return concatenate_videoclips(clips, method="compose")
682
+
683
+
684
+ # ─────────────────────────────────────────────────────────────
685
+ # MAIN PUBLIC API
686
+ # ─────────────────────────────────────────────────────────────
687
+
688
def assemble_video(
    video_json: dict,
    audio_results: list,
    media_results: list,
    editing_style: str = "social_media",
    output_filename: str | None = None,
) -> str:
    """
    Assemble the final 9:16 MP4 with a chosen modern editing style.

    Args:
        video_json: Parsed JSON from agent (needs a "scenes" list)
        audio_results: From voice_generator.generate_voiceovers()
        media_results: From asset_checker.check_all_assets()
        editing_style: One of: motion_graphics | montage | documentary | social_media.
            Unknown values fall back to social_media.
        output_filename: Optional custom filename (used as given)

    Returns:
        Absolute path string to the final MP4

    Raises:
        RuntimeError: If no scene clip could be built.
    """
    import re
    from moviepy import AudioFileClip, concatenate_videoclips

    style = editing_style if editing_style in EDITING_STYLES else "social_media"
    logger.info(f"[Assembler] 🎬 Style: {style.upper().replace('_',' ')}")

    audio_map = {r["scene_number"]: r for r in audio_results}
    media_map = {r["scene_number"]: r for r in media_results}
    scenes = video_json["scenes"]

    styled_clips = []

    for idx, scene in enumerate(scenes):
        num = scene["scene_number"]

        audio_info = audio_map.get(num)
        media_info = media_map.get(num, {})

        # Use real TTS duration if available
        target_dur = float(scene.get("duration_seconds", 5))
        if audio_info and audio_info.get("duration", 0) > 0:
            target_dur = audio_info["duration"]

        logger.info(f"[Assembler] Scene {num} ({style}) — {target_dur:.1f}s")

        # Media rejected by the asset checker is replaced by the placeholder clip.
        media_path = media_info.get("path") if media_info.get("approved", True) else None
        base = _prepare_base_clip(media_path or "", target_dur)

        # Apply style engine
        try:
            if style == "motion_graphics":
                styled = _build_motion_graphics_scene(base, scene, target_dur, TARGET_FPS)
            elif style == "montage":
                styled = _build_montage_scene(base, scene, target_dur, TARGET_FPS, idx)
            elif style == "documentary":
                styled = _build_documentary_scene(base, scene, target_dur, TARGET_FPS)
            elif style == "social_media":
                is_hook = (scene.get("type") == "hook" or idx == 0)
                styled = _build_social_media_scene(base, scene, target_dur, TARGET_FPS, is_hook)
            else:
                styled = base
        except Exception as e:
            logger.warning(f"[Assembler] Style engine failed on scene {num}: {e} — using base")
            styled = base

        # Attach voiceover audio
        if audio_info and audio_info.get("path") and Path(audio_info["path"]).exists():
            try:
                raw_audio = AudioFileClip(audio_info["path"])
                # Safe duration: if TTS is shorter than scene, don't try to extend it (causes crash)
                audio_dur = min(target_dur, raw_audio.duration)
                audio = raw_audio.subclipped(0, audio_dur).with_duration(audio_dur)
                styled = styled.with_audio(audio)
            except Exception as e:
                logger.warning(f"[Assembler] Audio attach failed scene {num}: {e}")

        styled_clips.append(styled)

    if not styled_clips:
        raise RuntimeError("No clips built — check media and audio outputs.")

    # ── Concatenation strategy per style ─────────────────
    logger.info("[Assembler] Concatenating scenes...")
    try:
        if style == "documentary":
            final = _crossfade_concat(styled_clips, crossfade_dur=0.5)
        elif style == "montage":
            # Hard cuts — no transition
            final = concatenate_videoclips(styled_clips, method="compose")
        elif style == "motion_graphics":
            final = _crossfade_concat(styled_clips, crossfade_dur=0.35)
        else:  # social_media
            final = concatenate_videoclips(styled_clips, method="compose")
    except Exception as e:
        logger.warning(f"[Assembler] Crossfade failed ({e}), falling back to hard cuts")
        final = concatenate_videoclips(styled_clips, method="compose")

    # ── Output ────────────────────────────────────────────
    if not output_filename:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        # The niche is free text from the LLM: collapse anything that is not
        # a word character (spaces, slashes, colons, ...) so it cannot inject
        # path separators or characters that are illegal in filenames.
        raw_niche = str(video_json.get("niche") or "video").lower()
        niche_slug = re.sub(r"[^\w-]+", "_", raw_niche).strip("_") or "video"
        output_filename = f"{niche_slug}_{style}_{ts}.mp4"

    out_path = OUTPUT_DIR / output_filename
    logger.info(f"[Assembler] Rendering → {out_path}")

    # Per-output temp audio file, next to the output, so two renders running
    # at the same time do not overwrite each other's intermediate audio.
    temp_audio = OUTPUT_DIR / f"{Path(output_filename).stem}_temp_audio.m4a"

    final.write_videofile(
        str(out_path),
        fps=TARGET_FPS,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile=str(temp_audio),
        remove_temp=True,
        logger=None,
        preset="medium",
    )

    logger.info(f"[Assembler] ✅ Done → {out_path}")
    return str(out_path.resolve())
voice_generator.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ voice_generator.py
3
+ ─────────────────────────────────────────────────────────────
4
+ Autonomous Short-Form Video Engine — Voiceover Layer
5
+ Uses Google Cloud Text-to-Speech (free tier: 1M chars/month)
6
+ to generate per-scene MP3 audio from the agent's JSON output.
7
+
8
+ Fallback: set USE_EDGE_TTS=true in .env to use edge-tts
9
+ (no Google credentials needed).
10
+ ─────────────────────────────────────────────────────────────
11
+ """
12
+
13
+ import os
14
+ import asyncio
15
+ import logging
16
+ from pathlib import Path
17
+ from dotenv import load_dotenv
18
+ from mutagen.mp3 import MP3
19
+
20
# Load variables from a local .env file (if present) before os.environ is read below.
load_dotenv()
logger = logging.getLogger(__name__)

# ── Hugging Face / Cloud Secrets ──────────────────────────
# If GOOGLE_APPLICATION_CREDENTIALS_JSON is provided as a secret (the JSON
# *contents* of the key), write it to a temp file and point
# GOOGLE_APPLICATION_CREDENTIALS (a file *path*) at that file.
_creds_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
if _creds_json:
    import json
    import tempfile

    try:
        # Validate first: a truncated or mis-pasted secret should be reported
        # here rather than surfacing later as an opaque client-side failure.
        json.loads(_creds_json)
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".json", encoding="utf-8"
        ) as tf:
            tf.write(_creds_json)
        # Export the path only once the file has been closed (fully flushed).
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = tf.name
        logger.info(f"✅ Google Cloud credentials written to {tf.name}")
    except Exception as e:
        # Best-effort at import time: never let a bad secret prevent the module
        # from loading (the edge-tts backend does not need these credentials).
        logger.error(f"❌ Failed to process Google Cloud credentials JSON: {e}")
38
+
39
# Per-scene MP3 files are written here; the directory is created on import.
AUDIO_DIR = Path("assets") / "audio"
AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Only the literal string "true" (any letter case) switches the default
# backend to edge-tts; any other value, or an unset variable, leaves it off.
USE_EDGE_TTS = os.environ.get("USE_EDGE_TTS", "false").lower() == "true"
43
+
44
# ── Voice mapping by mood ─────────────────────────────────
# Google Cloud TTS Neural2 voices (en-US).
# Maps (mood, gender) -> (voice name, speaking rate); one table row per entry.
GOOGLE_VOICE_MAP = {
    (mood, gender): (voice_name, speaking_rate)
    for mood, gender, voice_name, speaking_rate in (
        ("calm", "male", "en-US-Neural2-D", 0.95),
        ("calm", "female", "en-US-Neural2-F", 0.92),
        ("energetic", "male", "en-US-Neural2-J", 1.05),
        ("energetic", "female", "en-US-Neural2-G", 1.08),
        ("inspirational", "male", "en-US-Neural2-A", 1.00),
        ("inspirational", "female", "en-US-Neural2-C", 0.98),
        ("monotone", "male", "en-US-Neural2-I", 0.90),
        ("monotone", "female", "en-US-Neural2-E", 0.90),
        # Fallback entry used when a (mood, gender) pair has no row of its own.
        ("neutral", "neutral", "en-US-Neural2-D", 1.00),
    )
}
57
+
58
# edge-tts voice map (fallback)
# Shape: language -> (mood, gender) -> (voice name, {"rate": ..., "pitch": ...}).
# Each row below is (mood, gender, voice, rate, pitch).
EDGE_VOICE_MAP = {
    language: {
        (mood, gender): (voice_name, {"rate": rate, "pitch": pitch})
        for mood, gender, voice_name, rate, pitch in rows
    }
    for language, rows in (
        (
            "English",
            (
                ("calm", "male", "en-US-GuyNeural", "-5%", "-2Hz"),
                ("calm", "female", "en-US-JennyNeural", "-5%", "+0Hz"),
                ("energetic", "male", "en-US-GuyNeural", "+10%", "+2Hz"),
                ("energetic", "female", "en-US-AriaNeural", "+10%", "+4Hz"),
                ("inspirational", "male", "en-US-GuyNeural", "+0%", "+0Hz"),
                ("inspirational", "female", "en-US-JennyNeural", "+0%", "+2Hz"),
                ("monotone", "male", "en-US-GuyNeural", "-10%", "-5Hz"),
                ("monotone", "female", "en-US-JennyNeural", "-10%", "-3Hz"),
                ("neutral", "neutral", "en-US-GuyNeural", "+0%", "+0Hz"),
            ),
        ),
        (
            "Hindi",
            (
                ("calm", "male", "hi-IN-MadhurNeural", "-5%", "+0Hz"),
                ("calm", "female", "hi-IN-SwaraNeural", "-5%", "+0Hz"),
                ("energetic", "male", "hi-IN-MadhurNeural", "+5%", "+0Hz"),
                ("energetic", "female", "hi-IN-SwaraNeural", "+5%", "+0Hz"),
                ("neutral", "neutral", "hi-IN-MadhurNeural", "+0%", "+0Hz"),
            ),
        ),
    )
}
# Hinglish shares the Hindi table (same dict object, not a copy).
EDGE_VOICE_MAP["Hinglish"] = EDGE_VOICE_MAP["Hindi"]
80
+
81
# ── Premium Humanized Voices ─────────────────────────────
# Hand-picked for high quality and natural sound.
# Maps voice id -> (backend voice name, speed multiplier, display label).
# The id prefix ("google_" / "edge_") names the backend the voice belongs to.
PREMIUM_VOICES = {
    voice_key: (voice_name, speed, label)
    for voice_key, voice_name, speed, label in (
        # Google Cloud TTS
        ("google_male_1", "en-US-Neural2-D", 1.0, "Google - Professional (Male)"),
        ("google_male_2", "en-US-Neural2-J", 1.0, "Google - Energetic (Male)"),
        ("google_female_1", "en-US-Neural2-F", 1.0, "Google - Soft (Female)"),
        ("google_female_2", "en-US-Neural2-C", 1.0, "Google - Authoritative (Female)"),
        # edge-tts (no auth)
        ("edge_male_1", "en-US-GuyNeural", 1.0, "Edge - Mature (Male)"),
        ("edge_male_2", "en-US-ChristopherNeural", 1.0, "Edge - Friendly (Male)"),
        ("edge_female_1", "en-US-JennyNeural", 1.0, "Edge - Conversational (Female)"),
        ("edge_female_2", "en-US-AvaNeural", 1.0, "Edge - Bright (Female)"),
        ("edge_female_3", "en-GB-SoniaNeural", 1.0, "Edge - British (Female)"),
        # Hindi / Hinglish (Google)
        ("google_hindi_1", "hi-IN-Neural2-A", 1.0, "Google - Hindi (Male)"),
        ("google_hindi_2", "hi-IN-Neural2-C", 1.0, "Google - Hindi (Female)"),
        # Hindi / Hinglish (Edge)
        ("edge_hindi_1", "hi-IN-MadhurNeural", 1.0, "Edge - Hindi (Male)"),
        ("edge_hindi_2", "hi-IN-SwaraNeural", 1.0, "Edge - Hindi (Female)"),
    )
}
105
+
106
+
107
+ def _get_audio_duration(path: Path) -> float:
108
+ """Return duration of an MP3 file in seconds."""
109
+ try:
110
+ audio = MP3(str(path))
111
+ return audio.info.length
112
+ except Exception:
113
+ return 0.0
114
+
115
+
116
+ # ── Google TTS backend ────────────────────────────────────
117
def _synthesize_google(text: str, scene_num: int, mood: str, gender: str, voice_id: str = None) -> Path:
    """Generate speech for one scene via Google Cloud TTS and save it as MP3.

    Args:
        text: Narration text to synthesize.
        scene_num: Scene number; used to build the output file name.
        mood: Mood half of the ``GOOGLE_VOICE_MAP`` lookup key.
        gender: Gender half of the ``GOOGLE_VOICE_MAP`` lookup key.
        voice_id: Optional ``PREMIUM_VOICES`` key. When it matches, it
            overrides the mood/gender lookup.

    Returns:
        Path of the written file, ``AUDIO_DIR / "scene_NN.mp3"``.
    """
    from google.cloud import texttospeech

    if voice_id and voice_id in PREMIUM_VOICES:
        voice_name, speed, _ = PREMIUM_VOICES[voice_id]
    else:
        # Unknown (mood, gender) pairs fall back to the neutral entry, and to a
        # literal default should that entry ever be removed from the map.
        voice_name, speed = GOOGLE_VOICE_MAP.get(
            (mood, gender),
            GOOGLE_VOICE_MAP.get(("neutral", "neutral"), ("en-US-Neural2-D", 1.0)),
        )

    # Voice names begin with their locale ("hi-IN-Neural2-A" -> "hi-IN").
    # Deriving the language code from the name keeps the two in agreement;
    # a fixed "en-US" does not match the Hindi premium voices.
    language_code = "-".join(voice_name.split("-")[:2])

    tts_client = texttospeech.TextToSpeechClient()
    input_text = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speed,
    )

    response = tts_client.synthesize_speech(
        input=input_text, voice=voice, audio_config=audio_config
    )

    out_path = AUDIO_DIR / f"scene_{scene_num:02d}.mp3"
    out_path.write_bytes(response.audio_content)
    logger.info(f"[Voice/Google] Scene {scene_num} → {out_path.name}")
    return out_path
147
+
148
+
149
+ # ── edge-tts backend ─────────────────────────────────────
150
async def _synthesize_edge_async(text: str, scene_num: int, mood: str, gender: str, voice_id: str = None, language: str = "English") -> Path:
    """Generate speech for one scene via edge-tts (no auth required).

    Args:
        text: Narration text to synthesize.
        scene_num: Scene number; used to build the output file name.
        mood: Mood half of the ``EDGE_VOICE_MAP`` lookup key.
        gender: Gender half of the ``EDGE_VOICE_MAP`` lookup key.
        voice_id: Optional ``PREMIUM_VOICES`` key. When it matches, that voice
            is used with only a rate adjustment (no pitch setting).
        language: Key into ``EDGE_VOICE_MAP``; unknown languages use the
            "English" table.

    Returns:
        Path of the written file, ``AUDIO_DIR / "scene_NN.mp3"``.
    """
    import edge_tts

    out_path = AUDIO_DIR / f"scene_{scene_num:02d}.mp3"

    if voice_id and voice_id in PREMIUM_VOICES:
        voice_name, speed, _ = PREMIUM_VOICES[voice_id]
        # Convert the speed multiplier to a signed percentage. round() rather
        # than int(): (0.92 - 1) * 100 is -7.99999..., which int() would
        # truncate to -7 instead of the intended -8.
        rate_pct = round((speed - 1) * 100)
        communicate = edge_tts.Communicate(text, voice_name, rate=f"{rate_pct:+d}%")
    else:
        key = (mood, gender)
        fallback_key = ("neutral", "neutral")

        # Select map based on language
        lang_map = EDGE_VOICE_MAP.get(language, EDGE_VOICE_MAP["English"])
        voice, settings = lang_map.get(key, lang_map.get(fallback_key))

        communicate = edge_tts.Communicate(
            text,
            voice,
            rate=settings.get("rate", "+0%"),
            pitch=settings.get("pitch", "+0Hz"),
        )

    await communicate.save(str(out_path))
    logger.info(f"[Voice/edge-tts] Scene {scene_num} ({language}) → {out_path.name}")
    return out_path
177
+
178
+
179
def _synthesize_edge(text: str, scene_num: int, mood: str, gender: str, voice_id: str = None, language: str = "English") -> Path:
    """Blocking wrapper around ``_synthesize_edge_async``.

    Runs the coroutine to completion with ``asyncio.run`` and returns the
    output path it produces. All arguments are forwarded unchanged.
    """
    job = _synthesize_edge_async(
        text,
        scene_num,
        mood,
        gender,
        voice_id=voice_id,
        language=language,
    )
    return asyncio.run(job)
181
+
182
+
183
+ # ── Public API ────────────────────────────────────────────
184
def generate_voiceovers(video_json: dict, voice_id: str = None) -> list[dict]:
    """
    Generate voiceover MP3s for all scenes in the video JSON.

    Args:
        video_json: Parsed dict from agent.generate_video_package(). Must
                    contain "scenes", each with "scene_number" and
                    "script_text". "voiceover_settings" and "language" are
                    optional.
        voice_id:   Explicit voice selection from PREMIUM_VOICES. An "edge_"
                    or "google_" prefix also forces the matching backend,
                    overriding the USE_EDGE_TTS toggle.

    Returns:
        List of dicts: [{"scene_number": 1, "path": "assets/audio/scene_01.mp3",
                         "duration": 5.2}, ...]

    Raises:
        KeyError: if "scenes" is missing, or a scene lacks "scene_number"
                  or "script_text".
    """
    # Parsed JSON can carry explicit nulls, and dict.get(key, default) returns
    # None (not the default) when the key is present with a null value, so
    # fall back with `or` instead.
    settings = video_json.get("voiceover_settings") or {}
    mood = settings.get("mood") or "calm"
    gender = settings.get("gender_preference") or "male"
    language = video_json.get("language") or "English"

    # Decide back-end based on voice_id or env toggle
    use_edge = USE_EDGE_TTS
    if voice_id:
        if voice_id.startswith("edge_"):
            use_edge = True
        elif voice_id.startswith("google_"):
            use_edge = False

    results = []
    for scene in video_json["scenes"]:
        num = scene["scene_number"]
        text = scene["script_text"]

        if use_edge:
            path = _synthesize_edge(text, num, mood, gender, voice_id=voice_id, language=language)
        else:
            # The Google back-end is not given `language`; its mood/gender
            # voice map only holds en-US voices.
            path = _synthesize_google(text, num, mood, gender, voice_id=voice_id)

        duration = _get_audio_duration(path)
        results.append({
            "scene_number": num,
            "path": str(path),
            "duration": round(duration, 2),
        })

    logger.info(f"[Voice] ✅ Generated {len(results)} audio files.")
    return results
229
+
230
+
231
+ # ── CLI Test ──────────────────────────────────────────────
232
if __name__ == "__main__":
    import argparse

    # Manual smoke test: `--test` synthesizes two short sample scenes using
    # whichever backend the environment selects, then prints what was written.
    cli = argparse.ArgumentParser()
    cli.add_argument("--test", action="store_true", help="Run with sample text")

    if cli.parse_args().test:
        demo_lines = (
            "Did you know that AI tools can save you 3 hours every single day?",
            "Here are the top 3 AI productivity tools you need right now.",
        )
        sample = {
            "scenes": [
                {"scene_number": idx, "script_text": line}
                for idx, line in enumerate(demo_lines, start=1)
            ],
            "voiceover_settings": {
                "mood": "energetic",
                "gender_preference": "male",
                "pace": "normal",
            },
        }
        print("🎙 Running voiceover test...")
        for item in generate_voiceovers(sample):
            print(f" Scene {item['scene_number']}: {item['path']} ({item['duration']}s)")