From 98e829c9953050486936de271d6253a8bc6912bb Mon Sep 17 00:00:00 2001 From: sylyx Date: Sat, 16 May 2026 10:29:32 +0200 Subject: [PATCH] Initial implementation of reddit-video-bot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Full pipeline: Reddit sourcing → Groq text optimization → Edge-TTS voice generation → Whisper transcription → FFmpeg video rendering with word-level subtitles. Includes SQLite deduplication and .env-based config. Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 4 ++ .gitignore | 11 +++++ PLAN.md | 81 +++++++++++++++++++++++++++++++ config.py | 23 +++++++++ main.py | 44 +++++++++++++++++ requirements.txt | 6 +++ src/__init__.py | 0 src/processor.py | 32 +++++++++++++ src/reddit_client.py | 61 +++++++++++++++++++++++ src/subtitler.py | 31 ++++++++++++ src/video_engine.py | 112 +++++++++++++++++++++++++++++++++++++++++++ src/voice_gen.py | 14 ++++++ 12 files changed, 419 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 PLAN.md create mode 100644 config.py create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/processor.py create mode 100644 src/reddit_client.py create mode 100644 src/subtitler.py create mode 100644 src/video_engine.py create mode 100644 src/voice_gen.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..e33c50c --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +REDDIT_CLIENT_ID=your_reddit_client_id +REDDIT_CLIENT_SECRET=your_reddit_client_secret +REDDIT_USER_AGENT=RedditVideoBot/0.1 by /u/YourUsername +GROQ_API_KEY=your_groq_api_key diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..361b468 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +.env +__pycache__/ +*.pyc +*.pyo +output/ +processed.db +assets/background_videos/ +assets/fonts/ +*.mp3 +*.mp4 +*.srt diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 
0000000..3f61dc7 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,81 @@ +# Projekt-Plan: Automatisierter Reddit-Story Video Creator + +Dieses Projekt automatisiert die Erstellung von Social-Media-Videos (TikTok, YouTube Shorts, Reels) basierend auf populären Reddit-Beiträgen. Das Ziel ist ein "Low-Cost, High-Efficiency" Workflow. + +## 1. Architektur & Workflow + +1. **Sourcing**: Ein Skript sucht über die Reddit API (`praw`) nach viralen Posts in Subreddits wie `r/AITAH`, `r/relationship_advice` oder `r/confessions`. Bereits verarbeitete Posts werden über eine lokale SQLite-Datenbank übersprungen. +2. **Processing (Groq)**: Der Text wird an Groq (Llama 3) gesendet. Die KI bereinigt den Text, entfernt unnötige Kürzel, verbessert den Spannungsbogen und sorgt für einen starken "Hook" in den ersten 3 Sekunden. +3. **Voice (Edge-TTS)**: Der optimierte Text wird mit der Microsoft Edge TTS Engine in eine hochwertige, menschlich klingende MP3-Datei umgewandelt (kostenlos). +4. **Transkription (Whisper)**: Das Audio wird mit OpenAI Whisper lokal transkribiert, um wortgenaue Zeitstempel für die Untertitel zu erhalten. Standard-Modell: `base` (schnell) — für höhere Qualität auf `medium` wechseln. +5. **Visuals & Montage (FFmpeg)**: + * Hintergrundvideo (z.B. Minecraft Parkour) wird geladen und bei Bedarf geloopt, sodass es die volle Audio-Länge abdeckt. + * Audio wird darübergelegt. + * Untertitel werden per `ffmpeg drawtext`-Filter wort-genau und animiert eingeblendet (schneller und stabiler als MoviePy TextClip). +6. **Export**: Das fertige Video wird als `.mp4` im Hochformat (9:16) exportiert. + +## 2. 
Tech-Stack + +- **Sprache**: Python 3.10+ +- **LLM**: Groq API (Llama 3 70B) +- **Voice**: `edge-tts` (Python Bibliothek) +- **Transkription**: `openai-whisper` (lokal, Modell: `base` oder `medium`) +- **Video-Editing**: `ffmpeg` (direkt via `subprocess`), `moviepy` nur für einfache Clips +- **Reddit API**: `praw` +- **Secrets**: `python-dotenv` + `.env` Datei +- **Deduplication**: `sqlite3` (Standardbibliothek, kein Extra-Install) + +## 3. Verzeichnisstruktur + +```text +reddit-video-bot/ +├── main.py # Hauptsteuerung des Bots +├── .env # API-Keys (wird nicht committed) +├── .env.example # Vorlage mit Platzhaltern +├── src/ +│ ├── reddit_client.py # Holt Posts von Reddit, prüft Duplikate per SQLite +│ ├── processor.py # Groq-Integration & Text-Optimierung +│ ├── voice_gen.py # Edge-TTS Integration +│ ├── subtitler.py # Whisper-Transkription & Wort-Zeitstempel +│ └── video_engine.py # Montage mit FFmpeg (loop, audio, drawtext) +├── assets/ +│ ├── background_videos/ # Speicherort für Hintergrund-Loops +│ └── fonts/ # Schriftarten für Untertitel +├── output/ # Hier landen die fertigen Videos +├── processed.db # SQLite: bereits verarbeitete Post-IDs +└── requirements.txt # Abhängigkeiten +``` + +## 4. Implementierungsschritte + +### Phase 1: Setup & Sourcing +- `requirements.txt` und `.env.example` erstellen, Bibliotheken installieren. +- `reddit_client.py` implementieren: Authentifizierung, Abruf der Top-Posts der letzten 24h, SQLite-Check auf Duplikate. + +### Phase 2: Logik & Stimme +- `processor.py` erstellen: Groq API einbinden und Prompt-Engineering für virale Storys. +- `voice_gen.py` erstellen: Funktion zum Speichern von Text als MP3 via `edge-tts`. + +### Phase 3: Transkription & Untertitel +- `subtitler.py` erstellen: Whisper Modell laden (`base` als Default), Audio zu Wort-Zeitstempeln konvertieren. +- Zeitstempel als strukturierte Liste ausgeben (für FFmpeg `drawtext`). 
+ +### Phase 4: Video-Engine *(höchstes Risiko — früh prototypen)* +- `video_engine.py` erstellen: + - Zufälligen Clip aus `assets/background_videos/` wählen. + - Hintergrundvideo per FFmpeg loopen bis Audio-Länge erreicht ist. + - Audio einbetten. + - Untertitel wort-genau per `drawtext`-Filter rendern. +- Ziel-Format: 9:16, 1080x1920. + +### Phase 5: Automatisierung +- `main.py` schreiben: Alle Module verknüpfen, Post-ID nach erfolgreichem Export in SQLite speichern. + +## 5. Kosten-Optimierung +- **Groq**: Kostenlos (Free Tier). +- **Edge-TTS**: Kostenlos. +- **Whisper**: Kostenlos (läuft lokal). +- **Reddit API**: Kostenlos (für persönliche Nutzung). +- **Visuals**: Einmalig kostenlose Gameplay-Videos von YouTube/Pexels laden. + +**Gesamtkosten pro Video: ~0,00 €** diff --git a/config.py b/config.py new file mode 100644 index 0000000..da63b5a --- /dev/null +++ b/config.py @@ -0,0 +1,23 @@ +import os +from dotenv import load_dotenv + +load_dotenv() + +# Reddit API Credentials +REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID") +REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET") +REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT", "RedditVideoBot/0.1 by /u/YourUsername") + +# Groq API Key +GROQ_API_KEY = os.getenv("GROQ_API_KEY") + +# Path Settings +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ASSETS_DIR = os.path.join(BASE_DIR, "assets") +OUTPUT_DIR = os.path.join(BASE_DIR, "output") +BACKGROUND_VIDEOS_DIR = os.path.join(ASSETS_DIR, "background_videos") + +# Video Settings +VIDEO_WIDTH = 1080 +VIDEO_HEIGHT = 1920 +FONT_PATH = os.path.join(ASSETS_DIR, "fonts", "BoldFont.ttf") diff --git a/main.py b/main.py new file mode 100644 index 0000000..42cb7e1 --- /dev/null +++ b/main.py @@ -0,0 +1,44 @@ +import os +import sys +from src.reddit_client import get_next_post, mark_processed +from src.processor import optimize_text +from src.voice_gen import generate_audio +from src.subtitler import transcribe +from src.video_engine import render_video + 
def run():
    """Run the pipeline once: fetch a post, script it, voice it, subtitle it, render it.

    Exits the process with status 0 when ``get_next_post`` returns nothing.
    The post is marked as processed only after rendering succeeded, so a run
    that fails midway leaves the post eligible for the next invocation.

    Raises:
        Whatever the individual pipeline stages raise; the temporary audio
        file is removed before the exception propagates.
    """
    print("[main] Fetching next Reddit post...")
    post = get_next_post()
    if not post:
        print("[main] No new viral posts found. Try again later.")
        sys.exit(0)

    print(f"[main] Found: r/{post['subreddit']} — {post['title'][:60]} (score: {post['score']})")

    try:
        print("[main] Optimizing text with Groq...")
        script = optimize_text(post["title"], post["text"])
        print(f"[main] Script preview: {script[:120]}...")

        print("[main] Generating voice...")
        generate_audio(script, TEMP_AUDIO)

        print("[main] Transcribing with Whisper...")
        words = transcribe(TEMP_AUDIO, model_name=WHISPER_MODEL)
        print(f"[main] Got {len(words)} word timestamps.")

        print("[main] Rendering video...")
        output_path = render_video(TEMP_AUDIO, words, post["id"])
    finally:
        # The narration is only an intermediate artifact: remove it even when
        # a later stage failed, so aborted runs do not leave stale audio behind.
        if os.path.exists(TEMP_AUDIO):
            os.remove(TEMP_AUDIO)

    # Record the post only after the video exists on disk.
    mark_processed(post["id"])
    print(f"[main] Done! Video saved to: {output_path}")
def mark_processed(post_id, db_path=None):
    """Record *post_id* as processed so later runs skip it.

    The ``processed`` table is created on demand, so this works even when the
    database file has never been initialised. Inserting an id that is already
    present is a no-op.

    Args:
        post_id: Reddit post id to store.
        db_path: SQLite database file to write to. Defaults to the
            module-level ``DB_PATH``.
    """
    con = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        # ``with con`` commits on success and rolls back if a statement fails.
        with con:
            con.execute("CREATE TABLE IF NOT EXISTS processed (post_id TEXT PRIMARY KEY)")
            con.execute("INSERT OR IGNORE INTO processed VALUES (?)", (post_id,))
    finally:
        # The context manager above does not close the connection; do it here
        # so the handle is released on the error path as well.
        con.close()
# Single-slot cache: the most recently loaded Whisper model and its name.
_model = None
_model_name = None


def _get_model(model_name: str = "base"):
    """Return the Whisper model called *model_name*, loading it when needed.

    The most recently loaded model is kept in a module-level cache. Asking for
    a different name replaces the cached model instead of silently returning
    the previously loaded one.

    Args:
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The object returned by ``whisper.load_model(model_name)``.
    """
    global _model, _model_name
    if _model is None or _model_name != model_name:
        print(f"[subtitler] Loading Whisper model '{model_name}'...")
        _model = whisper.load_model(model_name)
        _model_name = model_name
    return _model
def _probe_duration(media_path: str, codec_type: str) -> float:
    """Return the duration in seconds of the first *codec_type* stream in *media_path*.

    Runs ``ffprobe`` and reads the stream's own ``duration``; when the stream
    reports none, the container-level duration is used instead.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        ValueError: no matching stream exists or no duration is reported.
    """
    result = subprocess.run(
        [
            "ffprobe", "-v", "quiet", "-print_format", "json",
            "-show_format", "-show_streams", media_path,
        ],
        capture_output=True, text=True, check=True,
    )
    info = json.loads(result.stdout)
    for stream in info.get("streams", []):
        if stream.get("codec_type") != codec_type:
            continue
        # Not every container stores a per-stream duration, so fall back to
        # the value reported for the file as a whole.
        duration = stream.get("duration") or info.get("format", {}).get("duration")
        if duration is None:
            raise ValueError(f"Could not determine duration of {media_path}")
        return float(duration)
    raise ValueError(f"No {codec_type} stream found in {media_path}")


def _get_audio_duration(audio_path: str) -> float:
    """Return the duration in seconds of the first audio stream in *audio_path*."""
    return _probe_duration(audio_path, "audio")


def _get_video_duration(video_path: str) -> float:
    """Return the duration in seconds of the first video stream in *video_path*."""
    return _probe_duration(video_path, "video")


def _pick_background() -> str:
    """Return the path of a randomly chosen clip from ``BACKGROUND_VIDEOS_DIR``.

    Raises:
        FileNotFoundError: the directory is missing or holds no
            ``.mp4``/``.mov``/``.mkv`` file.
    """
    try:
        names = os.listdir(BACKGROUND_VIDEOS_DIR)
    except FileNotFoundError:
        # Treat a missing directory like an empty one so the caller gets the
        # actionable message below instead of a bare OS error.
        names = []

    videos = [f for f in names if f.lower().endswith((".mp4", ".mov", ".mkv"))]
    if not videos:
        raise FileNotFoundError(
            f"No background videos found in {BACKGROUND_VIDEOS_DIR}. "
            "Add at least one .mp4 file to assets/background_videos/"
        )
    return os.path.join(BACKGROUND_VIDEOS_DIR, random.choice(videos))


def _escape_filter_value(value: str) -> str:
    """Escape *value* for use as an unquoted option value in an ffmpeg filtergraph.

    Two escaping levels are applied, in order: first for the filter's option
    parser, then for the filtergraph parser.
    """
    # Level 1: backslash, quote and the ':' option separator.
    value = value.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:")
    # Level 2: backslash, quote and the filtergraph delimiters.
    return "".join("\\" + ch if ch in "\\'[],;" else ch for ch in value)


def _build_drawtext_filter(words: list[dict]) -> str:
    """Build an ffmpeg ``drawtext`` chain that shows one word at a time.

    Args:
        words: Dicts with ``"word"``, ``"start"`` and ``"end"`` keys; start
            and end are times in seconds.

    Returns:
        Comma-joined ``drawtext`` filters, or ``""`` when there is nothing to
        draw. Words that are empty after stripping are skipped.
    """
    if os.path.exists(FONT_PATH):
        font_path = FONT_PATH
    else:
        font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
    font_arg = f"fontfile={_escape_filter_value(font_path)}"

    parts = []
    for w in words:
        text = w["word"].strip()
        if not text:
            continue
        start = float(w["start"])
        end = float(w["end"])
        parts.append(
            # expansion=none keeps '%' and '\' in the spoken text literal.
            f"drawtext={font_arg}:text={_escape_filter_value(text)}:expansion=none:"
            f"fontcolor=white:fontsize=80:borderw=4:bordercolor=black:"
            f"x=(w-text_w)/2:y=(h-text_h)/2:"
            f"enable='between(t,{start:.3f},{end:.3f})'"
        )

    return ",".join(parts)


def render_video(audio_path: str, words: list[dict], post_id: str) -> str:
    """Compose the final video: looped background + audio + word subtitles.

    Args:
        audio_path: Narration audio file.
        words: Word timestamps as accepted by ``_build_drawtext_filter``.
        post_id: Used as the output file name (``<post_id>.mp4``).

    Returns:
        The path to the rendered ``.mp4`` inside ``OUTPUT_DIR``.

    Raises:
        FileNotFoundError: no background clip is available.
        ValueError: a duration could not be determined or is not positive.
        subprocess.CalledProcessError: ffprobe or ffmpeg failed.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    output_path = os.path.join(OUTPUT_DIR, f"{post_id}.mp4")

    bg_path = _pick_background()
    audio_duration = _get_audio_duration(audio_path)
    video_duration = _get_video_duration(bg_path)
    if video_duration <= 0:
        raise ValueError(f"Background video has no usable duration: {bg_path}")

    # Generous loop count so the background always outlasts the audio; the
    # surplus is cut off by -t below.
    loop_count = int(audio_duration / video_duration) + 2

    # Scale/crop to the target frame, then overlay subtitles if there are any.
    filters = [
        f"scale={VIDEO_WIDTH}:{VIDEO_HEIGHT}:force_original_aspect_ratio=increase",
        f"crop={VIDEO_WIDTH}:{VIDEO_HEIGHT}",
    ]
    drawtext_filter = _build_drawtext_filter(words)
    if drawtext_filter:
        # An empty chain would leave a trailing comma, which ffmpeg rejects.
        filters.append(drawtext_filter)
    # NOTE(review): the whole chain is passed as one argument; very long
    # scripts may approach OS argument-size limits — confirm.
    vf = ",".join(filters)

    cmd = [
        "ffmpeg", "-y",
        "-stream_loop", str(loop_count),
        "-i", bg_path,
        "-i", audio_path,
        "-vf", vf,
        "-t", str(audio_duration),
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-c:v", "libx264",
        "-preset", "fast",
        "-crf", "23",
        "-c:a", "aac",
        "-b:a", "192k",
        "-shortest",
        output_path,
    ]

    print(f"[video_engine] Rendering video → {output_path}")
    subprocess.run(cmd, check=True)
    return output_path