Initial Commit

This commit is contained in:
2023-11-10 21:09:31 -05:00
commit c71fef9429
17 changed files with 990 additions and 0 deletions

42
vreader/__init__.py Normal file
View File

@@ -0,0 +1,42 @@
import click
import signal
import sys
from importlib.metadata import version
from vreader.oai import OpenAIConnector
from vreader.video import VideoManager
from flask import Flask
from flask.cli import FlaskGroup
# Package version, looked up from the installed "vreader" distribution metadata.
__version__ = version("vreader")
def signal_handler(sig, frame):
    """Terminate the process with exit status 0.

    Has the ``(signum, frame)`` signature expected of a ``signal.signal``
    handler; neither argument is inspected.
    """
    raise SystemExit(0)
def create_app():
    """Build the Flask application used by the ``server`` command group.

    Side effects: rebinds the module globals ``oai`` (an ``OpenAIConnector``
    built from ``Config.OPENAI_API_KEY``) and ``vman`` (a ``VideoManager``).

    Returns
    -------
    Flask
        The application with the ``common`` and ``v1`` blueprints registered.
    """
    global oai, vman

    # Imported inside the function rather than at module import time.
    # NOTE(review): presumably to avoid a circular import, since
    # vreader.api.v1 itself does `import vreader` -- confirm before hoisting.
    from vreader.config import Config
    import vreader.api.common as api_common
    import vreader.api.v1 as api_v1

    flask_app = Flask(__name__)

    oai = OpenAIConnector(Config.OPENAI_API_KEY)
    vman = VideoManager()

    for api_module in (api_common, api_v1):
        flask_app.register_blueprint(api_module.bp)

    return flask_app
# Root click command group; sub-groups attach themselves via `@cli.group(...)`.
# The docstring below is the help text click displays, so it is runtime text.
@click.group()
def cli():
    """VReader CLI"""
# "server" sub-group of `cli`, implemented by Flask's FlaskGroup and given
# `create_app` as its application factory.
@cli.group(cls=FlaskGroup, create_app=create_app)
def server():
    """VReader flask server"""
# On SIGINT, run signal_handler (which exits the process with status 0).
signal.signal(signal.SIGINT, signal_handler)

64
vreader/api/common.py Normal file
View File

@@ -0,0 +1,64 @@
from flask import Blueprint
from flask import make_response, render_template
from html_sanitizer import Sanitizer
from markdown import markdown
from vreader.config import Config
import os
# Blueprint for the HTML page routes defined in this module.
bp = Blueprint("common", __name__)
# Shared sanitizer applied to Markdown-generated HTML before it is rendered.
sanitizer = Sanitizer()
@bp.route("/", methods=["GET"])
def main_entry():
    """Render ``index.html`` with one entry per ``.md`` file in the data directory.

    Each entry is the dict produced by ``parse_filename`` for that file name.
    """
    data_dir = str(Config.DATA_PATH)
    articles = [
        parse_filename(entry)
        for entry in os.listdir(data_dir)
        if entry.endswith(".md")
    ]
    page = render_template("index.html", articles=articles)
    return make_response(page)
@bp.route("/articles/<id>", methods=["GET"])
def article_item(id):
    """Render the stored article whose file name starts with ``id``.

    Parameters
    ----------
    id : str
        Video identifier taken from the URL; must be exactly 11 characters.

    Returns
    -------
    The rendered ``article.html`` response, or the ``404.html`` page with
    status 404 when the id has the wrong length, no matching article exists,
    or the article file cannot be read or decoded as UTF-8.
    """

    def not_found():
        return make_response(render_template("404.html")), 404

    if len(id) != 11:
        return not_found()

    metadata = get_article_metadata(id)
    if not metadata:
        return not_found()

    # Only the file read is guarded. Template, Markdown or sanitizer failures
    # are server bugs and must surface as errors rather than look like 404s.
    try:
        with open(metadata["filepath"], "r", encoding="utf-8") as file:
            article_contents = file.read()
    except (OSError, UnicodeDecodeError):
        return not_found()

    markdown_html = sanitizer.sanitize(markdown(article_contents))
    return make_response(
        render_template("article.html", metadata=metadata, markdown_html=markdown_html)
    )
def get_article_metadata(id):
    """Find the stored article whose file name starts with ``id``.

    Scans the data directory for the first ``.md`` file with that prefix and
    returns its ``parse_filename`` dict extended with a ``"filepath"`` key,
    or ``None`` when no file matches.
    """
    data_dir = str(Config.DATA_PATH)
    for entry in os.listdir(data_dir):
        if not (entry.startswith(id) and entry.endswith(".md")):
            continue
        metadata = parse_filename(entry)
        metadata["filepath"] = os.path.join(data_dir, entry)
        return metadata
    return None
def parse_filename(filename):
    """Split an article file name into its video id and title.

    The first 11 characters are the video id; the title is everything from
    index 12 onward with the last three characters (the ``.md`` suffix of the
    files this is called on) removed.
    """
    return {"video_id": filename[:11], "title": filename[12:-3]}

78
vreader/api/v1.py Normal file
View File

@@ -0,0 +1,78 @@
import os
from os import path
from flask import Blueprint, request
from vreader.config import Config
import vreader
# Blueprint for the JSON API; every route below is served under /api/v1.
bp = Blueprint("v1", __name__, url_prefix="/api/v1")
@bp.route("/articles", methods=["GET"])
def articles():
    """Return the ``parse_filename`` dict of every ``.md`` file in the data directory."""
    directory = str(Config.DATA_PATH)
    return [
        parse_filename(name)
        for name in os.listdir(directory)
        if name.endswith(".md")
    ]
def _safe_title(title):
    """Return ``title`` reduced to text that is safe as part of a file name."""
    import re

    # Path separators and control characters would redirect or break the write.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "-", str(title))
    cleaned = " ".join(cleaned.split()).strip(". ")
    # Keep the full file name comfortably below common 255-byte limits.
    return cleaned[:150].rstrip(". ") or "Untitled"


@bp.route("/generate", methods=["POST"])
def generate():
    """Generate and store an article for the video named in the JSON body.

    Expects a JSON object with a ``"video"`` key holding an 11-character
    video id. If an article for that id already exists, nothing is generated.

    Returns
    -------
    dict
        ``{"video": id}`` when the article already exists, ``{"title": ...}``
        after a new article was written, or ``{"error": message}`` when the
        request is invalid, no transcript could be extracted, or the model
        returned no content.
    """
    import re

    data = request.get_json()
    if not data or not isinstance(data, dict):
        return {"error": "Missing Data"}

    # Check for absence before str(): str(None) is "None", which would be
    # misreported as an invalid id instead of missing data.
    raw_video = data.get("video")
    if raw_video is None or str(raw_video) == "":
        return {"error": "Missing Data"}

    video = str(raw_video)
    # The id ends up in a file name, so restrict it to an 11-character token
    # of letters, digits, "-" and "_" (no "/" or ".").
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", video) is None:
        return {"error": "Invalid VideoID"}

    if get_article_metadata(video) is not None:
        return {"video": video}

    context = vreader.vman.transcribe_video(video)
    if context is None:
        return {"error": "Unable to Extract Subtitles"}

    resp = vreader.oai.query(context)

    # Get Details
    directory = str(Config.DATA_PATH)
    title = resp.get("title") or "Untitled"
    content = resp.get("content")
    if not content:
        # Writing nothing would leave an empty article that blocks regeneration.
        return {"error": "Unable to Generate Article"}

    # Derive Filename
    file_path = path.join(directory, f"{video}_{_safe_title(title)}.md")

    # Write File
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    return {"title": title}
def get_article_metadata(id):
    """Return metadata for the first stored ``.md`` file whose name starts with ``id``.

    The result is the ``parse_filename`` dict plus a ``"filepath"`` entry;
    ``None`` is returned when no file in the data directory matches.
    """
    directory = str(Config.DATA_PATH)
    match = next(
        (
            name
            for name in os.listdir(directory)
            if name.startswith(id) and name.endswith(".md")
        ),
        None,
    )
    if match is None:
        return None

    metadata = parse_filename(match)
    metadata["filepath"] = os.path.join(directory, match)
    return metadata
def parse_filename(filename):
    """Derive ``video_id`` and ``title`` from an article file name.

    ``video_id`` is the first 11 characters. ``title`` starts at index 12 and
    omits the final three characters (``.md`` for the files passed in here).
    """
    video_id, title = filename[:11], filename[12:-3]
    return dict(video_id=video_id, title=title)

24
vreader/config.py Normal file
View File

@@ -0,0 +1,24 @@
import os
def get_env(key, default=None, required=False) -> str | None:
"""Wrapper for gathering env vars."""
if required:
assert key in os.environ, "Missing Environment Variable: %s" % key
env = os.environ.get(key, default)
return str(env) if env is not None else None
class Config:
    """Wrap application configurations

    Values are read from the environment once, when this class is defined.

    Attributes
    ----------
    DATA_PATH : str
        The path where to store any resources (default: ./)
    OPENAI_API_KEY : str
        OpenAI API Key - Required
    """

    # Fall back to "./" as documented. Without a default this was None, and
    # callers doing str(Config.DATA_PATH) ended up with the directory "None".
    DATA_PATH: str | None = get_env("DATA_PATH", default="./", required=False)
    OPENAI_API_KEY: str | None = get_env("OPENAI_API_KEY", required=True)

67
vreader/oai.py Normal file
View File

@@ -0,0 +1,67 @@
from dataclasses import dataclass
from textwrap import indent
from typing import Any, List
import json
import openai
# Prompt sent to the model by OpenAIConnector.query; "{context}" is replaced
# with the video transcription via str.format.
INITIAL_PROMPT_TEMPLATE = """
The following is a video transcription. Write a fully comprehensive article in markdown appropriately utilizing subsections. Be sure to only use the following transcription to write the article:
{context}
"""
# Earlier wording of the prompt; not referenced by the code in this module.
INITIAL_PROMPT_TEMPLATE_OLD = """
The following is a video transcription. Write a comprehensive article in markdown utilizing the following content:
{context}
"""
@dataclass
class ChatCompletion:
    """Typing aid describing the response of ``openai.ChatCompletion.create``.

    Used only as an annotation in ``OpenAIConnector.query``, which reads
    ``choices[0]["message"]["content"]`` and ``usage``.
    NOTE(review): the remaining fields presumably mirror the API response
    schema -- confirm against the OpenAI documentation.
    """

    id: str
    object: str
    created: int
    model: str
    choices: List[dict]
    usage: dict
class OpenAIConnector:
def __init__(self, api_key: str | None):
if api_key is None:
raise RuntimeError("OPENAI_API_KEY Required")
# self.model = "gpt-3.5-turbo-16k"
self.model = "gpt-3.5-turbo-1106"
self.word_cap = 1000
openai.api_key = api_key
def query(self, context: str) -> Any:
# Create Initial Prompt
prompt = INITIAL_PROMPT_TEMPLATE.format(context = context)
messages = [{"role": "user", "content": prompt}]
print("[OpenAIConnector] Running OAI Query")
# Article Call
response: ChatCompletion = openai.ChatCompletion.create( # type: ignore
model=self.model,
messages=messages
)
# Markdown Data
content = response.choices[0]["message"]["content"]
title = self.get_title(content)
print("[OpenAIConnector] Completed OAI Query:\n", indent(json.dumps({ "usage": response.usage }, indent=2), ' ' * 2))
# Return Response
return { "title": title, "content": content }
def get_title(self, markdown: str):
lines = markdown.split('\n')
for line in lines:
if line.startswith("# "):
return line.strip("# ").strip()
return None

View File

@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=0.9, user-scalable=no, viewport-fit=cover"
/>
<title>VReader - Article</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-slate-200 h-[100dvh] p-5 flex flex-col justify-between">
  {# The visible 404 handlers render this template without `markdown_html`,
     which left the page blank. Show a message in that case and keep the
     original output whenever the variable is provided. #}
  {% if markdown_html %}
  {{ markdown_html|safe }}
  {% else %}
  <main class="m-auto text-center">
    <h1 class="text-2xl font-bold">Article not found</h1>
    <a class="underline" href="/">All Articles</a>
  </main>
  {% endif %}
</body>
</html>

View File

@@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=0.9, user-scalable=no, viewport-fit=cover"
/>
<title>VReader - {{ metadata.title }}</title>
<script src="https://cdn.tailwindcss.com"></script>
<style>
  /* Typography for the rendered article inside #content. Written as plain
     descendant selectors: nested rules that start with a bare type selector
     are dropped by browsers without relaxed CSS nesting support. */
  #content h1 {
    font-size: 1.75em;
    font-weight: 400;
  }
  #content h2 {
    font-size: 1.25em;
  }
  #content p {
    margin-top: 0.25em;
    margin-bottom: 1.5em;
  }
</style>
</head>
<body class="bg-slate-200">
<header class="w-screen h-16 bg-slate-300 mb-5">
<div
class="flex px-2 h-16 w-11/12 md:w-5/6 mx-auto rounded bg-slate-300"
>
<a class="font-bold flex justify-center items-center" href="/">All Articles</a>
</div>
</header>
<div
id="content"
class="w-11/12 md:w-5/6 mx-auto rounded px-10 py-5 bg-slate-300"
>
<div class="flex justify-center pb-5 w-full">
<a target="_blank" href="https://www.youtube.com/watch?v={{ metadata.video_id }}">
<img class="h-32 rounded" alt="{{ metadata.title }}" src="https://i.ytimg.com/vi_webp/{{ metadata.video_id }}/maxresdefault.webp" />
</a>
</div>
<hr class="border-gray-500 pb-5" />
{{ markdown_html|safe }}
</div>
</body>
</html>

View File

@@ -0,0 +1,152 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=0.9, user-scalable=no, viewport-fit=cover"
/>
<title>VReader - Home</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-slate-200">
<header class="w-screen h-16 bg-slate-300 mb-5">
<div
class="flex px-2 h-16 w-11/12 md:w-5/6 mx-auto rounded bg-slate-300"
>
<span class="font-bold flex justify-center items-center">VReader</span>
</div>
</header>
<main class="flex flex-col gap-4">
<div id="submit"
class="flex gap-4 items-center text-lg w-11/12 md:w-4/6 mx-auto rounded px-6 py-3 bg-slate-300"
>
<input type="text" placeholder="YouTube URL" class="w-full p-2 bg-gray-700 text-white">
<button class="p-2 bg-gray-500 text-gray-800 hover:bg-gray-100" type="submit">Generate</button>
</div>
{% for article in articles %}
<a
href="/articles/{{ article.video_id }}"
class="flex items-center text-lg w-11/12 md:w-4/6 mx-auto rounded px-6 py-3 bg-slate-300 hover:bg-slate-400 transition-all duration-200"
>
<img class="h-14 md:h-24 mr-6 rounded" alt="" src="https://i.ytimg.com/vi_webp/{{ article.video_id }}/maxresdefault.webp" />
<span>{{ article.title }}</span>
</a>
{% endfor %}
</main>
<script>
// Markup for an animated three-dot indicator; generateAction() swaps it into
// the #submit container while the generate request is running.
const LOADING_SVG = `<svg
class="w-full"
width="24"
height="24"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
>
<style>
.spinner_qM83 {
animation: spinner_8HQG 1.05s infinite;
}
.spinner_oXPr {
animation-delay: 0.1s;
}
.spinner_ZTLf {
animation-delay: 0.2s;
}
@keyframes spinner_8HQG {
0%,
57.14% {
animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
transform: translate(0);
}
28.57% {
animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
transform: translateY(-6px);
}
100% {
transform: translate(0);
}
}
</style>
<circle class="spinner_qM83" cx="4" cy="12" r="3"></circle>
<circle class="spinner_qM83 spinner_oXPr" cx="12" cy="12" r="3"></circle>
<circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3"></circle>
</svg>`;
/**
 * Wrapper API Call
 *
 * Sends a JSON request and resolves with the parsed JSON response body.
 *
 * @param {{url: string, method?: string, data?: object}} data - Target URL,
 *   HTTP method (defaults to "GET") and, for POST, the payload to serialize.
 * @returns {Promise<any>} Parsed response body. Rejects with a readable Error
 *   when the body is not JSON (for example an HTML error page).
 **/
function apiCall(data) {
  const fetchObj = {
    method: data.method || "GET",
    headers: {
      "Content-Type": "application/json",
    },
  };

  if (fetchObj.method == "POST")
    fetchObj.body = JSON.stringify(data.data || {});

  return fetch(data.url, fetchObj).then((resp) =>
    // A non-JSON body would otherwise reject with a cryptic SyntaxError that
    // callers show directly to the user.
    resp.json().catch(() => {
      throw new Error("Unexpected server response (HTTP " + resp.status + ")");
    })
  );
}
/**
 * Request article generation for a video.
 *
 * @param {string} videoID - Video id sent as the "video" field.
 * @returns {Promise<any>} The parsed response of POST /api/v1/generate.
 */
function getVideoArticle(videoID) {
  const request = {
    url: "/api/v1/generate",
    method: "POST",
    data: { video: videoID },
  };
  return apiCall(request);
}
/**
 * Handle a submit: validate the URL, show the loading indicator, request the
 * article and navigate to it. On failure the form is restored and the error
 * message is shown in an alert.
 */
function generateAction() {
  const inputEl = document.querySelector("input");
  const videoID = getYouTubeVideoId(inputEl.value);
  if (!videoID) return alert("Invalid URL");

  // Loading
  const submitEl = document.querySelector("#submit");
  // Keep the live nodes rather than an innerHTML string. Re-parsing the HTML
  // on failure would create new elements with no event listeners and without
  // the typed URL, leaving the form unusable after the first error.
  const formNodes = Array.from(submitEl.childNodes);
  submitEl.innerHTML = LOADING_SVG;

  // Do API Call
  getVideoArticle(videoID)
    .then((resp) => {
      if ("error" in resp) throw new Error(resp.error);
      window.location.href = "/articles/" + videoID;
    })
    .catch((e) => {
      console.log(e);
      alert(e.message);
      submitEl.replaceChildren(...formNodes);
    });
}
/**
 * Attach the submit handlers: a click on the button, or Enter pressed in the
 * input, both run generateAction().
 */
function initListeners() {
  const buttonEl = document.querySelector("button");
  const inputEl = document.querySelector("input");

  buttonEl.addEventListener("click", generateAction);
  inputEl.addEventListener("keydown", function (event) {
    // `event.key` replaces the deprecated numeric `keyCode` (13 was Enter).
    if (event.key !== "Enter") return;
    generateAction();
  });
}
/**
 * Extract the video id from user input.
 *
 * Accepts a bare 11-character id as well as youtu.be, /v/, /u/x/, /embed/,
 * /shorts/, watch?v= and &v= style links.
 *
 * @param {string} url - Text entered by the user.
 * @returns {string|null} The id, or null when none can be found.
 */
function getYouTubeVideoId(url) {
  const input = String(url || "").trim();

  // A pasted id needs no URL parsing.
  if (/^[A-Za-z0-9_-]{11}$/.test(input)) return input;

  // The dot in "youtu.be" is escaped so it no longer matches any character.
  const regExp =
    /^.*(?:youtu\.be\/|v\/|u\/\w\/|embed\/|shorts\/|watch\?v=|&v=)([^#&?]*).*/;
  const match = input.match(regExp);
  return match && match[1] ? match[1] : null;
}
// Attach the handlers now; this script sits after the form markup in <body>.
initListeners();
</script>
</body>
</html>

42
vreader/video.py Normal file
View File

@@ -0,0 +1,42 @@
import os
from yt_dlp import YoutubeDL
import xml.etree.ElementTree as ET
class VideoManager():
    """Transcribe Videos"""

    def transcribe_video(self, video_id: str):
        """Download the English subtitles of ``video_id`` and return them as text.

        Parameters
        ----------
        video_id : str
            Identifier (or URL) handed to ``YoutubeDL.download``.

        Returns
        -------
        str | None
            The subtitle text, or ``None`` when no English TTML subtitle file
            was produced or it could not be parsed.
        """
        import tempfile

        # A private scratch directory per call: a fixed "transcript" file in
        # the working directory would be shared by concurrent requests, and
        # the directory is removed even when parsing fails.
        with tempfile.TemporaryDirectory() as work_dir:
            # "%" is the output-template placeholder marker, so escape it.
            out_template = os.path.join(work_dir.replace("%", "%%"), "transcript")
            vid = YoutubeDL({
                "skip_download": True,
                "writesubtitles": True,
                "writeautomaticsub": True,
                "subtitleslangs": ["en"],
                "subtitlesformat": "ttml",
                "outtmpl": out_template
            })
            vid.download([video_id])

            ttml_path = os.path.join(work_dir, "transcript.en.ttml")
            if not os.path.isfile(ttml_path):
                # No English subtitles were written for this video.
                return None
            return self.convert_ttml_to_plain_text(ttml_path)

    def convert_ttml_to_plain_text(self, ttml_file_path):
        """Return all text of a TTML file as one space-separated string.

        Collects the text of every element, including text that follows
        inline children such as ``<br/>``, and collapses surrounding
        whitespace. Returns ``None`` when the file is not well-formed XML.
        """
        try:
            # Parse the TTML file
            root = ET.parse(ttml_file_path).getroot()
        except ET.ParseError as e:
            print("[VideoManager] TTML Conversion Error:", e)
            return None

        # itertext() yields both element text and tail text in document
        # order; reading only `elem.text` dropped everything after a <br/>.
        fragments = (fragment.strip() for fragment in root.itertext())
        return " ".join(fragment for fragment in fragments if fragment)