From 3d61d0f5efa65649827b3bd62b1d65311de34592 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Sun, 1 Dec 2024 12:48:25 -0500 Subject: [PATCH] perf(db): incremental document stats cache --- database/db.go | 2 +- database/document_user_statistics.sql | 228 ++++++++++++++++++++ database/manager.go | 15 +- database/models.go | 3 +- database/query.sql.go | 2 +- database/schema.sql | 43 ++-- database/user_streaks.sql | 117 ++++++++++ database/views.sql | 296 -------------------------- 8 files changed, 377 insertions(+), 329 deletions(-) create mode 100644 database/document_user_statistics.sql create mode 100644 database/user_streaks.sql delete mode 100644 database/views.sql diff --git a/database/db.go b/database/db.go index a3cc795..dacb52e 100644 --- a/database/db.go +++ b/database/db.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.26.0 +// sqlc v1.27.0 package database diff --git a/database/document_user_statistics.sql b/database/document_user_statistics.sql new file mode 100644 index 0000000..332842a --- /dev/null +++ b/database/document_user_statistics.sql @@ -0,0 +1,228 @@ +WITH new_activity AS ( + SELECT + document_id, + user_id + FROM activity + WHERE + created_at > COALESCE( + (SELECT MAX(last_seen) FROM document_user_statistics), + '1970-01-01T00:00:00Z' + ) + GROUP BY user_id, document_id +), + +intermediate_ga AS ( + SELECT + ga.id AS row_id, + ga.user_id, + ga.document_id, + ga.duration, + ga.start_time, + ga.start_percentage, + ga.end_percentage, + ga.created_at, + + -- Find Overlapping Events (Assign Unique ID) + ( + SELECT MIN(id) + FROM activity AS overlap + WHERE + ga.document_id = overlap.document_id + AND ga.user_id = overlap.user_id + AND ga.start_percentage <= overlap.end_percentage + AND ga.end_percentage >= overlap.start_percentage + ) AS group_leader + FROM activity AS ga + INNER JOIN new_activity AS na + WHERE na.user_id = ga.user_id AND na.document_id = ga.document_id +), + +grouped_activity AS ( + SELECT + user_id, + document_id, + MAX(created_at) AS created_at, + MAX(start_time) AS start_time, + MIN(start_percentage) AS start_percentage, + MAX(end_percentage) AS end_percentage, + MAX(end_percentage) - MIN(start_percentage) AS read_percentage, + SUM(duration) AS duration + FROM intermediate_ga + GROUP BY group_leader +), + +current_progress AS ( + SELECT + user_id, + document_id, + COALESCE(( + SELECT percentage + FROM document_progress AS dp + WHERE + dp.user_id = iga.user_id + AND dp.document_id = iga.document_id + ORDER BY created_at DESC + LIMIT 1 + ), end_percentage) AS percentage + FROM intermediate_ga AS iga + GROUP BY user_id, document_id + HAVING MAX(start_time) +) + +INSERT INTO document_user_statistics +SELECT + ga.document_id, + ga.user_id, + cp.percentage, + MAX(ga.start_time) AS last_read, + MAX(ga.created_at) AS last_seen, + SUM(ga.read_percentage) AS read_percentage, + + -- All Time WPM + SUM(ga.duration) AS total_time_seconds, + (CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage)) + AS total_words_read, + (CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage)) + / (SUM(ga.duration) / 60.0) AS total_wpm, + + -- Yearly WPM + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 year') + THEN ga.duration + ELSE 0 + END + ) + AS yearly_time_seconds, + ( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 year') + THEN read_percentage + ELSE 0 + END + ) + ) + AS yearly_words_read, + COALESCE(( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 year') + THEN read_percentage + END + ) + ) + / ( + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 year') + THEN ga.duration + END + ) + / 60.0 + ), 0.0) + AS yearly_wpm, + + -- Monthly WPM + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 month') + THEN ga.duration + ELSE 0 + END + ) + AS monthly_time_seconds, + ( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 month') + THEN read_percentage + ELSE 0 + END + ) + ) + AS monthly_words_read, + COALESCE(( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 month') + THEN read_percentage + END + ) + ) + / ( + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-1 month') + THEN ga.duration + END + ) + / 60.0 + ), 0.0) + AS monthly_wpm, + + -- Weekly WPM + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-7 days') + THEN ga.duration + ELSE 0 + END + ) + AS weekly_time_seconds, + ( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-7 days') + THEN read_percentage + ELSE 0 + END + ) + ) + AS weekly_words_read, + COALESCE(( + CAST(COALESCE(d.words, 0.0) AS REAL) + * SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-7 days') + THEN read_percentage + END + ) + ) + / ( + SUM( + CASE + WHEN + ga.start_time >= DATE('now', '-7 days') + THEN ga.duration + END + ) + / 60.0 + ), 0.0) + AS weekly_wpm + +FROM grouped_activity AS ga +INNER JOIN + current_progress AS cp + ON ga.user_id = cp.user_id AND ga.document_id = cp.document_id +INNER JOIN + documents AS d + ON ga.document_id = d.id +GROUP BY ga.document_id, ga.user_id +ORDER BY total_wpm DESC; diff --git a/database/manager.go b/database/manager.go index 51372fa..5f7023b 100644 --- a/database/manager.go +++ b/database/manager.go @@ -28,8 +28,11 @@ type DBManager struct { //go:embed schema.sql var ddl string -//go:embed views.sql -var views string +//go:embed user_streaks.sql +var user_streaks_view string + +//go:embed document_user_statistics.sql +var document_user_statistics string //go:embed migrations/* var migrations embed.FS @@ -105,7 +108,7 @@ func (dbm *DBManager) init() error { } // Execute views - if _, err := dbm.DB.Exec(views, nil); err != nil { + if _, err := dbm.DB.Exec(user_streaks_view, nil); err != nil { log.Panicf("Error executing views: %v", err) return err } @@ -154,11 +157,7 @@ func (dbm *DBManager) CacheTempTables() error { log.Debug("Cached 'user_streaks' in: ", time.Since(start)) start = time.Now() - document_statistics_sql := ` - DELETE FROM document_user_statistics; - INSERT INTO document_user_statistics SELECT * FROM view_document_user_statistics; - ` - if _, err := dbm.DB.ExecContext(dbm.Ctx, document_statistics_sql); err != nil { + if _, err := dbm.DB.ExecContext(dbm.Ctx, document_user_statistics); err != nil { return err } log.Debug("Cached 'document_user_statistics' in: ", time.Since(start)) diff --git a/database/models.go b/database/models.go index 8d0cdea..4bb9963 100644 --- a/database/models.go +++ b/database/models.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.26.0 +// sqlc v1.27.0 package database @@ -62,6 +62,7 @@ type DocumentUserStatistic struct { UserID string `json:"user_id"` Percentage float64 `json:"percentage"` LastRead string `json:"last_read"` + LastSeen string `json:"last_seen"` ReadPercentage float64 `json:"read_percentage"` TotalTimeSeconds int64 `json:"total_time_seconds"` TotalWordsRead int64 `json:"total_words_read"` diff --git a/database/query.sql.go b/database/query.sql.go index 0423180..3ee91bd 100644 --- a/database/query.sql.go +++ b/database/query.sql.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.26.0 +// sqlc v1.27.0 // source: query.sql package database diff --git a/database/schema.sql b/database/schema.sql index 5309116..2219832 100644 --- a/database/schema.sql +++ b/database/schema.sql @@ -118,30 +118,13 @@ CREATE TABLE IF NOT EXISTS settings ( created_at DATETIME NOT NULL DEFAULT (STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now')) ); ---------------------------------------------------------------- ------------------------ Temporary Tables ---------------------- ---------------------------------------------------------------- - --- Temporary User Streaks Table (Cached from View) -CREATE TEMPORARY TABLE IF NOT EXISTS user_streaks ( - user_id TEXT NOT NULL, - window TEXT NOT NULL, - - max_streak INTEGER NOT NULL, - max_streak_start_date TEXT NOT NULL, - max_streak_end_date TEXT NOT NULL, - - current_streak INTEGER NOT NULL, - current_streak_start_date TEXT NOT NULL, - current_streak_end_date TEXT NOT NULL -); - --- Temporary Document User Statistics Table (Cached from View) -CREATE TEMPORARY TABLE IF NOT EXISTS document_user_statistics ( +-- Document User Statistics Table +CREATE TABLE IF NOT EXISTS document_user_statistics ( document_id TEXT NOT NULL, user_id TEXT NOT NULL, percentage REAL NOT NULL, - last_read TEXT NOT NULL, + last_read DATETIME NOT NULL, + last_seen DATETIME NOT NULL, read_percentage REAL NOT NULL, total_time_seconds INTEGER NOT NULL, @@ -163,6 +146,23 @@ CREATE TEMPORARY TABLE IF NOT EXISTS document_user_statistics ( UNIQUE(document_id, user_id) ON CONFLICT REPLACE ); +--------------------------------------------------------------- +----------------------- Temporary Tables ---------------------- +--------------------------------------------------------------- + +-- Temporary User Streaks Table (Cached from View) +CREATE TEMPORARY TABLE IF NOT EXISTS user_streaks ( + user_id TEXT NOT NULL, + window TEXT NOT NULL, + + max_streak INTEGER NOT NULL, + max_streak_start_date TEXT NOT NULL, + max_streak_end_date TEXT NOT NULL, + + current_streak INTEGER NOT NULL, + current_streak_start_date TEXT NOT NULL, + current_streak_end_date TEXT NOT NULL +); --------------------------------------------------------------- --------------------------- Indexes --------------------------- @@ -176,7 +176,6 @@ CREATE INDEX IF NOT EXISTS activity_user_id_document_id ON activity ( ); DROP VIEW IF EXISTS view_user_streaks; -DROP VIEW IF EXISTS view_document_user_statistics; --------------------------------------------------------------- --------------------------- Triggers -------------------------- diff --git a/database/user_streaks.sql b/database/user_streaks.sql new file mode 100644 index 0000000..95eacfe --- /dev/null +++ b/database/user_streaks.sql @@ -0,0 +1,117 @@ +--------------------------------------------------------------- +---------------------------- Views ---------------------------- +--------------------------------------------------------------- + +-------------------------------- +--------- User Streaks --------- +-------------------------------- + +CREATE VIEW view_user_streaks AS + +WITH document_windows AS ( + SELECT + activity.user_id, + users.timezone, + DATE( + LOCAL_TIME(activity.start_time, users.timezone), + 'weekday 0', '-7 day' + ) AS weekly_read, + DATE(LOCAL_TIME(activity.start_time, users.timezone)) AS daily_read + FROM activity + LEFT JOIN users ON users.id = activity.user_id + GROUP BY activity.user_id, weekly_read, daily_read +), +weekly_partitions AS ( + SELECT + user_id, + timezone, + 'WEEK' AS "window", + weekly_read AS read_window, + row_number() OVER ( + PARTITION BY user_id ORDER BY weekly_read DESC + ) AS seqnum + FROM document_windows + GROUP BY user_id, weekly_read +), +daily_partitions AS ( + SELECT + user_id, + timezone, + 'DAY' AS "window", + daily_read AS read_window, + row_number() OVER ( + PARTITION BY user_id ORDER BY daily_read DESC + ) AS seqnum + FROM document_windows + GROUP BY user_id, daily_read +), +streaks AS ( + SELECT + COUNT(*) AS streak, + MIN(read_window) AS start_date, + MAX(read_window) AS end_date, + window, + user_id, + timezone + FROM daily_partitions + GROUP BY + timezone, + user_id, + DATE(read_window, '+' || seqnum || ' day') + + UNION ALL + + SELECT + COUNT(*) AS streak, + MIN(read_window) AS start_date, + MAX(read_window) AS end_date, + window, + user_id, + timezone + FROM weekly_partitions + GROUP BY + timezone, + user_id, + DATE(read_window, '+' || (seqnum * 7) || ' day') +), +max_streak AS ( + SELECT + MAX(streak) AS max_streak, + start_date AS max_streak_start_date, + end_date AS max_streak_end_date, + window, + user_id + FROM streaks + GROUP BY user_id, window +), +current_streak AS ( + SELECT + streak AS current_streak, + start_date AS current_streak_start_date, + end_date AS current_streak_end_date, + window, + user_id + FROM streaks + WHERE CASE + WHEN window = "WEEK" THEN + DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-14 day') = current_streak_end_date + OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-7 day') = current_streak_end_date + WHEN window = "DAY" THEN + DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), '-1 day') = current_streak_end_date + OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone)) = current_streak_end_date + END + GROUP BY user_id, window +) +SELECT + max_streak.user_id, + max_streak.window, + IFNULL(max_streak, 0) AS max_streak, + IFNULL(max_streak_start_date, "N/A") AS max_streak_start_date, + IFNULL(max_streak_end_date, "N/A") AS max_streak_end_date, + IFNULL(current_streak, 0) AS current_streak, + IFNULL(current_streak_start_date, "N/A") AS current_streak_start_date, + IFNULL(current_streak_end_date, "N/A") AS current_streak_end_date +FROM max_streak +LEFT JOIN current_streak ON + current_streak.user_id = max_streak.user_id + AND current_streak.window = max_streak.window; diff --git a/database/views.sql b/database/views.sql deleted file mode 100644 index b56178a..0000000 --- a/database/views.sql +++ /dev/null @@ -1,296 +0,0 @@ ---------------------------------------------------------------- ----------------------------- Views ---------------------------- ---------------------------------------------------------------- - --------------------------------- ---------- User Streaks --------- --------------------------------- - -CREATE VIEW view_user_streaks AS - -WITH document_windows AS ( - SELECT - activity.user_id, - users.timezone, - DATE( - LOCAL_TIME(activity.start_time, users.timezone), - 'weekday 0', '-7 day' - ) AS weekly_read, - DATE(LOCAL_TIME(activity.start_time, users.timezone)) AS daily_read - FROM activity - LEFT JOIN users ON users.id = activity.user_id - GROUP BY activity.user_id, weekly_read, daily_read -), -weekly_partitions AS ( - SELECT - user_id, - timezone, - 'WEEK' AS "window", - weekly_read AS read_window, - row_number() OVER ( - PARTITION BY user_id ORDER BY weekly_read DESC - ) AS seqnum - FROM document_windows - GROUP BY user_id, weekly_read -), -daily_partitions AS ( - SELECT - user_id, - timezone, - 'DAY' AS "window", - daily_read AS read_window, - row_number() OVER ( - PARTITION BY user_id ORDER BY daily_read DESC - ) AS seqnum - FROM document_windows - GROUP BY user_id, daily_read -), -streaks AS ( - SELECT - COUNT(*) AS streak, - MIN(read_window) AS start_date, - MAX(read_window) AS end_date, - window, - user_id, - timezone - FROM daily_partitions - GROUP BY - timezone, - user_id, - DATE(read_window, '+' || seqnum || ' day') - - UNION ALL - - SELECT - COUNT(*) AS streak, - MIN(read_window) AS start_date, - MAX(read_window) AS end_date, - window, - user_id, - timezone - FROM weekly_partitions - GROUP BY - timezone, - user_id, - DATE(read_window, '+' || (seqnum * 7) || ' day') -), -max_streak AS ( - SELECT - MAX(streak) AS max_streak, - start_date AS max_streak_start_date, - end_date AS max_streak_end_date, - window, - user_id - FROM streaks - GROUP BY user_id, window -), -current_streak AS ( - SELECT - streak AS current_streak, - start_date AS current_streak_start_date, - end_date AS current_streak_end_date, - window, - user_id - FROM streaks - WHERE CASE - WHEN window = "WEEK" THEN - DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-14 day') = current_streak_end_date - OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-7 day') = current_streak_end_date - WHEN window = "DAY" THEN - DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), '-1 day') = current_streak_end_date - OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone)) = current_streak_end_date - END - GROUP BY user_id, window -) -SELECT - max_streak.user_id, - max_streak.window, - IFNULL(max_streak, 0) AS max_streak, - IFNULL(max_streak_start_date, "N/A") AS max_streak_start_date, - IFNULL(max_streak_end_date, "N/A") AS max_streak_end_date, - IFNULL(current_streak, 0) AS current_streak, - IFNULL(current_streak_start_date, "N/A") AS current_streak_start_date, - IFNULL(current_streak_end_date, "N/A") AS current_streak_end_date -FROM max_streak -LEFT JOIN current_streak ON - current_streak.user_id = max_streak.user_id - AND current_streak.window = max_streak.window; - --------------------------------- -------- Document Stats --------- --------------------------------- - -CREATE VIEW view_document_user_statistics AS - -WITH intermediate_ga AS ( - SELECT - ga1.id AS row_id, - ga1.user_id, - ga1.document_id, - ga1.duration, - ga1.start_time, - ga1.start_percentage, - ga1.end_percentage, - - -- Find Overlapping Events (Assign Unique ID) - ( - SELECT MIN(id) - FROM activity AS ga2 - WHERE - ga1.document_id = ga2.document_id - AND ga1.user_id = ga2.user_id - AND ga1.start_percentage <= ga2.end_percentage - AND ga1.end_percentage >= ga2.start_percentage - ) AS group_leader - FROM activity AS ga1 -), - -grouped_activity AS ( - SELECT - user_id, - document_id, - MAX(start_time) AS start_time, - MIN(start_percentage) AS start_percentage, - MAX(end_percentage) AS end_percentage, - MAX(end_percentage) - MIN(start_percentage) AS read_percentage, - SUM(duration) AS duration - FROM intermediate_ga - GROUP BY group_leader -), - -current_progress AS ( - SELECT - user_id, - document_id, - COALESCE(( - SELECT percentage - FROM document_progress AS dp - WHERE - dp.user_id = iga.user_id - AND dp.document_id = iga.document_id - ORDER BY created_at DESC - LIMIT 1 - ), end_percentage) AS percentage - FROM intermediate_ga AS iga - GROUP BY user_id, document_id - HAVING MAX(start_time) -) - -SELECT - ga.document_id, - ga.user_id, - cp.percentage, - MAX(start_time) AS last_read, - SUM(read_percentage) AS read_percentage, - - -- All Time WPM - SUM(duration) AS total_time_seconds, - (CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage)) - AS total_words_read, - (CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage)) - / (SUM(duration) / 60.0) AS total_wpm, - - -- Yearly WPM - SUM(CASE WHEN start_time >= DATE('now', '-1 year') THEN duration ELSE 0 END) - AS yearly_time_seconds, - ( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-1 year') THEN read_percentage - ELSE 0 - END - ) - ) - AS yearly_words_read, - COALESCE(( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-1 year') THEN read_percentage - END - ) - ) - / ( - SUM( - CASE - WHEN start_time >= DATE('now', '-1 year') THEN duration - END - ) - / 60.0 - ), 0.0) - AS yearly_wpm, - - -- Monthly WPM - SUM( - CASE WHEN start_time >= DATE('now', '-1 month') THEN duration ELSE 0 END - ) - AS monthly_time_seconds, - ( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-1 month') THEN read_percentage - ELSE 0 - END - ) - ) - AS monthly_words_read, - COALESCE(( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-1 month') THEN read_percentage - END - ) - ) - / ( - SUM( - CASE - WHEN start_time >= DATE('now', '-1 month') THEN duration - END - ) - / 60.0 - ), 0.0) - AS monthly_wpm, - - -- Weekly WPM - SUM(CASE WHEN start_time >= DATE('now', '-7 days') THEN duration ELSE 0 END) - AS weekly_time_seconds, - ( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-7 days') THEN read_percentage - ELSE 0 - END - ) - ) - AS weekly_words_read, - COALESCE(( - CAST(COALESCE(d.words, 0.0) AS REAL) - * SUM( - CASE - WHEN start_time >= DATE('now', '-7 days') THEN read_percentage - END - ) - ) - / ( - SUM( - CASE - WHEN start_time >= DATE('now', '-7 days') THEN duration - END - ) - / 60.0 - ), 0.0) - AS weekly_wpm - -FROM grouped_activity AS ga -INNER JOIN - current_progress AS cp - ON ga.user_id = cp.user_id AND ga.document_id = cp.document_id -INNER JOIN - documents AS d - ON ga.document_id = d.id -GROUP BY ga.document_id, ga.user_id -ORDER BY total_wpm DESC;