perf(db): incremental document stats cache
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Evan Reichard 2024-12-01 12:48:25 -05:00
parent 5e388730a5
commit 3d61d0f5ef
8 changed files with 377 additions and 329 deletions

View File

@ -1,6 +1,6 @@
// Code generated by sqlc. DO NOT EDIT. // Code generated by sqlc. DO NOT EDIT.
// versions: // versions:
// sqlc v1.26.0 // sqlc v1.27.0
package database package database

View File

@ -0,0 +1,228 @@
WITH new_activity AS (
SELECT
document_id,
user_id
FROM activity
WHERE
created_at > COALESCE(
(SELECT MAX(last_seen) FROM document_user_statistics),
'1970-01-01T00:00:00Z'
)
GROUP BY user_id, document_id
),
intermediate_ga AS (
SELECT
ga.id AS row_id,
ga.user_id,
ga.document_id,
ga.duration,
ga.start_time,
ga.start_percentage,
ga.end_percentage,
ga.created_at,
-- Find Overlapping Events (Assign Unique ID)
(
SELECT MIN(id)
FROM activity AS overlap
WHERE
ga.document_id = overlap.document_id
AND ga.user_id = overlap.user_id
AND ga.start_percentage <= overlap.end_percentage
AND ga.end_percentage >= overlap.start_percentage
) AS group_leader
FROM activity AS ga
INNER JOIN new_activity AS na
WHERE na.user_id = ga.user_id AND na.document_id = ga.document_id
),
grouped_activity AS (
SELECT
user_id,
document_id,
MAX(created_at) AS created_at,
MAX(start_time) AS start_time,
MIN(start_percentage) AS start_percentage,
MAX(end_percentage) AS end_percentage,
MAX(end_percentage) - MIN(start_percentage) AS read_percentage,
SUM(duration) AS duration
FROM intermediate_ga
GROUP BY group_leader
),
current_progress AS (
SELECT
user_id,
document_id,
COALESCE((
SELECT percentage
FROM document_progress AS dp
WHERE
dp.user_id = iga.user_id
AND dp.document_id = iga.document_id
ORDER BY created_at DESC
LIMIT 1
), end_percentage) AS percentage
FROM intermediate_ga AS iga
GROUP BY user_id, document_id
HAVING MAX(start_time)
)
INSERT INTO document_user_statistics
SELECT
ga.document_id,
ga.user_id,
cp.percentage,
MAX(ga.start_time) AS last_read,
MAX(ga.created_at) AS last_seen,
SUM(ga.read_percentage) AS read_percentage,
-- All Time WPM
SUM(ga.duration) AS total_time_seconds,
(CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage))
AS total_words_read,
(CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage))
/ (SUM(ga.duration) / 60.0) AS total_wpm,
-- Yearly WPM
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 year')
THEN ga.duration
ELSE 0
END
)
AS yearly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 year')
THEN read_percentage
ELSE 0
END
)
)
AS yearly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 year')
THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 year')
THEN ga.duration
END
)
/ 60.0
), 0.0)
AS yearly_wpm,
-- Monthly WPM
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 month')
THEN ga.duration
ELSE 0
END
)
AS monthly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 month')
THEN read_percentage
ELSE 0
END
)
)
AS monthly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 month')
THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-1 month')
THEN ga.duration
END
)
/ 60.0
), 0.0)
AS monthly_wpm,
-- Weekly WPM
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-7 days')
THEN ga.duration
ELSE 0
END
)
AS weekly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-7 days')
THEN read_percentage
ELSE 0
END
)
)
AS weekly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-7 days')
THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN
ga.start_time >= DATE('now', '-7 days')
THEN ga.duration
END
)
/ 60.0
), 0.0)
AS weekly_wpm
FROM grouped_activity AS ga
INNER JOIN
current_progress AS cp
ON ga.user_id = cp.user_id AND ga.document_id = cp.document_id
INNER JOIN
documents AS d
ON ga.document_id = d.id
GROUP BY ga.document_id, ga.user_id
ORDER BY total_wpm DESC;

View File

@ -28,8 +28,11 @@ type DBManager struct {
//go:embed schema.sql //go:embed schema.sql
var ddl string var ddl string
//go:embed views.sql //go:embed user_streaks.sql
var views string var user_streaks_view string
//go:embed document_user_statistics.sql
var document_user_statistics string
//go:embed migrations/* //go:embed migrations/*
var migrations embed.FS var migrations embed.FS
@ -105,7 +108,7 @@ func (dbm *DBManager) init() error {
} }
// Execute views // Execute views
if _, err := dbm.DB.Exec(views, nil); err != nil { if _, err := dbm.DB.Exec(user_streaks_view, nil); err != nil {
log.Panicf("Error executing views: %v", err) log.Panicf("Error executing views: %v", err)
return err return err
} }
@ -154,11 +157,7 @@ func (dbm *DBManager) CacheTempTables() error {
log.Debug("Cached 'user_streaks' in: ", time.Since(start)) log.Debug("Cached 'user_streaks' in: ", time.Since(start))
start = time.Now() start = time.Now()
document_statistics_sql := ` if _, err := dbm.DB.ExecContext(dbm.Ctx, document_user_statistics); err != nil {
DELETE FROM document_user_statistics;
INSERT INTO document_user_statistics SELECT * FROM view_document_user_statistics;
`
if _, err := dbm.DB.ExecContext(dbm.Ctx, document_statistics_sql); err != nil {
return err return err
} }
log.Debug("Cached 'document_user_statistics' in: ", time.Since(start)) log.Debug("Cached 'document_user_statistics' in: ", time.Since(start))

View File

@ -1,6 +1,6 @@
// Code generated by sqlc. DO NOT EDIT. // Code generated by sqlc. DO NOT EDIT.
// versions: // versions:
// sqlc v1.26.0 // sqlc v1.27.0
package database package database
@ -62,6 +62,7 @@ type DocumentUserStatistic struct {
UserID string `json:"user_id"` UserID string `json:"user_id"`
Percentage float64 `json:"percentage"` Percentage float64 `json:"percentage"`
LastRead string `json:"last_read"` LastRead string `json:"last_read"`
LastSeen string `json:"last_seen"`
ReadPercentage float64 `json:"read_percentage"` ReadPercentage float64 `json:"read_percentage"`
TotalTimeSeconds int64 `json:"total_time_seconds"` TotalTimeSeconds int64 `json:"total_time_seconds"`
TotalWordsRead int64 `json:"total_words_read"` TotalWordsRead int64 `json:"total_words_read"`

View File

@ -1,6 +1,6 @@
// Code generated by sqlc. DO NOT EDIT. // Code generated by sqlc. DO NOT EDIT.
// versions: // versions:
// sqlc v1.26.0 // sqlc v1.27.0
// source: query.sql // source: query.sql
package database package database

View File

@ -118,30 +118,13 @@ CREATE TABLE IF NOT EXISTS settings (
created_at DATETIME NOT NULL DEFAULT (STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now')) created_at DATETIME NOT NULL DEFAULT (STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'))
); );
--------------------------------------------------------------- -- Document User Statistics Table
----------------------- Temporary Tables ---------------------- CREATE TABLE IF NOT EXISTS document_user_statistics (
---------------------------------------------------------------
-- Temporary User Streaks Table (Cached from View)
CREATE TEMPORARY TABLE IF NOT EXISTS user_streaks (
user_id TEXT NOT NULL,
window TEXT NOT NULL,
max_streak INTEGER NOT NULL,
max_streak_start_date TEXT NOT NULL,
max_streak_end_date TEXT NOT NULL,
current_streak INTEGER NOT NULL,
current_streak_start_date TEXT NOT NULL,
current_streak_end_date TEXT NOT NULL
);
-- Temporary Document User Statistics Table (Cached from View)
CREATE TEMPORARY TABLE IF NOT EXISTS document_user_statistics (
document_id TEXT NOT NULL, document_id TEXT NOT NULL,
user_id TEXT NOT NULL, user_id TEXT NOT NULL,
percentage REAL NOT NULL, percentage REAL NOT NULL,
last_read TEXT NOT NULL, last_read DATETIME NOT NULL,
last_seen DATETIME NOT NULL,
read_percentage REAL NOT NULL, read_percentage REAL NOT NULL,
total_time_seconds INTEGER NOT NULL, total_time_seconds INTEGER NOT NULL,
@ -163,6 +146,23 @@ CREATE TEMPORARY TABLE IF NOT EXISTS document_user_statistics (
UNIQUE(document_id, user_id) ON CONFLICT REPLACE UNIQUE(document_id, user_id) ON CONFLICT REPLACE
); );
---------------------------------------------------------------
----------------------- Temporary Tables ----------------------
---------------------------------------------------------------
-- Temporary User Streaks Table (Cached from View)
CREATE TEMPORARY TABLE IF NOT EXISTS user_streaks (
user_id TEXT NOT NULL,
window TEXT NOT NULL,
max_streak INTEGER NOT NULL,
max_streak_start_date TEXT NOT NULL,
max_streak_end_date TEXT NOT NULL,
current_streak INTEGER NOT NULL,
current_streak_start_date TEXT NOT NULL,
current_streak_end_date TEXT NOT NULL
);
--------------------------------------------------------------- ---------------------------------------------------------------
--------------------------- Indexes --------------------------- --------------------------- Indexes ---------------------------
@ -176,7 +176,6 @@ CREATE INDEX IF NOT EXISTS activity_user_id_document_id ON activity (
); );
DROP VIEW IF EXISTS view_user_streaks; DROP VIEW IF EXISTS view_user_streaks;
DROP VIEW IF EXISTS view_document_user_statistics;
--------------------------------------------------------------- ---------------------------------------------------------------
--------------------------- Triggers -------------------------- --------------------------- Triggers --------------------------

117
database/user_streaks.sql Normal file
View File

@ -0,0 +1,117 @@
---------------------------------------------------------------
---------------------------- Views ----------------------------
---------------------------------------------------------------
--------------------------------
--------- User Streaks ---------
--------------------------------
CREATE VIEW view_user_streaks AS
WITH document_windows AS (
SELECT
activity.user_id,
users.timezone,
DATE(
LOCAL_TIME(activity.start_time, users.timezone),
'weekday 0', '-7 day'
) AS weekly_read,
DATE(LOCAL_TIME(activity.start_time, users.timezone)) AS daily_read
FROM activity
LEFT JOIN users ON users.id = activity.user_id
GROUP BY activity.user_id, weekly_read, daily_read
),
weekly_partitions AS (
SELECT
user_id,
timezone,
'WEEK' AS "window",
weekly_read AS read_window,
row_number() OVER (
PARTITION BY user_id ORDER BY weekly_read DESC
) AS seqnum
FROM document_windows
GROUP BY user_id, weekly_read
),
daily_partitions AS (
SELECT
user_id,
timezone,
'DAY' AS "window",
daily_read AS read_window,
row_number() OVER (
PARTITION BY user_id ORDER BY daily_read DESC
) AS seqnum
FROM document_windows
GROUP BY user_id, daily_read
),
streaks AS (
SELECT
COUNT(*) AS streak,
MIN(read_window) AS start_date,
MAX(read_window) AS end_date,
window,
user_id,
timezone
FROM daily_partitions
GROUP BY
timezone,
user_id,
DATE(read_window, '+' || seqnum || ' day')
UNION ALL
SELECT
COUNT(*) AS streak,
MIN(read_window) AS start_date,
MAX(read_window) AS end_date,
window,
user_id,
timezone
FROM weekly_partitions
GROUP BY
timezone,
user_id,
DATE(read_window, '+' || (seqnum * 7) || ' day')
),
max_streak AS (
SELECT
MAX(streak) AS max_streak,
start_date AS max_streak_start_date,
end_date AS max_streak_end_date,
window,
user_id
FROM streaks
GROUP BY user_id, window
),
current_streak AS (
SELECT
streak AS current_streak,
start_date AS current_streak_start_date,
end_date AS current_streak_end_date,
window,
user_id
FROM streaks
WHERE CASE
WHEN window = "WEEK" THEN
DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-14 day') = current_streak_end_date
OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-7 day') = current_streak_end_date
WHEN window = "DAY" THEN
DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), '-1 day') = current_streak_end_date
OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone)) = current_streak_end_date
END
GROUP BY user_id, window
)
SELECT
max_streak.user_id,
max_streak.window,
IFNULL(max_streak, 0) AS max_streak,
IFNULL(max_streak_start_date, "N/A") AS max_streak_start_date,
IFNULL(max_streak_end_date, "N/A") AS max_streak_end_date,
IFNULL(current_streak, 0) AS current_streak,
IFNULL(current_streak_start_date, "N/A") AS current_streak_start_date,
IFNULL(current_streak_end_date, "N/A") AS current_streak_end_date
FROM max_streak
LEFT JOIN current_streak ON
current_streak.user_id = max_streak.user_id
AND current_streak.window = max_streak.window;

View File

@ -1,296 +0,0 @@
---------------------------------------------------------------
---------------------------- Views ----------------------------
---------------------------------------------------------------
--------------------------------
--------- User Streaks ---------
--------------------------------
CREATE VIEW view_user_streaks AS
WITH document_windows AS (
SELECT
activity.user_id,
users.timezone,
DATE(
LOCAL_TIME(activity.start_time, users.timezone),
'weekday 0', '-7 day'
) AS weekly_read,
DATE(LOCAL_TIME(activity.start_time, users.timezone)) AS daily_read
FROM activity
LEFT JOIN users ON users.id = activity.user_id
GROUP BY activity.user_id, weekly_read, daily_read
),
weekly_partitions AS (
SELECT
user_id,
timezone,
'WEEK' AS "window",
weekly_read AS read_window,
row_number() OVER (
PARTITION BY user_id ORDER BY weekly_read DESC
) AS seqnum
FROM document_windows
GROUP BY user_id, weekly_read
),
daily_partitions AS (
SELECT
user_id,
timezone,
'DAY' AS "window",
daily_read AS read_window,
row_number() OVER (
PARTITION BY user_id ORDER BY daily_read DESC
) AS seqnum
FROM document_windows
GROUP BY user_id, daily_read
),
streaks AS (
SELECT
COUNT(*) AS streak,
MIN(read_window) AS start_date,
MAX(read_window) AS end_date,
window,
user_id,
timezone
FROM daily_partitions
GROUP BY
timezone,
user_id,
DATE(read_window, '+' || seqnum || ' day')
UNION ALL
SELECT
COUNT(*) AS streak,
MIN(read_window) AS start_date,
MAX(read_window) AS end_date,
window,
user_id,
timezone
FROM weekly_partitions
GROUP BY
timezone,
user_id,
DATE(read_window, '+' || (seqnum * 7) || ' day')
),
max_streak AS (
SELECT
MAX(streak) AS max_streak,
start_date AS max_streak_start_date,
end_date AS max_streak_end_date,
window,
user_id
FROM streaks
GROUP BY user_id, window
),
current_streak AS (
SELECT
streak AS current_streak,
start_date AS current_streak_start_date,
end_date AS current_streak_end_date,
window,
user_id
FROM streaks
WHERE CASE
WHEN window = "WEEK" THEN
DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-14 day') = current_streak_end_date
OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), 'weekday 0', '-7 day') = current_streak_end_date
WHEN window = "DAY" THEN
DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone), '-1 day') = current_streak_end_date
OR DATE(LOCAL_TIME(STRFTIME('%Y-%m-%dT%H:%M:%SZ', 'now'), timezone)) = current_streak_end_date
END
GROUP BY user_id, window
)
SELECT
max_streak.user_id,
max_streak.window,
IFNULL(max_streak, 0) AS max_streak,
IFNULL(max_streak_start_date, "N/A") AS max_streak_start_date,
IFNULL(max_streak_end_date, "N/A") AS max_streak_end_date,
IFNULL(current_streak, 0) AS current_streak,
IFNULL(current_streak_start_date, "N/A") AS current_streak_start_date,
IFNULL(current_streak_end_date, "N/A") AS current_streak_end_date
FROM max_streak
LEFT JOIN current_streak ON
current_streak.user_id = max_streak.user_id
AND current_streak.window = max_streak.window;
--------------------------------
------- Document Stats ---------
--------------------------------
CREATE VIEW view_document_user_statistics AS
WITH intermediate_ga AS (
SELECT
ga1.id AS row_id,
ga1.user_id,
ga1.document_id,
ga1.duration,
ga1.start_time,
ga1.start_percentage,
ga1.end_percentage,
-- Find Overlapping Events (Assign Unique ID)
(
SELECT MIN(id)
FROM activity AS ga2
WHERE
ga1.document_id = ga2.document_id
AND ga1.user_id = ga2.user_id
AND ga1.start_percentage <= ga2.end_percentage
AND ga1.end_percentage >= ga2.start_percentage
) AS group_leader
FROM activity AS ga1
),
grouped_activity AS (
SELECT
user_id,
document_id,
MAX(start_time) AS start_time,
MIN(start_percentage) AS start_percentage,
MAX(end_percentage) AS end_percentage,
MAX(end_percentage) - MIN(start_percentage) AS read_percentage,
SUM(duration) AS duration
FROM intermediate_ga
GROUP BY group_leader
),
current_progress AS (
SELECT
user_id,
document_id,
COALESCE((
SELECT percentage
FROM document_progress AS dp
WHERE
dp.user_id = iga.user_id
AND dp.document_id = iga.document_id
ORDER BY created_at DESC
LIMIT 1
), end_percentage) AS percentage
FROM intermediate_ga AS iga
GROUP BY user_id, document_id
HAVING MAX(start_time)
)
SELECT
ga.document_id,
ga.user_id,
cp.percentage,
MAX(start_time) AS last_read,
SUM(read_percentage) AS read_percentage,
-- All Time WPM
SUM(duration) AS total_time_seconds,
(CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage))
AS total_words_read,
(CAST(COALESCE(d.words, 0.0) AS REAL) * SUM(read_percentage))
/ (SUM(duration) / 60.0) AS total_wpm,
-- Yearly WPM
SUM(CASE WHEN start_time >= DATE('now', '-1 year') THEN duration ELSE 0 END)
AS yearly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-1 year') THEN read_percentage
ELSE 0
END
)
)
AS yearly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-1 year') THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN start_time >= DATE('now', '-1 year') THEN duration
END
)
/ 60.0
), 0.0)
AS yearly_wpm,
-- Monthly WPM
SUM(
CASE WHEN start_time >= DATE('now', '-1 month') THEN duration ELSE 0 END
)
AS monthly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-1 month') THEN read_percentage
ELSE 0
END
)
)
AS monthly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-1 month') THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN start_time >= DATE('now', '-1 month') THEN duration
END
)
/ 60.0
), 0.0)
AS monthly_wpm,
-- Weekly WPM
SUM(CASE WHEN start_time >= DATE('now', '-7 days') THEN duration ELSE 0 END)
AS weekly_time_seconds,
(
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-7 days') THEN read_percentage
ELSE 0
END
)
)
AS weekly_words_read,
COALESCE((
CAST(COALESCE(d.words, 0.0) AS REAL)
* SUM(
CASE
WHEN start_time >= DATE('now', '-7 days') THEN read_percentage
END
)
)
/ (
SUM(
CASE
WHEN start_time >= DATE('now', '-7 days') THEN duration
END
)
/ 60.0
), 0.0)
AS weekly_wpm
FROM grouped_activity AS ga
INNER JOIN
current_progress AS cp
ON ga.user_id = cp.user_id AND ga.document_id = cp.document_id
INNER JOIN
documents AS d
ON ga.document_id = d.id
GROUP BY ga.document_id, ga.user_id
ORDER BY total_wpm DESC;