improve(search): progress & retries
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Evan Reichard 2024-12-01 17:02:53 -05:00
parent 3d61d0f5ef
commit 841b29c425
6 changed files with 256 additions and 231 deletions

View File

@ -23,7 +23,6 @@ import (
"reichard.io/antholume/database" "reichard.io/antholume/database"
"reichard.io/antholume/metadata" "reichard.io/antholume/metadata"
"reichard.io/antholume/search" "reichard.io/antholume/search"
"reichard.io/antholume/utils"
) )
type backupType string type backupType string
@ -740,57 +739,50 @@ func (api *API) appSaveNewDocument(c *gin.Context) {
} }
// Send Message // Send Message
sendDownloadMessage("Downloading document...", gin.H{"Progress": 10}) sendDownloadMessage("Downloading document...", gin.H{"Progress": 1})
// Scaled Download Function
lastTime := time.Now()
downloadFunc := func(p float32) {
nowTime := time.Now()
if nowTime.Before(lastTime.Add(time.Millisecond * 500)) {
return
}
scaledProgress := int((p * 95 / 100) + 2)
sendDownloadMessage("Downloading document...", gin.H{"Progress": scaledProgress})
lastTime = nowTime
}
// Save Book // Save Book
tempFilePath, err := search.SaveBook(rDocAdd.ID, rDocAdd.Source) tempFilePath, metadata, err := search.SaveBook(rDocAdd.ID, rDocAdd.Source, downloadFunc)
if err != nil { if err != nil {
log.Warn("Temp File Error: ", err) log.Warn("Save Book Error: ", err)
sendDownloadMessage("Unable to download file", gin.H{"Error": true}) sendDownloadMessage("Unable to download file", gin.H{"Error": true})
return return
} }
// Send Message // Send Message
sendDownloadMessage("Calculating partial MD5...", gin.H{"Progress": 60}) sendDownloadMessage("Saving document...", gin.H{"Progress": 98})
// Calculate Partial MD5 ID // Derive Author / Title
partialMD5, err := utils.CalculatePartialMD5(tempFilePath) docAuthor := "Unknown"
if err != nil { if *metadata.Author != "" {
log.Warn("Partial MD5 Error: ", err) docAuthor = *metadata.Author
sendDownloadMessage("Unable to calculate partial MD5", gin.H{"Error": true}) } else if *rDocAdd.Author != "" {
docAuthor = *rDocAdd.Author
} }
// Send Message docTitle := "Unknown"
sendDownloadMessage("Saving file...", gin.H{"Progress": 60}) if *metadata.Title != "" {
docTitle = *metadata.Title
// Derive Extension on MIME } else if *rDocAdd.Title != "" {
fileMime, err := mimetype.DetectFile(tempFilePath) docTitle = *rDocAdd.Title
if err != nil {
log.Warn("MIME Detect Error: ", err)
sendDownloadMessage("Unable to download file", gin.H{"Error": true})
return
}
fileExtension := fileMime.Extension()
// Derive Filename
var fileName string
if *rDocAdd.Author != "" {
fileName = fileName + *rDocAdd.Author
} else {
fileName = fileName + "Unknown"
} }
if *rDocAdd.Title != "" { // Remove Slashes & Sanitize File Name
fileName = fileName + " - " + *rDocAdd.Title fileName := fmt.Sprintf("%s - %s", docAuthor, docTitle)
} else {
fileName = fileName + " - Unknown"
}
// Remove Slashes
fileName = strings.ReplaceAll(fileName, "/", "") fileName = strings.ReplaceAll(fileName, "/", "")
fileName = "." + filepath.Clean(fmt.Sprintf("/%s [%s]%s", fileName, *metadata.PartialMD5, metadata.Type))
// Derive & Sanitize File Name
fileName = "." + filepath.Clean(fmt.Sprintf("/%s [%s]%s", fileName, *partialMD5, fileExtension))
// Open Source File // Open Source File
sourceFile, err := os.Open(tempFilePath) sourceFile, err := os.Open(tempFilePath)
@ -822,37 +814,15 @@ func (api *API) appSaveNewDocument(c *gin.Context) {
} }
// Send Message // Send Message
sendDownloadMessage("Calculating MD5...", gin.H{"Progress": 70}) sendDownloadMessage("Saving to database...", gin.H{"Progress": 99})
// Get MD5 Hash
fileHash, err := getFileMD5(safePath)
if err != nil {
log.Error("Hash Failure: ", err)
sendDownloadMessage("Unable to calculate MD5", gin.H{"Error": true})
return
}
// Send Message
sendDownloadMessage("Calculating word count...", gin.H{"Progress": 80})
// Get Word Count
wordCount, err := metadata.GetWordCount(safePath)
if err != nil {
log.Error("Word Count Failure: ", err)
sendDownloadMessage("Unable to calculate word count", gin.H{"Error": true})
return
}
// Send Message
sendDownloadMessage("Saving to database...", gin.H{"Progress": 90})
// Upsert Document // Upsert Document
if _, err = api.db.Queries.UpsertDocument(api.db.Ctx, database.UpsertDocumentParams{ if _, err = api.db.Queries.UpsertDocument(api.db.Ctx, database.UpsertDocumentParams{
ID: *partialMD5, ID: *metadata.PartialMD5,
Title: rDocAdd.Title, Title: &docTitle,
Author: rDocAdd.Author, Author: &docAuthor,
Md5: fileHash, Md5: metadata.MD5,
Words: wordCount, Words: metadata.WordCount,
Filepath: &fileName, Filepath: &fileName,
Basepath: &basePath, Basepath: &basePath,
}); err != nil { }); err != nil {
@ -865,7 +835,7 @@ func (api *API) appSaveNewDocument(c *gin.Context) {
sendDownloadMessage("Download Success", gin.H{ sendDownloadMessage("Download Success", gin.H{
"Progress": 100, "Progress": 100,
"ButtonText": "Go to Book", "ButtonText": "Go to Book",
"ButtonHref": fmt.Sprintf("./documents/%s", *partialMD5), "ButtonHref": fmt.Sprintf("./documents/%s", *metadata.PartialMD5),
}) })
} }

View File

@ -3,6 +3,7 @@ package search
import ( import (
"fmt" "fmt"
"io" "io"
"net/url"
"regexp" "regexp"
"strings" "strings"
@ -11,50 +12,14 @@ import (
var commentRE = regexp.MustCompile(`(?s)<!--(.*?)-->`) var commentRE = regexp.MustCompile(`(?s)<!--(.*?)-->`)
func parseAnnasArchiveDownloadURL(body io.ReadCloser) (string, error) { func searchAnnasArchive(query string) ([]SearchItem, error) {
// Parse searchURL := "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en"
defer body.Close() url := fmt.Sprintf(searchURL, url.QueryEscape(query))
doc, _ := goquery.NewDocumentFromReader(body) body, err := getPage(url)
// Return Download URL
downloadPath, exists := doc.Find("body > table > tbody > tr > td > a").Attr("href")
if !exists {
return "", fmt.Errorf("Download URL not found")
}
// Possible Funky URL
downloadPath = strings.ReplaceAll(downloadPath, "\\", "/")
return fmt.Sprintf("http://libgen.li/%s", downloadPath), nil
}
// getAnnasArchiveBookSelection parses potentially commented out HTML. For some reason
// Annas Archive comments out blocks "below the fold". They aren't rendered until you
// scroll. This attempts to parse the commented out HTML.
func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection {
rawHTML, err := rawBook.Html()
if err != nil { if err != nil {
return rawBook return nil, err
} }
return parseAnnasArchive(body)
strippedHTML := strings.TrimSpace(rawHTML)
if !strings.HasPrefix(strippedHTML, "<!--") || !strings.HasSuffix(strippedHTML, "-->") {
return rawBook
}
allMatches := commentRE.FindAllStringSubmatch(strippedHTML, -1)
if len(allMatches) != 1 || len(allMatches[0]) != 2 {
return rawBook
}
captureGroup := allMatches[0][1]
docReader := strings.NewReader(captureGroup)
doc, err := goquery.NewDocumentFromReader(docReader)
if err != nil {
return rawBook
}
return doc.Selection
} }
func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) {
@ -107,3 +72,32 @@ func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) {
// Return Results // Return Results
return allEntries, nil return allEntries, nil
} }
// getAnnasArchiveBookSelection unwraps a search result whose markup has been
// commented out. Annas Archive comments out blocks "below the fold" and only
// renders them once the user scrolls, so the inner HTML of such a result is a
// single `<!-- ... -->` comment. When rawBook is exactly one such comment, the
// comment body is parsed and returned as a new selection; in every other case
// (including any error along the way) rawBook is returned untouched.
func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection {
	inner, err := rawBook.Html()
	if err != nil {
		return rawBook
	}

	// Only handle markup that is wrapped in a comment from start to end.
	trimmed := strings.TrimSpace(inner)
	wrapped := strings.HasPrefix(trimmed, "<!--") && strings.HasSuffix(trimmed, "-->")
	if !wrapped {
		return rawBook
	}

	// Require exactly one comment with exactly one capture group (its body).
	comments := commentRE.FindAllStringSubmatch(trimmed, -1)
	if len(comments) != 1 || len(comments[0]) != 2 {
		return rawBook
	}

	// Re-parse the comment body as its own document.
	parsed, err := goquery.NewDocumentFromReader(strings.NewReader(comments[0][1]))
	if err != nil {
		return rawBook
	}

	return parsed.Selection
}

69
search/downloaders.go Normal file
View File

@ -0,0 +1,69 @@
package search
import (
"errors"
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
)
// getLibGenDownloadURL fetches the libgen.li ads page for the given md5 and
// returns the download link found on it as a single-element slice. The source
// argument is ignored. An error is returned when the page cannot be fetched or
// parsed, or when the expected anchor has no href attribute.
func getLibGenDownloadURL(md5 string, _ Source) ([]string, error) {
	// Fetch the ads page for this hash.
	page, err := getPage("http://libgen.li/ads.php?md5=" + md5)
	if err != nil {
		return nil, err
	}
	defer page.Close()

	// Parse the HTML.
	parsed, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return nil, err
	}

	// Locate the download anchor.
	href, ok := parsed.Find("body > table > tbody > tr > td > a").Attr("href")
	if !ok {
		return nil, fmt.Errorf("Download URL not found")
	}

	// The href may contain backslashes; normalise them to forward slashes
	// before appending it to the host.
	normalized := strings.ReplaceAll(href, "\\", "/")
	return []string{fmt.Sprintf("http://libgen.li/%s", normalized)}, nil
}
// getLibraryDownloadURL resolves a download link through library.lol. The
// info page is chosen from source: fiction and Annas Archive items use the
// "fiction" path, non-fiction items use the "main" path, and any other source
// yields an "invalid source" error. On success the href of the `#download h2 a`
// anchor is returned as a single-element slice.
func getLibraryDownloadURL(md5 string, source Source) ([]string, error) {
	// Pick the info page prefix for this source.
	var prefix string
	switch source {
	case SOURCE_LIBGEN_NON_FICTION:
		prefix = "http://library.lol/main/"
	case SOURCE_LIBGEN_FICTION, SOURCE_ANNAS_ARCHIVE:
		prefix = "http://library.lol/fiction/"
	default:
		return nil, errors.New("invalid source")
	}

	// Fetch the info page.
	page, err := getPage(prefix + md5)
	if err != nil {
		return nil, err
	}
	defer page.Close()

	// Parse the HTML.
	parsed, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return nil, err
	}

	// Extract the download link. An alternative selector kept for reference:
	// "#download [href*=cloudflare]".
	href, ok := parsed.Find("#download h2 a").Attr("href")
	if !ok {
		return nil, errors.New("Download URL not found")
	}

	return []string{href}, nil
}

View File

@ -3,12 +3,23 @@ package search
import ( import (
"fmt" "fmt"
"io" "io"
"net/url"
"strings" "strings"
"time" "time"
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
// searchLibGenFiction queries the libgen.is fiction search (English, epub)
// for query and hands the response body to parseLibGenFiction. The query is
// URL-escaped before being placed in the request URL.
func searchLibGenFiction(query string) ([]SearchItem, error) {
	const searchFormat = "https://libgen.is/fiction/?q=%s&language=English&format=epub"

	page, err := getPage(fmt.Sprintf(searchFormat, url.QueryEscape(query)))
	if err != nil {
		return nil, err
	}

	return parseLibGenFiction(page)
}
func parseLibGenFiction(body io.ReadCloser) ([]SearchItem, error) { func parseLibGenFiction(body io.ReadCloser) ([]SearchItem, error) {
// Parse // Parse
defer body.Close() defer body.Close()
@ -62,6 +73,16 @@ func parseLibGenFiction(body io.ReadCloser) ([]SearchItem, error) {
return allEntries, nil return allEntries, nil
} }
// searchLibGenNonFiction queries the libgen.is non-fiction search for query
// and hands the response body to parseLibGenNonFiction. The query is
// URL-escaped before being placed in the request URL.
func searchLibGenNonFiction(query string) ([]SearchItem, error) {
	const searchFormat = "https://libgen.is/search.php?req=%s"

	page, err := getPage(fmt.Sprintf(searchFormat, url.QueryEscape(query)))
	if err != nil {
		return nil, err
	}

	return parseLibGenNonFiction(page)
}
func parseLibGenNonFiction(body io.ReadCloser) ([]SearchItem, error) { func parseLibGenNonFiction(body io.ReadCloser) ([]SearchItem, error) {
// Parse // Parse
defer body.Close() defer body.Close()
@ -106,18 +127,3 @@ func parseLibGenNonFiction(body io.ReadCloser) ([]SearchItem, error) {
// Return Results // Return Results
return allEntries, nil return allEntries, nil
} }
// parseLibGenDownloadURL reads an HTML page from body and returns the href of
// the `#download h2 a` anchor. The body is always closed before returning.
// An error is returned when the HTML cannot be parsed or when no matching
// anchor with an href attribute is present.
func parseLibGenDownloadURL(body io.ReadCloser) (string, error) {
	// Parse
	defer body.Close()
	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		// Previously this error was discarded, leaving doc unusable for the
		// lookup below; surface it to the caller instead.
		return "", fmt.Errorf("parsing download page: %w", err)
	}

	// Return Download URL. An alternative selector kept for reference:
	// "#download [href*=cloudflare]".
	downloadURL, exists := doc.Find("#download h2 a").Attr("href")
	if !exists {
		return "", fmt.Errorf("Download URL not found")
	}

	return downloadURL, nil
}

22
search/progress.go Normal file
View File

@ -0,0 +1,22 @@
package search
// writeCounter is an io.Writer that discards the data it is given and only
// keeps a running byte count, reporting progress as a percentage of Total.
type writeCounter struct {
	// Total is the expected number of bytes; it is the denominator of the
	// reported percentage.
	Total int64
	// Current is the number of bytes seen by Write so far.
	Current int64
	// ProgressFunction, when non-nil, receives the percentage after each Write.
	ProgressFunction func(float32)
}

// Write records len(p) bytes as processed and triggers a progress report.
// It never fails and always reports the full length of p as written.
func (wc *writeCounter) Write(p []byte) (int, error) {
	written := len(p)
	wc.Current += int64(written)
	wc.flushProgress()
	return written, nil
}

// flushProgress invokes ProgressFunction with Current as a percentage of
// Total. Nothing is reported when no callback is set or when Total is below
// 100000 bytes.
func (wc *writeCounter) flushProgress() {
	const minReportableTotal = 100000

	if wc.ProgressFunction != nil && wc.Total >= minReportableTotal {
		wc.ProgressFunction(float32(wc.Current) * 100 / float32(wc.Total))
	}
}

View File

@ -2,17 +2,18 @@ package search
import ( import (
"crypto/tls" "crypto/tls"
"errors"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"net/url"
"os" "os"
"time" "time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"reichard.io/antholume/metadata"
) )
const userAgent string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:127.0) Gecko/20100101 Firefox/127.0" const userAgent string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
type Cadence string type Cadence string
@ -21,13 +22,6 @@ const (
CADENCE_TOP_MONTH Cadence = "m" CADENCE_TOP_MONTH Cadence = "m"
) )
type BookType int
const (
BOOK_FICTION BookType = iota
BOOK_NON_FICTION
)
type Source string type Source string
const ( const (
@ -47,108 +41,58 @@ type SearchItem struct {
UploadDate string UploadDate string
} }
type sourceDef struct { type searchFunc func(query string) (searchResults []SearchItem, err error)
searchURL string type downloadFunc func(md5 string, source Source) (downloadURL []string, err error)
downloadURL string
parseSearchFunc func(io.ReadCloser) ([]SearchItem, error) var searchDefs = map[Source]searchFunc{
parseDownloadFunc func(io.ReadCloser) (string, error) SOURCE_ANNAS_ARCHIVE: searchAnnasArchive,
SOURCE_LIBGEN_FICTION: searchLibGenFiction,
SOURCE_LIBGEN_NON_FICTION: searchLibGenNonFiction,
} }
var sourceDefs = map[Source]sourceDef{ var downloadFuncs = []downloadFunc{
SOURCE_ANNAS_ARCHIVE: { getLibGenDownloadURL,
searchURL: "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en", getLibraryDownloadURL,
downloadURL: "http://libgen.li/ads.php?md5=%s",
parseSearchFunc: parseAnnasArchive,
parseDownloadFunc: parseAnnasArchiveDownloadURL,
},
SOURCE_LIBGEN_FICTION: {
searchURL: "https://libgen.is/fiction/?q=%s&language=English&format=epub",
downloadURL: "http://libgen.li/ads.php?md5=%s",
parseSearchFunc: parseLibGenFiction,
parseDownloadFunc: parseAnnasArchiveDownloadURL,
},
SOURCE_LIBGEN_NON_FICTION: {
searchURL: "https://libgen.is/search.php?req=%s",
downloadURL: "http://libgen.li/ads.php?md5=%s",
parseSearchFunc: parseLibGenNonFiction,
parseDownloadFunc: parseAnnasArchiveDownloadURL,
},
} }
func SearchBook(query string, source Source) ([]SearchItem, error) { func SearchBook(query string, source Source) ([]SearchItem, error) {
def := sourceDefs[source] searchFunc, found := searchDefs[source]
log.Debug("Source: ", def) if !found {
url := fmt.Sprintf(def.searchURL, url.QueryEscape(query)) return nil, fmt.Errorf("invalid source: %s", source)
body, err := getPage(url)
if err != nil {
return nil, err
} }
return def.parseSearchFunc(body) log.Debug("Source: ", source)
return searchFunc(query)
} }
func SaveBook(id string, source Source) (string, error) { func SaveBook(md5 string, source Source, progressFunc func(float32)) (string, *metadata.MetadataInfo, error) {
def := sourceDefs[source] for _, f := range downloadFuncs {
log.Debug("Source: ", def) downloadURLs, err := f(md5, source)
url := fmt.Sprintf(def.downloadURL, id)
body, err := getPage(url)
if err != nil { if err != nil {
return "", err log.Error("failed to acquire download urls")
continue
} }
bookURL, err := def.parseDownloadFunc(body) for _, bookURL := range downloadURLs {
if err != nil {
log.Error("Parse Download URL Error: ", err)
return "", fmt.Errorf("Download Failure")
}
// Create File
tempFile, err := os.CreateTemp("", "book")
if err != nil {
log.Error("File Create Error: ", err)
return "", fmt.Errorf("File Failure")
}
defer tempFile.Close()
// Download File // Download File
log.Info("Downloading Book: ", bookURL) log.Info("Downloading Book: ", bookURL)
resp, err := downloadBook(bookURL) fileName, err := downloadBook(bookURL, progressFunc)
if err != nil { if err != nil {
os.Remove(tempFile.Name())
log.Error("Book URL API Failure: ", err) log.Error("Book URL API Failure: ", err)
return "", fmt.Errorf("API Failure") continue
} }
defer resp.Body.Close()
// Copy File to Disk // Get Metadata
log.Info("Saving Book") metadata, err := metadata.GetMetadata(fileName)
_, err = io.Copy(tempFile, resp.Body)
if err != nil { if err != nil {
os.Remove(tempFile.Name()) log.Error("Book Metadata Failure: ", err)
log.Error("File Copy Error: ", err) continue
return "", fmt.Errorf("File Failure")
} }
return tempFile.Name(), nil return fileName, metadata, nil
} }
func GetBookURL(id string, bookType BookType) (string, error) {
// Derive Info URL
var infoURL string
if bookType == BOOK_FICTION {
infoURL = "http://library.lol/fiction/" + id
} else if bookType == BOOK_NON_FICTION {
infoURL = "http://library.lol/main/" + id
} }
// Parse & Derive Download URL return "", nil, errors.New("failed to download book")
body, err := getPage(infoURL)
if err != nil {
return "", err
}
// downloadURL := parseLibGenDownloadURL(body)
return parseLibGenDownloadURL(body)
} }
func getPage(page string) (io.ReadCloser, error) { func getPage(page string) (io.ReadCloser, error) {
@ -162,8 +106,6 @@ func getPage(page string) (io.ReadCloser, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
// Set User-Agent
req.Header.Set("User-Agent", userAgent) req.Header.Set("User-Agent", userAgent)
// Do Request // Do Request
@ -176,7 +118,7 @@ func getPage(page string) (io.ReadCloser, error) {
return resp.Body, err return resp.Body, err
} }
func downloadBook(bookURL string) (*http.Response, error) { func downloadBook(bookURL string, progressFunc func(float32)) (string, error) {
log.Debug("URL: ", bookURL) log.Debug("URL: ", bookURL)
// Allow Insecure // Allow Insecure
@ -189,11 +131,33 @@ func downloadBook(bookURL string) (*http.Response, error) {
// Start Request // Start Request
req, err := http.NewRequest("GET", bookURL, nil) req, err := http.NewRequest("GET", bookURL, nil)
if err != nil { if err != nil {
return nil, err return "", err
} }
// Set User-Agent
req.Header.Set("User-Agent", userAgent) req.Header.Set("User-Agent", userAgent)
return client.Do(req) // Perform API Request
resp, err := client.Do(req)
if err != nil {
return "", err
}
// Create File
tempFile, err := os.CreateTemp("", "book")
if err != nil {
log.Error("File Create Error: ", err)
return "", fmt.Errorf("failed to create temp file: %w", err)
}
defer tempFile.Close()
// Copy File to Disk
log.Info("Saving Book")
counter := &writeCounter{Total: resp.ContentLength, ProgressFunction: progressFunc}
_, err = io.Copy(tempFile, io.TeeReader(resp.Body, counter))
if err != nil {
os.Remove(tempFile.Name())
log.Error("File Copy Error: ", err)
return "", fmt.Errorf("failed to copy response to temp file: %w", err)
}
return tempFile.Name(), nil
} }