diff --git a/api/app-routes.go b/api/app-routes.go index 9e24a08..a5f37d3 100644 --- a/api/app-routes.go +++ b/api/app-routes.go @@ -23,7 +23,6 @@ import ( "reichard.io/antholume/database" "reichard.io/antholume/metadata" "reichard.io/antholume/search" - "reichard.io/antholume/utils" ) type backupType string @@ -740,57 +739,50 @@ func (api *API) appSaveNewDocument(c *gin.Context) { } // Send Message - sendDownloadMessage("Downloading document...", gin.H{"Progress": 10}) + sendDownloadMessage("Downloading document...", gin.H{"Progress": 1}) + + // Scaled Download Function + lastTime := time.Now() + downloadFunc := func(p float32) { + nowTime := time.Now() + if nowTime.Before(lastTime.Add(time.Millisecond * 500)) { + return + } + scaledProgress := int((p * 95 / 100) + 2) + sendDownloadMessage("Downloading document...", gin.H{"Progress": scaledProgress}) + lastTime = nowTime + } // Save Book - tempFilePath, err := search.SaveBook(rDocAdd.ID, rDocAdd.Source) + tempFilePath, metadata, err := search.SaveBook(rDocAdd.ID, rDocAdd.Source, downloadFunc) if err != nil { - log.Warn("Temp File Error: ", err) + log.Warn("Save Book Error: ", err) sendDownloadMessage("Unable to download file", gin.H{"Error": true}) return } // Send Message - sendDownloadMessage("Calculating partial MD5...", gin.H{"Progress": 60}) + sendDownloadMessage("Saving document...", gin.H{"Progress": 98}) - // Calculate Partial MD5 ID - partialMD5, err := utils.CalculatePartialMD5(tempFilePath) - if err != nil { - log.Warn("Partial MD5 Error: ", err) - sendDownloadMessage("Unable to calculate partial MD5", gin.H{"Error": true}) + // Derive Author / Title + docAuthor := "Unknown" + if *metadata.Author != "" { + docAuthor = *metadata.Author + } else if *rDocAdd.Author != "" { + docAuthor = *rDocAdd.Author } - // Send Message - sendDownloadMessage("Saving file...", gin.H{"Progress": 60}) - - // Derive Extension on MIME - fileMime, err := mimetype.DetectFile(tempFilePath) - if err != nil { - log.Warn("MIME Detect Error: ", err) - sendDownloadMessage("Unable to download file", gin.H{"Error": true}) - return - } - fileExtension := fileMime.Extension() - - // Derive Filename - var fileName string - if *rDocAdd.Author != "" { - fileName = fileName + *rDocAdd.Author - } else { - fileName = fileName + "Unknown" + docTitle := "Unknown" + if *metadata.Title != "" { + docTitle = *metadata.Title + } else if *rDocAdd.Title != "" { + docTitle = *rDocAdd.Title } - if *rDocAdd.Title != "" { - fileName = fileName + " - " + *rDocAdd.Title - } else { - fileName = fileName + " - Unknown" - } - - // Remove Slashes + // Remove Slashes & Sanitize File Name + fileName := fmt.Sprintf("%s - %s", docAuthor, docTitle) fileName = strings.ReplaceAll(fileName, "/", "") - - // Derive & Sanitize File Name - fileName = "." + filepath.Clean(fmt.Sprintf("/%s [%s]%s", fileName, *partialMD5, fileExtension)) + fileName = "." + filepath.Clean(fmt.Sprintf("/%s [%s]%s", fileName, *metadata.PartialMD5, metadata.Type)) // Open Source File sourceFile, err := os.Open(tempFilePath) @@ -822,37 +814,15 @@ func (api *API) appSaveNewDocument(c *gin.Context) { } // Send Message - sendDownloadMessage("Calculating MD5...", gin.H{"Progress": 70}) - - // Get MD5 Hash - fileHash, err := getFileMD5(safePath) - if err != nil { - log.Error("Hash Failure: ", err) - sendDownloadMessage("Unable to calculate MD5", gin.H{"Error": true}) - return - } - - // Send Message - sendDownloadMessage("Calculating word count...", gin.H{"Progress": 80}) - - // Get Word Count - wordCount, err := metadata.GetWordCount(safePath) - if err != nil { - log.Error("Word Count Failure: ", err) - sendDownloadMessage("Unable to calculate word count", gin.H{"Error": true}) - return - } - - // Send Message - sendDownloadMessage("Saving to database...", gin.H{"Progress": 90}) + sendDownloadMessage("Saving to database...", gin.H{"Progress": 99}) // Upsert Document if _, err = api.db.Queries.UpsertDocument(api.db.Ctx, database.UpsertDocumentParams{ - ID: *partialMD5, - Title: rDocAdd.Title, - Author: rDocAdd.Author, - Md5: fileHash, - Words: wordCount, + ID: *metadata.PartialMD5, + Title: &docTitle, + Author: &docAuthor, + Md5: metadata.MD5, + Words: metadata.WordCount, Filepath: &fileName, Basepath: &basePath, }); err != nil { @@ -865,7 +835,7 @@ func (api *API) appSaveNewDocument(c *gin.Context) { sendDownloadMessage("Download Success", gin.H{ "Progress": 100, "ButtonText": "Go to Book", - "ButtonHref": fmt.Sprintf("./documents/%s", *partialMD5), + "ButtonHref": fmt.Sprintf("./documents/%s", *metadata.PartialMD5), }) } diff --git a/search/anna.go b/search/anna.go index aa57312..9fd2c6b 100644 --- a/search/anna.go +++ b/search/anna.go @@ -3,6 +3,7 @@ package search import ( "fmt" "io" + "net/url" "regexp" "strings" @@ -11,50 +12,14 @@ import ( var commentRE = regexp.MustCompile(`(?s)`) -func parseAnnasArchiveDownloadURL(body io.ReadCloser) (string, error) { - // Parse - defer body.Close() - doc, _ := goquery.NewDocumentFromReader(body) - - // Return Download URL - downloadPath, exists := doc.Find("body > table > tbody > tr > td > a").Attr("href") - if !exists { - return "", fmt.Errorf("Download URL not found") - } - - // Possible Funky URL - downloadPath = strings.ReplaceAll(downloadPath, "\\", "/") - - return fmt.Sprintf("http://libgen.li/%s", downloadPath), nil -} - -// getAnnasArchiveBookSelection parses potentially commented out HTML. For some reason -// Annas Archive comments out blocks "below the fold". They aren't rendered until you -// scroll. This attempts to parse the commented out HTML. -func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection { - rawHTML, err := rawBook.Html() +func searchAnnasArchive(query string) ([]SearchItem, error) { + searchURL := "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en" + url := fmt.Sprintf(searchURL, url.QueryEscape(query)) + body, err := getPage(url) if err != nil { - return rawBook + return nil, err } - - strippedHTML := strings.TrimSpace(rawHTML) - if !strings.HasPrefix(strippedHTML, "") { - return rawBook - } - - allMatches := commentRE.FindAllStringSubmatch(strippedHTML, -1) - if len(allMatches) != 1 || len(allMatches[0]) != 2 { - return rawBook - } - - captureGroup := allMatches[0][1] - docReader := strings.NewReader(captureGroup) - doc, err := goquery.NewDocumentFromReader(docReader) - if err != nil { - return rawBook - } - - return doc.Selection + return parseAnnasArchive(body) } func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { @@ -107,3 +72,32 @@ func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { // Return Results return allEntries, nil } + +// getAnnasArchiveBookSelection parses potentially commented out HTML. For some reason +// Annas Archive comments out blocks "below the fold". They aren't rendered until you +// scroll. This attempts to parse the commented out HTML. +func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection { + rawHTML, err := rawBook.Html() + if err != nil { + return rawBook + } + + strippedHTML := strings.TrimSpace(rawHTML) + if !strings.HasPrefix(strippedHTML, "") { + return rawBook + } + + allMatches := commentRE.FindAllStringSubmatch(strippedHTML, -1) + if len(allMatches) != 1 || len(allMatches[0]) != 2 { + return rawBook + } + + captureGroup := allMatches[0][1] + docReader := strings.NewReader(captureGroup) + doc, err := goquery.NewDocumentFromReader(docReader) + if err != nil { + return rawBook + } + + return doc.Selection +} diff --git a/search/downloaders.go b/search/downloaders.go new file mode 100644 index 0000000..ead8233 --- /dev/null +++ b/search/downloaders.go @@ -0,0 +1,69 @@ +package search + +import ( + "errors" + "fmt" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +func getLibGenDownloadURL(md5 string, _ Source) ([]string, error) { + // Get Page + body, err := getPage("http://libgen.li/ads.php?md5=" + md5) + if err != nil { + return nil, err + } + defer body.Close() + + // Parse + doc, err := goquery.NewDocumentFromReader(body) + if err != nil { + return nil, err + } + + // Return Download URL + downloadPath, exists := doc.Find("body > table > tbody > tr > td > a").Attr("href") + if !exists { + return nil, fmt.Errorf("Download URL not found") + } + + // Possible Funky URL + downloadPath = strings.ReplaceAll(downloadPath, "\\", "/") + return []string{fmt.Sprintf("http://libgen.li/%s", downloadPath)}, nil +} + +func getLibraryDownloadURL(md5 string, source Source) ([]string, error) { + // Derive Info URL + var infoURL string + switch source { + case SOURCE_LIBGEN_FICTION, SOURCE_ANNAS_ARCHIVE: + infoURL = "http://library.lol/fiction/" + md5 + case SOURCE_LIBGEN_NON_FICTION: + infoURL = "http://library.lol/main/" + md5 + default: + return nil, errors.New("invalid source") + } + + // Get Page + body, err := getPage(infoURL) + if err != nil { + return nil, err + } + defer body.Close() + + // Parse + doc, err := goquery.NewDocumentFromReader(body) + if err != nil { + return nil, err + } + + // Return Download URL + // downloadURL, _ := doc.Find("#download [href*=cloudflare]").Attr("href") + downloadURL, exists := doc.Find("#download h2 a").Attr("href") + if !exists { + return nil, errors.New("Download URL not found") + } + + return []string{downloadURL}, nil +} diff --git a/search/libgen.go b/search/libgen.go index 048a04b..9df7e2b 100644 --- a/search/libgen.go +++ b/search/libgen.go @@ -3,12 +3,23 @@ package search import ( "fmt" "io" + "net/url" "strings" "time" "github.com/PuerkitoBio/goquery" ) +func searchLibGenFiction(query string) ([]SearchItem, error) { + searchURL := "https://libgen.is/fiction/?q=%s&language=English&format=epub" + url := fmt.Sprintf(searchURL, url.QueryEscape(query)) + body, err := getPage(url) + if err != nil { + return nil, err + } + return parseLibGenFiction(body) +} + func parseLibGenFiction(body io.ReadCloser) ([]SearchItem, error) { // Parse defer body.Close() @@ -62,6 +73,16 @@ func parseLibGenFiction(body io.ReadCloser) ([]SearchItem, error) { return allEntries, nil } +func searchLibGenNonFiction(query string) ([]SearchItem, error) { + searchURL := "https://libgen.is/search.php?req=%s" + url := fmt.Sprintf(searchURL, url.QueryEscape(query)) + body, err := getPage(url) + if err != nil { + return nil, err + } + return parseLibGenNonFiction(body) +} + func parseLibGenNonFiction(body io.ReadCloser) ([]SearchItem, error) { // Parse defer body.Close() @@ -106,18 +127,3 @@ func parseLibGenNonFiction(body io.ReadCloser) ([]SearchItem, error) { // Return Results return allEntries, nil } - -func parseLibGenDownloadURL(body io.ReadCloser) (string, error) { - // Parse - defer body.Close() - doc, _ := goquery.NewDocumentFromReader(body) - - // Return Download URL - // downloadURL, _ := doc.Find("#download [href*=cloudflare]").Attr("href") - downloadURL, exists := doc.Find("#download h2 a").Attr("href") - if !exists { - return "", fmt.Errorf("Download URL not found") - } - - return downloadURL, nil -} diff --git a/search/progress.go b/search/progress.go new file mode 100644 index 0000000..828de45 --- /dev/null +++ b/search/progress.go @@ -0,0 +1,22 @@ +package search + +type writeCounter struct { + Total int64 + Current int64 + ProgressFunction func(float32) +} + +func (wc *writeCounter) Write(p []byte) (int, error) { + n := len(p) + wc.Current += int64(n) + wc.flushProgress() + return n, nil +} + +func (wc *writeCounter) flushProgress() { + if wc.ProgressFunction == nil || wc.Total < 100000 { + return + } + percentage := float32(wc.Current) * 100 / float32(wc.Total) + wc.ProgressFunction(percentage) +} diff --git a/search/search.go b/search/search.go index e1e425d..58c793d 100644 --- a/search/search.go +++ b/search/search.go @@ -2,17 +2,18 @@ package search import ( "crypto/tls" + "errors" "fmt" "io" "net/http" - "net/url" "os" "time" log "github.com/sirupsen/logrus" + "reichard.io/antholume/metadata" ) -const userAgent string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:127.0) Gecko/20100101 Firefox/127.0" +const userAgent string = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" type Cadence string @@ -21,13 +22,6 @@ const ( CADENCE_TOP_MONTH Cadence = "m" ) -type BookType int - -const ( - BOOK_FICTION BookType = iota - BOOK_NON_FICTION -) - type Source string const ( @@ -47,108 +41,58 @@ type SearchItem struct { UploadDate string } -type sourceDef struct { - searchURL string - downloadURL string - parseSearchFunc func(io.ReadCloser) ([]SearchItem, error) - parseDownloadFunc func(io.ReadCloser) (string, error) +type searchFunc func(query string) (searchResults []SearchItem, err error) +type downloadFunc func(md5 string, source Source) (downloadURL []string, err error) + +var searchDefs = map[Source]searchFunc{ + SOURCE_ANNAS_ARCHIVE: searchAnnasArchive, + SOURCE_LIBGEN_FICTION: searchLibGenFiction, + SOURCE_LIBGEN_NON_FICTION: searchLibGenNonFiction, } -var sourceDefs = map[Source]sourceDef{ - SOURCE_ANNAS_ARCHIVE: { - searchURL: "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en", - downloadURL: "http://libgen.li/ads.php?md5=%s", - parseSearchFunc: parseAnnasArchive, - parseDownloadFunc: parseAnnasArchiveDownloadURL, - }, - SOURCE_LIBGEN_FICTION: { - searchURL: "https://libgen.is/fiction/?q=%s&language=English&format=epub", - downloadURL: "http://libgen.li/ads.php?md5=%s", - parseSearchFunc: parseLibGenFiction, - parseDownloadFunc: parseAnnasArchiveDownloadURL, - }, - SOURCE_LIBGEN_NON_FICTION: { - searchURL: "https://libgen.is/search.php?req=%s", - downloadURL: "http://libgen.li/ads.php?md5=%s", - parseSearchFunc: parseLibGenNonFiction, - parseDownloadFunc: parseAnnasArchiveDownloadURL, - }, +var downloadFuncs = []downloadFunc{ + getLibGenDownloadURL, + getLibraryDownloadURL, } func SearchBook(query string, source Source) ([]SearchItem, error) { - def := sourceDefs[source] - log.Debug("Source: ", def) - url := fmt.Sprintf(def.searchURL, url.QueryEscape(query)) - body, err := getPage(url) - if err != nil { - return nil, err + searchFunc, found := searchDefs[source] + if !found { + return nil, fmt.Errorf("invalid source: %s", source) } - return def.parseSearchFunc(body) + log.Debug("Source: ", source) + return searchFunc(query) } -func SaveBook(id string, source Source) (string, error) { - def := sourceDefs[source] - log.Debug("Source: ", def) - url := fmt.Sprintf(def.downloadURL, id) +func SaveBook(md5 string, source Source, progressFunc func(float32)) (string, *metadata.MetadataInfo, error) { + for _, f := range downloadFuncs { + downloadURLs, err := f(md5, source) + if err != nil { + log.Error("failed to acquire download urls") + continue + } - body, err := getPage(url) - if err != nil { - return "", err + for _, bookURL := range downloadURLs { + // Download File + log.Info("Downloading Book: ", bookURL) + fileName, err := downloadBook(bookURL, progressFunc) + if err != nil { + log.Error("Book URL API Failure: ", err) + continue + } + + // Get Metadata + metadata, err := metadata.GetMetadata(fileName) + if err != nil { + log.Error("Book Metadata Failure: ", err) + continue + } + + return fileName, metadata, nil + } } - bookURL, err := def.parseDownloadFunc(body) - if err != nil { - log.Error("Parse Download URL Error: ", err) - return "", fmt.Errorf("Download Failure") - } - - // Create File - tempFile, err := os.CreateTemp("", "book") - if err != nil { - log.Error("File Create Error: ", err) - return "", fmt.Errorf("File Failure") - } - defer tempFile.Close() - - // Download File - log.Info("Downloading Book: ", bookURL) - resp, err := downloadBook(bookURL) - if err != nil { - os.Remove(tempFile.Name()) - log.Error("Book URL API Failure: ", err) - return "", fmt.Errorf("API Failure") - } - defer resp.Body.Close() - - // Copy File to Disk - log.Info("Saving Book") - _, err = io.Copy(tempFile, resp.Body) - if err != nil { - os.Remove(tempFile.Name()) - log.Error("File Copy Error: ", err) - return "", fmt.Errorf("File Failure") - } - - return tempFile.Name(), nil -} - -func GetBookURL(id string, bookType BookType) (string, error) { - // Derive Info URL - var infoURL string - if bookType == BOOK_FICTION { - infoURL = "http://library.lol/fiction/" + id - } else if bookType == BOOK_NON_FICTION { - infoURL = "http://library.lol/main/" + id - } - - // Parse & Derive Download URL - body, err := getPage(infoURL) - if err != nil { - return "", err - } - - // downloadURL := parseLibGenDownloadURL(body) - return parseLibGenDownloadURL(body) + return "", nil, errors.New("failed to download book") } func getPage(page string) (io.ReadCloser, error) { @@ -162,8 +106,6 @@ func getPage(page string) (io.ReadCloser, error) { if err != nil { return nil, err } - - // Set User-Agent req.Header.Set("User-Agent", userAgent) // Do Request @@ -176,7 +118,7 @@ func getPage(page string) (io.ReadCloser, error) { return resp.Body, err } -func downloadBook(bookURL string) (*http.Response, error) { +func downloadBook(bookURL string, progressFunc func(float32)) (string, error) { log.Debug("URL: ", bookURL) // Allow Insecure @@ -189,11 +131,33 @@ func downloadBook(bookURL string) (*http.Response, error) { // Start Request req, err := http.NewRequest("GET", bookURL, nil) if err != nil { - return nil, err + return "", err } - - // Set User-Agent req.Header.Set("User-Agent", userAgent) - return client.Do(req) + // Perform API Request + resp, err := client.Do(req) + if err != nil { + return "", err + } + + // Create File + tempFile, err := os.CreateTemp("", "book") + if err != nil { + log.Error("File Create Error: ", err) + return "", fmt.Errorf("failed to create temp file: %w", err) + } + defer tempFile.Close() + + // Copy File to Disk + log.Info("Saving Book") + counter := &writeCounter{Total: resp.ContentLength, ProgressFunction: progressFunc} + _, err = io.Copy(tempFile, io.TeeReader(resp.Body, counter)) + if err != nil { + os.Remove(tempFile.Name()) + log.Error("File Copy Error: ", err) + return "", fmt.Errorf("failed to copy response to temp file: %w", err) + } + + return tempFile.Name(), nil }