From d4c8e4d2da17783ffcb9744f162ca0465e862e14 Mon Sep 17 00:00:00 2001 From: Evan Reichard Date: Sun, 11 Aug 2024 11:02:46 -0400 Subject: [PATCH] fix(search): broken parser & download source --- search/anna.go | 38 ++++++++++++++++++++++++++++++++++++-- search/search.go | 4 ++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/search/anna.go b/search/anna.go index c0a9eb2..3473c7e 100644 --- a/search/anna.go +++ b/search/anna.go @@ -3,11 +3,14 @@ package search import ( "fmt" "io" + "regexp" "strings" "github.com/PuerkitoBio/goquery" ) +var commentRE = regexp.MustCompile(`(?s)<!--(.*?)-->`) + func parseAnnasArchiveDownloadURL(body io.ReadCloser) (string, error) { // Parse defer body.Close() @@ -25,6 +28,35 @@ func parseAnnasArchiveDownloadURL(body io.ReadCloser) (string, error) { return downloadURL, nil } +// getAnnasArchiveBookSelection parses potentially commented out HTML. For some reason +// Annas Archive comments out blocks "below the fold". They aren't rendered until you +// scroll. This attempts to parse the commented out HTML. 
+func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection { + rawHTML, err := rawBook.Html() + if err != nil { + return rawBook + } + + strippedHTML := strings.TrimSpace(rawHTML) + if !strings.HasPrefix(strippedHTML, "<!--") || !strings.HasSuffix(strippedHTML, "-->") { + return rawBook + } + + allMatches := commentRE.FindAllStringSubmatch(strippedHTML, -1) + if len(allMatches) != 1 || len(allMatches[0]) != 2 { + return rawBook + } + + captureGroup := allMatches[0][1] + docReader := strings.NewReader(captureGroup) + doc, err := goquery.NewDocumentFromReader(docReader) + if err != nil { + return rawBook + } + + return doc.Selection +} + func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { // Parse defer body.Close() @@ -36,18 +68,20 @@ func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { // Normalize Results var allEntries []SearchItem doc.Find("form > div.w-full > div.w-full > div > div.justify-center").Each(func(ix int, rawBook *goquery.Selection) { + rawBook = getAnnasArchiveBookSelection(rawBook) + // Parse Details details := rawBook.Find("div:nth-child(2) > div:nth-child(1)").Text() detailsSplit := strings.Split(details, ", ") // Invalid Details - if len(detailsSplit) < 3 { + if len(detailsSplit) < 4 { return } language := detailsSplit[0] fileType := detailsSplit[1] - fileSize := detailsSplit[2] + fileSize := detailsSplit[3] // Get Title & Author title := rawBook.Find("h3").Text() diff --git a/search/search.go b/search/search.go index 02e4015..ebba671 100644 --- a/search/search.go +++ b/search/search.go @@ -57,9 +57,9 @@ type sourceDef struct { var sourceDefs = map[Source]sourceDef{ SOURCE_ANNAS_ARCHIVE: { searchURL: "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en", - downloadURL: "http://libgen.li/ads.php?md5=%s", + downloadURL: "http://library.lol/fiction/%s", parseSearchFunc: parseAnnasArchive, - parseDownloadFunc: parseAnnasArchiveDownloadURL, + parseDownloadFunc: parseLibGenDownloadURL, }, SOURCE_LIBGEN_FICTION: { searchURL: 
"https://libgen.is/fiction/?q=%s&language=English&format=epub",