fix: annas archive parsing

2025-08-17 17:04:03 -04:00
parent 7937890acd
commit 3cff965393
1 changed files with 10 additions and 41 deletions
--- a/search/anna.go
+++ b/search/anna.go
@@ -4,14 +4,11 @@ import (
 	"fmt"
 	"io"
 	"net/url"
 	"regexp"
 	"strings"
 	"github.com/PuerkitoBio/goquery"
 )
 // commentRE matches an HTML comment and captures the text between the opening
 // "<!--" and the closing "-->". The (?s) flag lets "." match newlines so that
 // multi-line comments are matched, and the lazy ".*?" stops at the first "-->".
 var commentRE = regexp.MustCompile(`(?s)<!--(.*?)-->`)
 func searchAnnasArchive(query string) ([]SearchItem, error) {
 	searchURL := "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en"
 	url := fmt.Sprintf(searchURL, url.QueryEscape(query))
@@ -32,62 +29,34 @@ func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) {
 	// Normalize Results
 	var allEntries []SearchItem
-	doc.Find("#aarecord-list > div.justify-center").Each(func(ix int, rawBook *goquery.Selection) {
+	doc.Find(".js-aarecord-list-outer > div > div").Each(func(ix int, rawBook *goquery.Selection) {
 		rawBook = getAnnasArchiveBookSelection(rawBook)
 		// Parse Details
-		details := rawBook.Find("div:nth-child(2) > div:nth-child(1)").Text()
+		details := rawBook.Find("div:nth-child(3)").Text()
-		detailsSplit := strings.Split(details, ", ")
+		detailsSplit := strings.Split(details, " · ")
 		// Invalid Details
-		if len(detailsSplit) < 4 {
+		if len(detailsSplit) < 3 {
 			return
 		}
 		// Parse MD5
-		itemHref, _ := rawBook.Find("a").Attr("href")
+		titleAuthorDetails := rawBook.Find("div a")
 		titleEl := titleAuthorDetails.Eq(0)
 		itemHref, _ := titleEl.Attr("href")
 		hrefArray := strings.Split(itemHref, "/")
 		id := hrefArray[len(hrefArray)-1]
 		allEntries = append(allEntries, SearchItem{
 			ID:       id,
-			Title:    rawBook.Find("h3").First().Text(),
+			Title:    titleEl.Text(),
-			Author:   rawBook.Find("div:nth-child(2) > div:nth-child(4)").First().Text(),
+			Author:   titleAuthorDetails.Eq(1).Text(),
 			Language: detailsSplit[0],
 			FileType: detailsSplit[1],
-			FileSize: detailsSplit[3],
+			FileSize: detailsSplit[2],
 		})
 	})
 	// Return Results
 	return allEntries, nil
 }
 // getAnnasArchiveBookSelection unwraps a search result whose markup has been
 // commented out. Annas Archive comments out the blocks "below the fold" and only
 // renders them once the page is scrolled, so the HTML of such an entry is a single
 // "<!-- ... -->" comment. When the entry has exactly that shape, the comment body
 // is parsed as its own document and that document's selection is returned. In
 // every other case (HTML cannot be read, entry is not comment-wrapped, unexpected
 // number of comments, or the parse fails) rawBook is returned unchanged.
 func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection {
 	inner, err := rawBook.Html()
 	if err != nil {
 		return rawBook
 	}
 	// Only entries that both start and end with comment markers are unwrapped.
 	trimmed := strings.TrimSpace(inner)
 	isWrapped := strings.HasPrefix(trimmed, "<!--") && strings.HasSuffix(trimmed, "-->")
 	if !isWrapped {
 		return rawBook
 	}
 	// Require exactly one comment, carrying the full match plus one capture group.
 	found := commentRE.FindAllStringSubmatch(trimmed, -1)
 	if len(found) != 1 || len(found[0]) != 2 {
 		return rawBook
 	}
 	// Parse the text that was inside the comment as a standalone document.
 	parsed, err := goquery.NewDocumentFromReader(strings.NewReader(found[0][1]))
 	if err != nil {
 		return rawBook
 	}
 	return parsed.Selection
 }