diff --git a/search/anna.go b/search/anna.go index e68016f..a260aba 100644 --- a/search/anna.go +++ b/search/anna.go @@ -4,14 +4,11 @@ import ( "fmt" "io" "net/url" - "regexp" "strings" "github.com/PuerkitoBio/goquery" ) -var commentRE = regexp.MustCompile(`(?s)`) - func searchAnnasArchive(query string) ([]SearchItem, error) { searchURL := "https://annas-archive.org/search?index=&q=%s&ext=epub&sort=&lang=en" url := fmt.Sprintf(searchURL, url.QueryEscape(query)) @@ -32,62 +29,34 @@ func parseAnnasArchive(body io.ReadCloser) ([]SearchItem, error) { // Normalize Results var allEntries []SearchItem - doc.Find("#aarecord-list > div.justify-center").Each(func(ix int, rawBook *goquery.Selection) { - rawBook = getAnnasArchiveBookSelection(rawBook) + doc.Find(".js-aarecord-list-outer > div > div").Each(func(ix int, rawBook *goquery.Selection) { // Parse Details - details := rawBook.Find("div:nth-child(2) > div:nth-child(1)").Text() - detailsSplit := strings.Split(details, ", ") + details := rawBook.Find("div:nth-child(3)").Text() + detailsSplit := strings.Split(details, " · ") // Invalid Details - if len(detailsSplit) < 4 { + if len(detailsSplit) < 3 { return } // Parse MD5 - itemHref, _ := rawBook.Find("a").Attr("href") + titleAuthorDetails := rawBook.Find("div a") + titleEl := titleAuthorDetails.Eq(0) + itemHref, _ := titleEl.Attr("href") hrefArray := strings.Split(itemHref, "/") id := hrefArray[len(hrefArray)-1] allEntries = append(allEntries, SearchItem{ ID: id, - Title: rawBook.Find("h3").First().Text(), - Author: rawBook.Find("div:nth-child(2) > div:nth-child(4)").First().Text(), + Title: titleEl.Text(), + Author: titleAuthorDetails.Eq(1).Text(), Language: detailsSplit[0], FileType: detailsSplit[1], - FileSize: detailsSplit[3], + FileSize: detailsSplit[2], }) }) // Return Results return allEntries, nil } - -// getAnnasArchiveBookSelection parses potentially commented out HTML. For some reason -// Annas Archive comments out blocks "below the fold". 
They aren't rendered until you -// scroll. This attempts to parse the commented out HTML. -func getAnnasArchiveBookSelection(rawBook *goquery.Selection) *goquery.Selection { - rawHTML, err := rawBook.Html() - if err != nil { - return rawBook - } - - strippedHTML := strings.TrimSpace(rawHTML) - if !strings.HasPrefix(strippedHTML, "") { - return rawBook - } - - allMatches := commentRE.FindAllStringSubmatch(strippedHTML, -1) - if len(allMatches) != 1 || len(allMatches[0]) != 2 { - return rawBook - } - - captureGroup := allMatches[0][1] - docReader := strings.NewReader(captureGroup) - doc, err := goquery.NewDocumentFromReader(docReader) - if err != nil { - return rawBook - } - - return doc.Selection -}