Files
AnthoLume/metadata/epub.go
Evan Reichard f9f23f2d3f
Some checks failed
continuous-integration/drone/push Build is failing
fix: word count calculation
2025-11-12 19:13:04 -05:00

66 lines
1.6 KiB
Go

package metadata
import (
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/taylorskalyo/goreader/epub"
)
func getEPUBMetadata(filepath string) (*MetadataInfo, error) {
rc, err := epub.OpenReader(filepath)
if err != nil {
return nil, err
}
rf := rc.Rootfiles[0]
parsedMetadata := &MetadataInfo{
Type: TYPE_EPUB,
Title: &rf.Title,
Author: &rf.Creator,
Description: &rf.Description,
}
// Parse Possible ISBN
if rf.Source != "" {
replaceRE := regexp.MustCompile(`[-\s]`)
possibleISBN := replaceRE.ReplaceAllString(rf.Source, "")
// ISBN Matches
isbn13RE := regexp.MustCompile(`(?P<ISBN>\d{13})`)
isbn10RE := regexp.MustCompile(`(?P<ISBN>\d{10})`)
isbn13Matches := isbn13RE.FindStringSubmatch(possibleISBN)
isbn10Matches := isbn10RE.FindStringSubmatch(possibleISBN)
if len(isbn13Matches) > 0 {
isbnIndex := isbn13RE.SubexpIndex("ISBN")
parsedMetadata.ISBN13 = &isbn13Matches[isbnIndex]
} else if len(isbn10Matches) > 0 {
isbnIndex := isbn10RE.SubexpIndex("ISBN")
parsedMetadata.ISBN10 = &isbn10Matches[isbnIndex]
}
}
return parsedMetadata, nil
}
func countEPUBWords(filepath string) (int64, error) {
rc, err := epub.OpenReader(filepath)
if err != nil {
return 0, err
}
rf := rc.Rootfiles[0]
var completeCount int64
for _, item := range rf.Itemrefs {
f, _ := item.Open()
doc, _ := goquery.NewDocumentFromReader(f)
doc.Find("script, style, noscript, iframe").Remove()
words := len(strings.Fields(doc.Text()))
completeCount = completeCount + int64(words)
}
return completeCount, nil
}