2023-10-01 23:17:22 +00:00
|
|
|
package metadata
|
|
|
|
|
|
|
|
import (
|
|
|
|
"io"
|
|
|
|
"strings"
|
|
|
|
|
2023-10-24 00:18:16 +00:00
|
|
|
"github.com/taylorskalyo/goreader/epub"
|
2023-10-01 23:17:22 +00:00
|
|
|
"golang.org/x/net/html"
|
|
|
|
)
|
|
|
|
|
2023-10-24 00:18:16 +00:00
|
|
|
func countEPUBWords(filepath string) (int64, error) {
|
|
|
|
rc, err := epub.OpenReader(filepath)
|
2023-10-01 23:17:22 +00:00
|
|
|
if err != nil {
|
2023-10-24 00:18:16 +00:00
|
|
|
return 0, err
|
2023-10-01 23:17:22 +00:00
|
|
|
}
|
2023-10-24 00:18:16 +00:00
|
|
|
rf := rc.Rootfiles[0]
|
2023-10-01 23:17:22 +00:00
|
|
|
|
|
|
|
var completeCount int64
|
|
|
|
for _, item := range rf.Spine.Itemrefs {
|
|
|
|
f, _ := item.Open()
|
|
|
|
tokenizer := html.NewTokenizer(f)
|
2023-10-24 00:18:16 +00:00
|
|
|
newCount, err := countTokenizerWords(*tokenizer)
|
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
completeCount = completeCount + newCount
|
2023-10-01 23:17:22 +00:00
|
|
|
}
|
|
|
|
|
2023-10-24 00:18:16 +00:00
|
|
|
return completeCount, nil
|
2023-10-01 23:17:22 +00:00
|
|
|
}
|
|
|
|
|
2023-10-24 00:18:16 +00:00
|
|
|
func countTokenizerWords(tokenizer html.Tokenizer) (int64, error) {
|
2023-10-01 23:17:22 +00:00
|
|
|
var err error
|
|
|
|
var totalWords int64
|
|
|
|
for {
|
|
|
|
tokenType := tokenizer.Next()
|
|
|
|
token := tokenizer.Token()
|
|
|
|
if tokenType == html.TextToken {
|
|
|
|
currStr := string(token.Data)
|
|
|
|
totalWords = totalWords + int64(len(strings.Fields(currStr)))
|
|
|
|
} else if tokenType == html.ErrorToken {
|
|
|
|
err = tokenizer.Err()
|
|
|
|
}
|
|
|
|
if err == io.EOF {
|
2023-10-24 00:18:16 +00:00
|
|
|
return totalWords, nil
|
2023-10-01 23:17:22 +00:00
|
|
|
} else if err != nil {
|
2023-10-24 00:18:16 +00:00
|
|
|
return 0, err
|
2023-10-01 23:17:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|