[new] count words & stats, [new] refactor metadata, [new] human readable time

This commit is contained in:
2023-10-01 19:17:22 -04:00
parent 5a8bdacf4f
commit 2a101c6cee
13 changed files with 816 additions and 239 deletions

330
metadata/epub.go Normal file
View File

@@ -0,0 +1,330 @@
/*
Package metadata provides book metadata lookup and basic support for reading
EPUB archives.
The EPUB reader is adapted from: https://github.com/taylorskalyo/goreader
*/
package metadata
import (
"archive/zip"
"bytes"
"encoding/xml"
"errors"
"io"
"os"
"path"
"strings"
"golang.org/x/net/html"
)
// containerPath is the name of the zip entry holding the epub's
// container.xml, which lists the rootfile (content.opf) locations.
const containerPath = "META-INF/container.xml"
// Sentinel errors returned while opening an epub or reading its items.
var (
// ErrNoRootfile occurs when there are no rootfile entries found in
// container.xml.
ErrNoRootfile = errors.New("epub: no rootfile found in container")
// ErrBadRootfile occurs when container.xml references a rootfile that does
// not exist in the zip.
ErrBadRootfile = errors.New("epub: container references non-existent rootfile")
// ErrNoItemref occurs when a content.opf contains a spine without any
// itemref entries.
ErrNoItemref = errors.New("epub: no itemrefs found in spine")
// ErrBadItemref occurs when an itemref entry in content.opf references an
// item that does not exist in the manifest.
ErrBadItemref = errors.New("epub: itemref references non-existent item")
// ErrBadManifest occurs when a manifest in content.opf references an item
// that does not exist in the zip.
ErrBadManifest = errors.New("epub: manifest references non-existent item")
)
// Reader represents a readable epub file. It embeds the parsed Container, so
// the epub's Rootfiles are reachable directly from the Reader.
type Reader struct {
Container
// files maps each zip entry name to its entry; it is built by init.
files map[string]*zip.File
}
// ReadCloser represents a readable epub file that can be closed. It is
// returned by OpenEPUBReader.
type ReadCloser struct {
Reader
// f is the underlying file opened by OpenEPUBReader; Close closes it.
f *os.File
}
// Rootfile contains the location of a content.opf package file together with
// the Package parsed from it.
type Rootfile struct {
// FullPath is the rootfile's "full-path" attribute from container.xml; it
// is used as the zip entry name of the package file.
FullPath string `xml:"full-path,attr"`
// Package is filled in by setPackages after container.xml has been read.
Package
}
// Container serves as a directory of Rootfiles. It is unmarshalled from
// META-INF/container.xml.
type Container struct {
Rootfiles []*Rootfile `xml:"rootfiles>rootfile"`
}
// Package represents an epub content.opf file: its publishing metadata, the
// manifest of files, and the spine that gives the reading order.
type Package struct {
Metadata
Manifest
Spine
}
// Metadata contains publishing information about the epub, unmarshalled from
// the child elements of the <metadata> element in content.opf.
type Metadata struct {
	Title    string `xml:"metadata>title"`
	Language string `xml:"metadata>language"`
	// Identifier was previously tagged "metadata>idenifier", which never
	// matched any element and left the field permanently empty.
	Identifier  string `xml:"metadata>identifier"`
	Creator     string `xml:"metadata>creator"`
	Contributor string `xml:"metadata>contributor"`
	Publisher   string `xml:"metadata>publisher"`
	Subject     string `xml:"metadata>subject"`
	Description string `xml:"metadata>description"`
	// Event holds every <date> element: Name is its "event" attribute and
	// Date is the element's raw inner XML.
	Event []struct {
		Name string `xml:"event,attr"`
		Date string `xml:",innerxml"`
	} `xml:"metadata>date"`
	Type     string `xml:"metadata>type"`
	Format   string `xml:"metadata>format"`
	Source   string `xml:"metadata>source"`
	Relation string `xml:"metadata>relation"`
	Coverage string `xml:"metadata>coverage"`
	Rights   string `xml:"metadata>rights"`
}
// Manifest lists every file that is part of the epub, as declared by the
// <manifest> element of content.opf.
type Manifest struct {
Items []Item `xml:"manifest>item"`
}
// Item represents a file stored in the epub.
type Item struct {
ID string `xml:"id,attr"`
// HREF is joined with the directory of the rootfile's FullPath to find the
// item's zip entry.
HREF string `xml:"href,attr"`
MediaType string `xml:"media-type,attr"`
// f is the zip entry backing this item, assigned by setItems. It stays nil
// when the zip has no entry for the item, in which case Open returns
// ErrBadManifest.
f *zip.File
}
// Spine defines the reading order of the epub documents, as declared by the
// <spine> element of content.opf.
type Spine struct {
Itemrefs []Itemref `xml:"spine>itemref"`
}
// Itemref points to an Item.
type Itemref struct {
// IDREF is the ID of the manifest Item this entry refers to.
IDREF string `xml:"idref,attr"`
// Item is resolved from IDREF by setItems; it is not read from the XML.
*Item
}
// OpenEPUBReader opens the epub file specified by name and returns a
// ReadCloser for it. The caller must call Close on the result when done.
//
// An error is returned if the file cannot be opened or stat'ed, is not a valid
// zip archive, or fails epub initialisation (container, packages, items). On
// every error path the underlying file is closed before returning, so no
// descriptor leaks.
func OpenEPUBReader(name string) (*ReadCloser, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}

	fi, err := f.Stat()
	if err != nil {
		f.Close()
		return nil, err
	}

	z, err := zip.NewReader(f, fi.Size())
	if err != nil {
		f.Close()
		return nil, err
	}

	rc := new(ReadCloser)
	rc.f = f
	if err = rc.init(z); err != nil {
		f.Close()
		return nil, err
	}

	return rc, nil
}
// NewReader builds a Reader on top of ra, treating it as a zip archive that is
// size bytes long. It fails if the archive cannot be parsed as a zip or if the
// epub structure inside it cannot be initialised.
func NewReader(ra io.ReaderAt, size int64) (*Reader, error) {
	archive, zipErr := zip.NewReader(ra, size)
	if zipErr != nil {
		return nil, zipErr
	}

	reader := &Reader{}
	if initErr := reader.init(archive); initErr != nil {
		return nil, initErr
	}
	return reader, nil
}
// init indexes the entries of z by name and then loads, in order, the
// container, the packages it references, and the item associations. The first
// failing step aborts initialisation and its error is returned.
func (r *Reader) init(z *zip.Reader) error {
	// Index every zip entry by name for later lookups.
	r.files = make(map[string]*zip.File)
	for _, entry := range z.File {
		r.files[entry.Name] = entry
	}

	if err := r.setContainer(); err != nil {
		return err
	}
	if err := r.setPackages(); err != nil {
		return err
	}
	return r.setItems()
}
// setContainer unmarshals the epub's container.xml file into r.Container.
//
// It returns an *os.PathError wrapping os.ErrNotExist when the archive has no
// META-INF/container.xml entry, any error from reading or parsing the entry,
// and ErrNoRootfile when the container lists no rootfiles.
func (r *Reader) setContainer() error {
	zf := r.files[containerPath]
	if zf == nil {
		// Calling Open on a nil *zip.File would panic; report the missing
		// entry as a regular error instead.
		return &os.PathError{Op: "open", Path: containerPath, Err: os.ErrNotExist}
	}

	f, err := zf.Open()
	if err != nil {
		return err
	}
	defer f.Close()

	var b bytes.Buffer
	if _, err = io.Copy(&b, f); err != nil {
		return err
	}

	if err = xml.Unmarshal(b.Bytes(), &r.Container); err != nil {
		return err
	}

	if len(r.Container.Rootfiles) < 1 {
		return ErrNoRootfile
	}

	return nil
}
// setPackages unmarshals each of the epub's content.opf files into the
// Package of the Rootfile that references it.
//
// It returns ErrBadRootfile when a rootfile's FullPath has no matching zip
// entry, or the first error hit while opening, reading or parsing an entry.
func (r *Reader) setPackages() error {
	for _, rf := range r.Container.Rootfiles {
		zf := r.files[rf.FullPath]
		if zf == nil {
			return ErrBadRootfile
		}

		f, err := zf.Open()
		if err != nil {
			return err
		}

		var b bytes.Buffer
		_, err = io.Copy(&b, f)
		// Close right away rather than with defer, so open entries do not
		// accumulate until the whole loop has finished.
		f.Close()
		if err != nil {
			return err
		}

		if err = xml.Unmarshal(b.Bytes(), &rf.Package); err != nil {
			return err
		}
	}

	return nil
}
// setItems links every spine Itemref to the manifest Item it names and every
// manifest Item to its zip entry (left nil when the zip has no such entry).
//
// It returns ErrBadItemref when an itemref names an ID missing from its
// rootfile's manifest, and ErrNoItemref when no rootfile has any itemrefs.
func (r *Reader) setItems() error {
	total := 0

	for _, rf := range r.Container.Rootfiles {
		// Manifest hrefs are resolved against the rootfile's directory.
		baseDir := path.Dir(rf.FullPath)

		byID := make(map[string]*Item)
		for idx := range rf.Manifest.Items {
			entry := &rf.Manifest.Items[idx]
			byID[entry.ID] = entry
			entry.f = r.files[path.Join(baseDir, entry.HREF)]
		}

		for idx := range rf.Spine.Itemrefs {
			ref := &rf.Spine.Itemrefs[idx]
			if ref.Item = byID[ref.IDREF]; ref.Item == nil {
				return ErrBadItemref
			}
		}

		total += len(rf.Spine.Itemrefs)
	}

	if total < 1 {
		return ErrNoItemref
	}
	return nil
}
// Open gives access to the Item's contents through an io.ReadCloser.
// Multiple items may be read concurrently.
// ErrBadManifest is returned when the Item has no backing zip entry.
func (item *Item) Open() (r io.ReadCloser, err error) {
	if item.f != nil {
		return item.f.Open()
	}
	return nil, ErrBadManifest
}
// Close closes the epub file, rendering it unusable for I/O.
// The error returned by the underlying file's Close is discarded.
func (rc *ReadCloser) Close() {
rc.f.Close()
}
// CountWords returns the total number of whitespace-separated words found in
// the text nodes of every document referenced by the rootfile's spine.
//
// Items that cannot be opened (for example when the manifest references a file
// missing from the zip) are skipped, because the signature has no way to
// report an error. Previously the open error was ignored and the nil reader
// caused a panic on first read. Each opened item is closed once counted.
func (rf *Rootfile) CountWords() int64 {
	var completeCount int64
	for _, item := range rf.Spine.Itemrefs {
		f, err := item.Open()
		if err != nil {
			continue
		}
		tokenizer := html.NewTokenizer(f)
		completeCount += countWords(*tokenizer)
		// Closed explicitly instead of deferred so readers do not pile up
		// across a long spine.
		f.Close()
	}

	return completeCount
}
// countWords walks the token stream and sums the whitespace-separated words of
// every text token. It returns the sum once the tokenizer reports io.EOF, and
// 0 if the tokenizer stops on any other error.
func countWords(tokenizer html.Tokenizer) int64 {
	var wordTotal int64

	for {
		switch tokenizer.Next() {
		case html.TextToken:
			text := tokenizer.Token().Data
			wordTotal += int64(len(strings.Fields(text)))
		case html.ErrorToken:
			switch tokErr := tokenizer.Err(); {
			case tokErr == io.EOF:
				return wordTotal
			case tokErr != nil:
				return 0
			}
		}
	}
}
/*
func main() {
rc, err := OpenEPUBReader("test.epub")
if err != nil {
log.Fatal(err)
}
rf := rc.Rootfiles[0]
totalWords := rf.CountWords()
log.Info("WOAH WORDS:", totalWords)
}
*/

200
metadata/gbooks.go Normal file
View File

@@ -0,0 +1,200 @@
package metadata
import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
log "github.com/sirupsen/logrus"
)
// gBooksIdentifiers is one entry of a volume's "industryIdentifiers" list in a
// Google Books API response. getGBooksMetadata looks for the Type values
// "ISBN_10" and "ISBN_13".
type gBooksIdentifiers struct {
Type string `json:"type"`
Identifier string `json:"identifier"`
}
// gBooksInfo is the "volumeInfo" object of a Google Books API volume.
type gBooksInfo struct {
Title string `json:"title"`
Authors []string `json:"authors"`
Description string `json:"description"`
Identifiers []gBooksIdentifiers `json:"industryIdentifiers"`
}
// gBooksQueryItem is a single Google Books volume: its ID plus its volumeInfo.
// It is decoded both from search result items and from a by-ID lookup.
type gBooksQueryItem struct {
ID string `json:"id"`
Info gBooksInfo `json:"volumeInfo"`
}
// gBooksQueryResponse is the decoded body of a Google Books volume search.
type gBooksQueryResponse struct {
TotalItems int `json:"totalItems"`
Items []gBooksQueryItem `json:"items"`
}
// GBOOKS_QUERY_URL is the volume search endpoint; %s is the search query.
const GBOOKS_QUERY_URL string = "https://www.googleapis.com/books/v1/volumes?q=%s"
// GBOOKS_GBID_INFO_URL is the single-volume endpoint; %s is the volume ID.
const GBOOKS_GBID_INFO_URL string = "https://www.googleapis.com/books/v1/volumes/%s"
// GBOOKS_GBID_COVER_URL is the front cover image URL; %s is the volume ID.
const GBOOKS_GBID_COVER_URL string = "https://books.google.com/books/content/images/frontcover/%s?fife=w480-h690"
// getGBooksMetadata looks up book metadata on Google Books and normalizes the
// results into MetadataInfo values.
//
// The first non-nil field of metadataSearch selects the lookup, in this order:
// ID (direct volume lookup), ISBN13, ISBN10, then Title and/or Author as a
// free-text search. If none is set, an "Invalid Data" error is returned.
// Errors from the underlying requests are returned unchanged.
//
// Every search query is URL-escaped before being sent. Previously only the
// title/author query was escaped, so an ISBN value containing reserved
// characters could corrupt the request URL.
func getGBooksMetadata(metadataSearch MetadataInfo) ([]MetadataInfo, error) {
	var queryResults []gBooksQueryItem

	switch {
	case metadataSearch.ID != nil:
		// Use GBID
		resp, err := performGBIDRequest(*metadataSearch.ID)
		if err != nil {
			return nil, err
		}
		queryResults = []gBooksQueryItem{*resp}

	case metadataSearch.ISBN13 != nil:
		searchQuery := url.QueryEscape("isbn:" + *metadataSearch.ISBN13)
		resp, err := performSearchRequest(searchQuery)
		if err != nil {
			return nil, err
		}
		queryResults = resp.Items

	case metadataSearch.ISBN10 != nil:
		searchQuery := url.QueryEscape("isbn:" + *metadataSearch.ISBN10)
		resp, err := performSearchRequest(searchQuery)
		if err != nil {
			return nil, err
		}
		queryResults = resp.Items

	case metadataSearch.Title != nil || metadataSearch.Author != nil:
		var searchQuery string
		if metadataSearch.Title != nil {
			searchQuery = searchQuery + *metadataSearch.Title
		}
		if metadataSearch.Author != nil {
			searchQuery = searchQuery + " " + *metadataSearch.Author
		}

		// Escape & Trim
		searchQuery = url.QueryEscape(strings.TrimSpace(searchQuery))
		resp, err := performSearchRequest(searchQuery)
		if err != nil {
			return nil, err
		}
		queryResults = resp.Items

	default:
		return nil, errors.New("Invalid Data")
	}

	// Normalize Data. The result is kept non-nil even when empty, as before.
	allMetadata := make([]MetadataInfo, 0, len(queryResults))
	for i := range queryResults {
		// Copy the element so the pointers stored below refer to a value
		// owned by this iteration only.
		item := queryResults[i]

		itemResult := MetadataInfo{
			ID:          &item.ID,
			Title:       &item.Info.Title,
			Description: &item.Info.Description,
		}

		if len(item.Info.Authors) > 0 {
			itemResult.Author = &item.Info.Authors[0]
		}

		// Keep the first ISBN-10 and the first ISBN-13 that appear.
		for j := range item.Info.Identifiers {
			if itemResult.ISBN10 != nil && itemResult.ISBN13 != nil {
				break
			}

			ident := item.Info.Identifiers[j]
			switch {
			case itemResult.ISBN10 == nil && ident.Type == "ISBN_10":
				itemResult.ISBN10 = &ident.Identifier
			case itemResult.ISBN13 == nil && ident.Type == "ISBN_13":
				itemResult.ISBN13 = &ident.Identifier
			}
		}

		allMetadata = append(allMetadata, itemResult)
	}

	return allMetadata, nil
}
// saveGBooksCover downloads the Google Books front cover for gbid and writes
// it to coverFilePath.
//
// If the file already exists and overwrite is false, nothing is downloaded and
// nil is returned. An "API Failure" error is returned when the request fails
// or the server does not answer 200 OK; a "File Failure" error is returned
// when the destination cannot be created or written.
//
// The download is started before the destination is created, so a failed
// request no longer leaves an empty file behind or truncates an existing
// cover. A file that was only partially written is removed.
func saveGBooksCover(gbid string, coverFilePath string, overwrite bool) error {
	// Validate File Doesn't Exists
	if _, err := os.Stat(coverFilePath); err == nil && !overwrite {
		log.Warn("[saveGBooksCover] File Alreads Exists")
		return nil
	}

	// Download File
	log.Info("[saveGBooksCover] Downloading Cover")
	coverURL := fmt.Sprintf(GBOOKS_GBID_COVER_URL, url.PathEscape(gbid))
	resp, err := http.Get(coverURL)
	if err != nil {
		log.Error("[saveGBooksCover] Cover URL API Failure")
		return errors.New("API Failure")
	}
	defer resp.Body.Close()

	// A non-200 body is an error page, not an image; do not save it.
	if resp.StatusCode != http.StatusOK {
		log.Error("[saveGBooksCover] Cover URL API Failure")
		return errors.New("API Failure")
	}

	// Create File
	out, err := os.Create(coverFilePath)
	if err != nil {
		log.Error("[saveGBooksCover] File Create Error")
		return errors.New("File Failure")
	}

	// Copy File to Disk
	log.Info("[saveGBooksCover] Saving Cover")
	_, copyErr := io.Copy(out, resp.Body)
	closeErr := out.Close()
	if copyErr != nil || closeErr != nil {
		// Best-effort cleanup of the incomplete file; the copy/close failure
		// is the error that gets reported.
		os.Remove(coverFilePath)
		log.Error("[saveGBooksCover] File Copy Error")
		return errors.New("File Failure")
	}

	return nil
}
// performSearchRequest runs a Google Books volume search and decodes the
// response. searchQuery is inserted into the URL verbatim, so the caller must
// have URL-escaped it already.
//
// It returns an "API Failure" error when the request fails, the server does
// not answer 200 OK, or the body cannot be decoded, and a "No Results" error
// when the search returns no items. The response body is always closed.
func performSearchRequest(searchQuery string) (*gBooksQueryResponse, error) {
	apiQuery := fmt.Sprintf(GBOOKS_QUERY_URL, searchQuery)
	log.Info("[performSearchRequest] Acquiring Metadata: ", apiQuery)

	resp, err := http.Get(apiQuery)
	if err != nil {
		log.Error("[performSearchRequest] Google Books Query URL API Failure")
		return nil, errors.New("API Failure")
	}
	// Close the body on every path; it was previously leaked.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Error("[performSearchRequest] Google Books Query URL API Failure")
		return nil, errors.New("API Failure")
	}

	parsedResp := gBooksQueryResponse{}
	if err = json.NewDecoder(resp.Body).Decode(&parsedResp); err != nil {
		log.Error("[performSearchRequest] Google Books Query API Decode Failure")
		return nil, errors.New("API Failure")
	}

	if len(parsedResp.Items) == 0 {
		log.Warn("[performSearchRequest] No Results")
		return nil, errors.New("No Results")
	}

	return &parsedResp, nil
}
// performGBIDRequest fetches a single Google Books volume by its ID and
// decodes the response. The ID is path-escaped before being put in the URL.
//
// It returns an "API Failure" error when the request fails, the server does
// not answer 200 OK, or the body cannot be decoded. Previously a non-200
// answer (for example an unknown ID) decoded into an empty item and was
// returned as a success. The response body is always closed.
func performGBIDRequest(id string) (*gBooksQueryItem, error) {
	apiQuery := fmt.Sprintf(GBOOKS_GBID_INFO_URL, url.PathEscape(id))

	log.Info("[performGBIDRequest] Acquiring CoverID")
	resp, err := http.Get(apiQuery)
	if err != nil {
		log.Error("[performGBIDRequest] Cover URL API Failure")
		return nil, errors.New("API Failure")
	}
	// Close the body on every path; it was previously leaked.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		log.Error("[performGBIDRequest] Cover URL API Failure")
		return nil, errors.New("API Failure")
	}

	parsedResp := gBooksQueryItem{}
	if err = json.NewDecoder(resp.Body).Decode(&parsedResp); err != nil {
		log.Error("[performGBIDRequest] Google Books ID API Decode Failure")
		return nil, errors.New("API Failure")
	}

	return &parsedResp, nil
}

View File

@@ -1,217 +1,72 @@
package metadata
import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
log "github.com/sirupsen/logrus"
"github.com/gabriel-vasile/mimetype"
)
// Source identifies a metadata provider. SearchMetadata dispatches on it.
type Source int
const (
// GBOOK selects the Google Books provider.
GBOOK Source = iota
// OLIB is declared but SearchMetadata currently answers "Not implemented"
// for it. Presumably Open Library — confirm.
OLIB
)
// MetadataInfo carries book metadata both as a search request and as a search
// result. Every field is optional; the lookup code treats a nil pointer as
// "not set".
// NOTE(review): this declaration appears inside a diff view, so some of the
// fields below may belong to only the old or only the new version — confirm
// against the actual file.
type MetadataInfo struct {
ID *string
Title *string
Author *string
Description *string
GBID *string
OLID *string
ISBN10 *string
ISBN13 *string
}
type gBooksIdentifiers struct {
Type string `json:"type"`
Identifier string `json:"identifier"`
}
type gBooksInfo struct {
Title string `json:"title"`
Authors []string `json:"authors"`
Description string `json:"description"`
Identifiers []gBooksIdentifiers `json:"industryIdentifiers"`
}
type gBooksQueryItem struct {
ID string `json:"id"`
Info gBooksInfo `json:"volumeInfo"`
}
type gBooksQueryResponse struct {
TotalItems int `json:"totalItems"`
Items []gBooksQueryItem `json:"items"`
}
const GBOOKS_QUERY_URL string = "https://www.googleapis.com/books/v1/volumes?q=%s"
const GBOOKS_GBID_INFO_URL string = "https://www.googleapis.com/books/v1/volumes/%s"
const GBOOKS_GBID_COVER_URL string = "https://books.google.com/books/content/images/frontcover/%s?fife=w480-h690"
func GetMetadata(metadataSearch MetadataInfo) ([]MetadataInfo, error) {
var queryResults []gBooksQueryItem
if metadataSearch.GBID != nil {
// Use GBID
resp, err := performGBIDRequest(*metadataSearch.GBID)
if err != nil {
return nil, err
}
queryResults = []gBooksQueryItem{*resp}
} else if metadataSearch.ISBN13 != nil {
searchQuery := "isbn:" + *metadataSearch.ISBN13
resp, err := performSearchRequest(searchQuery)
if err != nil {
return nil, err
}
queryResults = resp.Items
} else if metadataSearch.ISBN10 != nil {
searchQuery := "isbn:" + *metadataSearch.ISBN10
resp, err := performSearchRequest(searchQuery)
if err != nil {
return nil, err
}
queryResults = resp.Items
} else if metadataSearch.Title != nil || metadataSearch.Author != nil {
var searchQuery string
if metadataSearch.Title != nil {
searchQuery = searchQuery + *metadataSearch.Title
}
if metadataSearch.Author != nil {
searchQuery = searchQuery + " " + *metadataSearch.Author
}
// Escape & Trim
searchQuery = url.QueryEscape(strings.TrimSpace(searchQuery))
resp, err := performSearchRequest(searchQuery)
if err != nil {
return nil, err
}
queryResults = resp.Items
} else {
return nil, errors.New("Invalid Data")
}
// Normalize Data
allMetadata := []MetadataInfo{}
for i := range queryResults {
item := queryResults[i] // Range Value Pointer Issue
itemResult := MetadataInfo{
GBID: &item.ID,
Title: &item.Info.Title,
Description: &item.Info.Description,
}
if len(item.Info.Authors) > 0 {
itemResult.Author = &item.Info.Authors[0]
}
for i := range item.Info.Identifiers {
item := item.Info.Identifiers[i] // Range Value Pointer Issue
if itemResult.ISBN10 != nil && itemResult.ISBN13 != nil {
break
} else if itemResult.ISBN10 == nil && item.Type == "ISBN_10" {
itemResult.ISBN10 = &item.Identifier
} else if itemResult.ISBN13 == nil && item.Type == "ISBN_13" {
itemResult.ISBN13 = &item.Identifier
}
}
allMetadata = append(allMetadata, itemResult)
}
return allMetadata, nil
}
func SaveCover(gbid string, coverDir string, documentID string, overwrite bool) (*string, error) {
// Google Books -> JPG
func CacheCover(gbid string, coverDir string, documentID string, overwrite bool) (*string, error) {
// Get Filepath
coverFile := "." + filepath.Clean(fmt.Sprintf("/%s.jpg", documentID))
coverFilePath := filepath.Join(coverDir, coverFile)
// Validate File Doesn't Exists
_, err := os.Stat(coverFilePath)
if err == nil && overwrite == false {
log.Warn("[SaveCover] File Alreads Exists")
return &coverFile, nil
// Save Google Books
if err := saveGBooksCover(gbid, coverFilePath, overwrite); err != nil {
return nil, err
}
// Create File
out, err := os.Create(coverFilePath)
if err != nil {
log.Error("[SaveCover] File Create Error")
return nil, errors.New("File Failure")
}
defer out.Close()
// TODO - Refactor & Allow Open Library / Alternative Sources
// Download File
log.Info("[SaveCover] Downloading Cover")
coverURL := fmt.Sprintf(GBOOKS_GBID_COVER_URL, gbid)
resp, err := http.Get(coverURL)
if err != nil {
log.Error("[SaveCover] Cover URL API Failure")
return nil, errors.New("API Failure")
}
defer resp.Body.Close()
// Copy File to Disk
log.Info("[SaveCover] Saving Cover")
_, err = io.Copy(out, resp.Body)
if err != nil {
log.Error("[SaveCover] File Copy Error")
return nil, errors.New("File Failure")
}
// Return FilePath
return &coverFile, nil
}
func performSearchRequest(searchQuery string) (*gBooksQueryResponse, error) {
apiQuery := fmt.Sprintf(GBOOKS_QUERY_URL, searchQuery)
log.Info("[performSearchRequest] Acquiring Metadata: ", apiQuery)
resp, err := http.Get(apiQuery)
if err != nil {
log.Error("[performSearchRequest] Google Books Query URL API Failure")
return nil, errors.New("API Failure")
}
func SearchMetadata(s Source, metadataSearch MetadataInfo) ([]MetadataInfo, error) {
switch s {
case GBOOK:
return getGBooksMetadata(metadataSearch)
case OLIB:
return nil, errors.New("Not implemented")
default:
return nil, errors.New("Not implemented")
parsedResp := gBooksQueryResponse{}
err = json.NewDecoder(resp.Body).Decode(&parsedResp)
if err != nil {
log.Error("[performSearchRequest] Google Books Query API Decode Failure")
return nil, errors.New("API Failure")
}
if len(parsedResp.Items) == 0 {
log.Warn("[performSearchRequest] No Results")
return nil, errors.New("No Results")
}
return &parsedResp, nil
}
func performGBIDRequest(id string) (*gBooksQueryItem, error) {
apiQuery := fmt.Sprintf(GBOOKS_GBID_INFO_URL, id)
log.Info("[performGBIDRequest] Acquiring CoverID")
resp, err := http.Get(apiQuery)
func GetWordCount(filepath string) (int64, error) {
fileMime, err := mimetype.DetectFile(filepath)
if err != nil {
log.Error("[performGBIDRequest] Cover URL API Failure")
return nil, errors.New("API Failure")
return 0, err
}
parsedResp := gBooksQueryItem{}
err = json.NewDecoder(resp.Body).Decode(&parsedResp)
if err != nil {
log.Error("[performGBIDRequest] Google Books ID API Decode Failure")
return nil, errors.New("API Failure")
}
if fileExtension := fileMime.Extension(); fileExtension == ".epub" {
rc, err := OpenEPUBReader(filepath)
if err != nil {
return 0, err
}
return &parsedResp, nil
rf := rc.Rootfiles[0]
totalWords := rf.CountWords()
return totalWords, nil
} else {
return 0, errors.New("Invalid Extension")
}
}