[new] count words & stats, [new] refactor metadata, [new] human readable time
This commit is contained in:
330
metadata/epub.go
Normal file
330
metadata/epub.go
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
Package metadata includes basic support for reading EPUB archives.
|
||||
Adapted from: https://github.com/taylorskalyo/goreader
|
||||
*/
|
||||
package metadata
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// containerPath is the name, inside the zip archive, of the container.xml
// file that setContainer reads to discover the epub's rootfiles.
const containerPath = "META-INF/container.xml"
|
||||
|
||||
// Sentinel errors reported while opening or reading an epub.
var (
	// ErrNoRootfile is returned when container.xml lists no rootfile entries.
	ErrNoRootfile = errors.New("epub: no rootfile found in container")

	// ErrBadRootfile is returned when container.xml points at a rootfile that
	// is missing from the zip archive.
	ErrBadRootfile = errors.New("epub: container references non-existent rootfile")

	// ErrNoItemref is returned when no spine in the epub contains any itemref
	// entries.
	ErrNoItemref = errors.New("epub: no itemrefs found in spine")

	// ErrBadItemref is returned when a spine itemref names an item that the
	// manifest does not declare.
	ErrBadItemref = errors.New("epub: itemref references non-existent item")

	// ErrBadManifest is returned when a manifest item points at a file that
	// is missing from the zip archive.
	ErrBadManifest = errors.New("epub: manifest references non-existent item")
)
|
||||
|
||||
// Reader represents a readable epub file.
|
||||
type Reader struct {
|
||||
Container
|
||||
files map[string]*zip.File
|
||||
}
|
||||
|
||||
// ReadCloser represents a readable epub file that can be closed.
|
||||
type ReadCloser struct {
|
||||
Reader
|
||||
f *os.File
|
||||
}
|
||||
|
||||
// Rootfile contains the location of a content.opf package file.
|
||||
type Rootfile struct {
|
||||
FullPath string `xml:"full-path,attr"`
|
||||
Package
|
||||
}
|
||||
|
||||
// Container serves as a directory of Rootfiles.
|
||||
type Container struct {
|
||||
Rootfiles []*Rootfile `xml:"rootfiles>rootfile"`
|
||||
}
|
||||
|
||||
// Package represents an epub content.opf file.
|
||||
type Package struct {
|
||||
Metadata
|
||||
Manifest
|
||||
Spine
|
||||
}
|
||||
|
||||
// Metadata contains publishing information about the epub, decoded from the
// children of the metadata element of a content.opf file.
type Metadata struct {
	Title    string `xml:"metadata>title"`
	Language string `xml:"metadata>language"`
	// Identifier is read from the identifier element. The tag was previously
	// misspelled ("idenifier"), which left this field permanently empty.
	Identifier  string `xml:"metadata>identifier"`
	Creator     string `xml:"metadata>creator"`
	Contributor string `xml:"metadata>contributor"`
	Publisher   string `xml:"metadata>publisher"`
	Subject     string `xml:"metadata>subject"`
	Description string `xml:"metadata>description"`
	// Event has one entry per date element: Name is the element's event
	// attribute and Date is its inner XML.
	Event []struct {
		Name string `xml:"event,attr"`
		Date string `xml:",innerxml"`
	} `xml:"metadata>date"`
	Type     string `xml:"metadata>type"`
	Format   string `xml:"metadata>format"`
	Source   string `xml:"metadata>source"`
	Relation string `xml:"metadata>relation"`
	Coverage string `xml:"metadata>coverage"`
	Rights   string `xml:"metadata>rights"`
}
|
||||
|
||||
// Manifest lists every file that is part of the epub.
|
||||
type Manifest struct {
|
||||
Items []Item `xml:"manifest>item"`
|
||||
}
|
||||
|
||||
// Item describes one file stored in the epub, as declared by a manifest
// item element.
type Item struct {
	// ID is the item's id attribute; spine itemrefs refer to it.
	ID string `xml:"id,attr"`
	// HREF is the item's href attribute.
	HREF string `xml:"href,attr"`
	// MediaType is the item's media-type attribute.
	MediaType string `xml:"media-type,attr"`

	// f is the matching zip entry, or nil when the archive has none.
	f *zip.File
}
|
||||
|
||||
// Spine defines the reading order of the epub documents.
|
||||
type Spine struct {
|
||||
Itemrefs []Itemref `xml:"spine>itemref"`
|
||||
}
|
||||
|
||||
// Itemref points to an Item.
|
||||
type Itemref struct {
|
||||
IDREF string `xml:"idref,attr"`
|
||||
*Item
|
||||
}
|
||||
|
||||
// OpenEPUBReader will open the epub file specified by name and return a
|
||||
// ReadCloser.
|
||||
func OpenEPUBReader(name string) (*ReadCloser, error) {
|
||||
f, err := os.Open(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rc := new(ReadCloser)
|
||||
rc.f = f
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
f.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
z, err := zip.NewReader(f, fi.Size())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err = rc.init(z); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return rc, nil
|
||||
}
|
||||
|
||||
// NewReader returns a new Reader reading from ra, which is assumed to have the
|
||||
// given size in bytes.
|
||||
func NewReader(ra io.ReaderAt, size int64) (*Reader, error) {
|
||||
z, err := zip.NewReader(ra, size)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r := new(Reader)
|
||||
if err = r.init(z); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func (r *Reader) init(z *zip.Reader) error {
|
||||
// Create a file lookup table
|
||||
r.files = make(map[string]*zip.File)
|
||||
for _, f := range z.File {
|
||||
r.files[f.Name] = f
|
||||
}
|
||||
|
||||
err := r.setContainer()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = r.setPackages()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = r.setItems()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// setContainer unmarshals the epub's container.xml file.
|
||||
func (r *Reader) setContainer() error {
|
||||
f, err := r.files[containerPath].Open()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
_, err = io.Copy(&b, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = xml.Unmarshal(b.Bytes(), &r.Container)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if len(r.Container.Rootfiles) < 1 {
|
||||
return ErrNoRootfile
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// setPackages unmarshal's each of the epub's content.opf files.
|
||||
func (r *Reader) setPackages() error {
|
||||
for _, rf := range r.Container.Rootfiles {
|
||||
if r.files[rf.FullPath] == nil {
|
||||
return ErrBadRootfile
|
||||
}
|
||||
|
||||
f, err := r.files[rf.FullPath].Open()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var b bytes.Buffer
|
||||
_, err = io.Copy(&b, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = xml.Unmarshal(b.Bytes(), &rf.Package)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// setItems associates Itemrefs with their respective Item and Items with
|
||||
// their zip.File.
|
||||
func (r *Reader) setItems() error {
|
||||
itemrefCount := 0
|
||||
for _, rf := range r.Container.Rootfiles {
|
||||
itemMap := make(map[string]*Item)
|
||||
for i := range rf.Manifest.Items {
|
||||
item := &rf.Manifest.Items[i]
|
||||
itemMap[item.ID] = item
|
||||
|
||||
abs := path.Join(path.Dir(rf.FullPath), item.HREF)
|
||||
item.f = r.files[abs]
|
||||
}
|
||||
|
||||
for i := range rf.Spine.Itemrefs {
|
||||
itemref := &rf.Spine.Itemrefs[i]
|
||||
itemref.Item = itemMap[itemref.IDREF]
|
||||
if itemref.Item == nil {
|
||||
return ErrBadItemref
|
||||
}
|
||||
}
|
||||
itemrefCount += len(rf.Spine.Itemrefs)
|
||||
}
|
||||
|
||||
if itemrefCount < 1 {
|
||||
return ErrNoItemref
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Open returns a ReadCloser that provides access to the Items's contents.
|
||||
// Multiple items may be read concurrently.
|
||||
func (item *Item) Open() (r io.ReadCloser, err error) {
|
||||
if item.f == nil {
|
||||
return nil, ErrBadManifest
|
||||
}
|
||||
|
||||
return item.f.Open()
|
||||
}
|
||||
|
||||
// Close closes the epub file, rendering it unusable for I/O.
|
||||
func (rc *ReadCloser) Close() {
|
||||
rc.f.Close()
|
||||
}
|
||||
|
||||
// Hehe
|
||||
func (rf *Rootfile) CountWords() int64 {
|
||||
var completeCount int64
|
||||
for _, item := range rf.Spine.Itemrefs {
|
||||
f, _ := item.Open()
|
||||
tokenizer := html.NewTokenizer(f)
|
||||
completeCount = completeCount + countWords(*tokenizer)
|
||||
}
|
||||
|
||||
return completeCount
|
||||
}
|
||||
|
||||
func countWords(tokenizer html.Tokenizer) int64 {
|
||||
var err error
|
||||
var totalWords int64
|
||||
for {
|
||||
tokenType := tokenizer.Next()
|
||||
token := tokenizer.Token()
|
||||
if tokenType == html.TextToken {
|
||||
currStr := string(token.Data)
|
||||
totalWords = totalWords + int64(len(strings.Fields(currStr)))
|
||||
} else if tokenType == html.ErrorToken {
|
||||
err = tokenizer.Err()
|
||||
}
|
||||
if err == io.EOF {
|
||||
return totalWords
|
||||
} else if err != nil {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
func main() {
|
||||
rc, err := OpenEPUBReader("test.epub")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
rf := rc.Rootfiles[0]
|
||||
|
||||
totalWords := rf.CountWords()
|
||||
log.Info("WOAH WORDS:", totalWords)
|
||||
}
|
||||
*/
|
||||
Reference in New Issue
Block a user