AnthoLume/metadata/epub.go

331 lines
7.0 KiB
Go
Raw Normal View History

/*
Package epub provides basic support for reading EPUB archives.
Adapted from: https://github.com/taylorskalyo/goreader
*/
package metadata
import (
"archive/zip"
"bytes"
"encoding/xml"
"errors"
"io"
"os"
"path"
"strings"
"golang.org/x/net/html"
)
const containerPath = "META-INF/container.xml"
var (
// ErrNoRootfile occurs when there are no rootfile entries found in
// container.xml.
ErrNoRootfile = errors.New("epub: no rootfile found in container")
// ErrBadRootfile occurs when container.xml references a rootfile that does
// not exist in the zip.
ErrBadRootfile = errors.New("epub: container references non-existent rootfile")
// ErrNoItemref occurrs when a content.opf contains a spine without any
// itemref entries.
ErrNoItemref = errors.New("epub: no itemrefs found in spine")
// ErrBadItemref occurs when an itemref entry in content.opf references an
// item that does not exist in the manifest.
ErrBadItemref = errors.New("epub: itemref references non-existent item")
// ErrBadManifest occurs when a manifest in content.opf references an item
// that does not exist in the zip.
ErrBadManifest = errors.New("epub: manifest references non-existent item")
)
// Reader represents a readable epub file.
type Reader struct {
Container
files map[string]*zip.File
}
// ReadCloser represents a readable epub file that can be closed.
type ReadCloser struct {
Reader
f *os.File
}
// Rootfile contains the location of a content.opf package file.
type Rootfile struct {
FullPath string `xml:"full-path,attr"`
Package
}
// Container serves as a directory of Rootfiles.
type Container struct {
Rootfiles []*Rootfile `xml:"rootfiles>rootfile"`
}
// Package represents an epub content.opf file.
type Package struct {
Metadata
Manifest
Spine
}
// Metadata contains publishing information about the epub.
type Metadata struct {
Title string `xml:"metadata>title"`
Language string `xml:"metadata>language"`
Identifier string `xml:"metadata>idenifier"`
Creator string `xml:"metadata>creator"`
Contributor string `xml:"metadata>contributor"`
Publisher string `xml:"metadata>publisher"`
Subject string `xml:"metadata>subject"`
Description string `xml:"metadata>description"`
Event []struct {
Name string `xml:"event,attr"`
Date string `xml:",innerxml"`
} `xml:"metadata>date"`
Type string `xml:"metadata>type"`
Format string `xml:"metadata>format"`
Source string `xml:"metadata>source"`
Relation string `xml:"metadata>relation"`
Coverage string `xml:"metadata>coverage"`
Rights string `xml:"metadata>rights"`
}
// Manifest lists every file that is part of the epub.
type Manifest struct {
Items []Item `xml:"manifest>item"`
}
// Item represents a file stored in the epub.
type Item struct {
ID string `xml:"id,attr"`
HREF string `xml:"href,attr"`
MediaType string `xml:"media-type,attr"`
f *zip.File
}
// Spine defines the reading order of the epub documents.
type Spine struct {
Itemrefs []Itemref `xml:"spine>itemref"`
}
// Itemref points to an Item.
type Itemref struct {
IDREF string `xml:"idref,attr"`
*Item
}
// OpenEPUBReader will open the epub file specified by name and return a
// ReadCloser.
func OpenEPUBReader(name string) (*ReadCloser, error) {
f, err := os.Open(name)
if err != nil {
return nil, err
}
rc := new(ReadCloser)
rc.f = f
fi, err := f.Stat()
if err != nil {
f.Close()
return nil, err
}
z, err := zip.NewReader(f, fi.Size())
if err != nil {
return nil, err
}
if err = rc.init(z); err != nil {
return nil, err
}
return rc, nil
}
// NewReader returns a new Reader reading from ra, which is assumed to have the
// given size in bytes.
func NewReader(ra io.ReaderAt, size int64) (*Reader, error) {
z, err := zip.NewReader(ra, size)
if err != nil {
return nil, err
}
r := new(Reader)
if err = r.init(z); err != nil {
return nil, err
}
return r, nil
}
func (r *Reader) init(z *zip.Reader) error {
// Create a file lookup table
r.files = make(map[string]*zip.File)
for _, f := range z.File {
r.files[f.Name] = f
}
err := r.setContainer()
if err != nil {
return err
}
err = r.setPackages()
if err != nil {
return err
}
err = r.setItems()
if err != nil {
return err
}
return nil
}
// setContainer unmarshals the epub's container.xml file.
func (r *Reader) setContainer() error {
f, err := r.files[containerPath].Open()
if err != nil {
return err
}
var b bytes.Buffer
_, err = io.Copy(&b, f)
if err != nil {
return err
}
err = xml.Unmarshal(b.Bytes(), &r.Container)
if err != nil {
return err
}
if len(r.Container.Rootfiles) < 1 {
return ErrNoRootfile
}
return nil
}
// setPackages unmarshal's each of the epub's content.opf files.
func (r *Reader) setPackages() error {
for _, rf := range r.Container.Rootfiles {
if r.files[rf.FullPath] == nil {
return ErrBadRootfile
}
f, err := r.files[rf.FullPath].Open()
if err != nil {
return err
}
var b bytes.Buffer
_, err = io.Copy(&b, f)
if err != nil {
return err
}
err = xml.Unmarshal(b.Bytes(), &rf.Package)
if err != nil {
return err
}
}
return nil
}
// setItems associates Itemrefs with their respective Item and Items with
// their zip.File.
func (r *Reader) setItems() error {
itemrefCount := 0
for _, rf := range r.Container.Rootfiles {
itemMap := make(map[string]*Item)
for i := range rf.Manifest.Items {
item := &rf.Manifest.Items[i]
itemMap[item.ID] = item
abs := path.Join(path.Dir(rf.FullPath), item.HREF)
item.f = r.files[abs]
}
for i := range rf.Spine.Itemrefs {
itemref := &rf.Spine.Itemrefs[i]
itemref.Item = itemMap[itemref.IDREF]
if itemref.Item == nil {
return ErrBadItemref
}
}
itemrefCount += len(rf.Spine.Itemrefs)
}
if itemrefCount < 1 {
return ErrNoItemref
}
return nil
}
// Open returns a ReadCloser that provides access to the Items's contents.
// Multiple items may be read concurrently.
func (item *Item) Open() (r io.ReadCloser, err error) {
if item.f == nil {
return nil, ErrBadManifest
}
return item.f.Open()
}
// Close closes the epub file, rendering it unusable for I/O.
func (rc *ReadCloser) Close() {
rc.f.Close()
}
// Hehe
func (rf *Rootfile) CountWords() int64 {
var completeCount int64
for _, item := range rf.Spine.Itemrefs {
f, _ := item.Open()
tokenizer := html.NewTokenizer(f)
completeCount = completeCount + countWords(*tokenizer)
}
return completeCount
}
func countWords(tokenizer html.Tokenizer) int64 {
var err error
var totalWords int64
for {
tokenType := tokenizer.Next()
token := tokenizer.Token()
if tokenType == html.TextToken {
currStr := string(token.Data)
totalWords = totalWords + int64(len(strings.Fields(currStr)))
} else if tokenType == html.ErrorToken {
err = tokenizer.Err()
}
if err == io.EOF {
return totalWords
} else if err != nil {
return 0
}
}
}
/*
func main() {
rc, err := OpenEPUBReader("test.epub")
if err != nil {
log.Fatal(err)
}
rf := rc.Rootfiles[0]
totalWords := rf.CountWords()
log.Info("WOAH WORDS:", totalWords)
}
*/