/* Package epub provides basic support for reading EPUB archives. Adapted from: https://github.com/taylorskalyo/goreader */ package metadata import ( "archive/zip" "bytes" "encoding/xml" "errors" "io" "os" "path" "strings" "golang.org/x/net/html" ) const containerPath = "META-INF/container.xml" var ( // ErrNoRootfile occurs when there are no rootfile entries found in // container.xml. ErrNoRootfile = errors.New("epub: no rootfile found in container") // ErrBadRootfile occurs when container.xml references a rootfile that does // not exist in the zip. ErrBadRootfile = errors.New("epub: container references non-existent rootfile") // ErrNoItemref occurrs when a content.opf contains a spine without any // itemref entries. ErrNoItemref = errors.New("epub: no itemrefs found in spine") // ErrBadItemref occurs when an itemref entry in content.opf references an // item that does not exist in the manifest. ErrBadItemref = errors.New("epub: itemref references non-existent item") // ErrBadManifest occurs when a manifest in content.opf references an item // that does not exist in the zip. ErrBadManifest = errors.New("epub: manifest references non-existent item") ) // Reader represents a readable epub file. type Reader struct { Container files map[string]*zip.File } // ReadCloser represents a readable epub file that can be closed. type ReadCloser struct { Reader f *os.File } // Rootfile contains the location of a content.opf package file. type Rootfile struct { FullPath string `xml:"full-path,attr"` Package } // Container serves as a directory of Rootfiles. type Container struct { Rootfiles []*Rootfile `xml:"rootfiles>rootfile"` } // Package represents an epub content.opf file. type Package struct { Metadata Manifest Spine } // Metadata contains publishing information about the epub. type Metadata struct { Title string `xml:"metadata>title"` Language string `xml:"metadata>language"` Identifier string `xml:"metadata>idenifier"` Creator string `xml:"metadata>creator"` Contributor string `xml:"metadata>contributor"` Publisher string `xml:"metadata>publisher"` Subject string `xml:"metadata>subject"` Description string `xml:"metadata>description"` Event []struct { Name string `xml:"event,attr"` Date string `xml:",innerxml"` } `xml:"metadata>date"` Type string `xml:"metadata>type"` Format string `xml:"metadata>format"` Source string `xml:"metadata>source"` Relation string `xml:"metadata>relation"` Coverage string `xml:"metadata>coverage"` Rights string `xml:"metadata>rights"` } // Manifest lists every file that is part of the epub. type Manifest struct { Items []Item `xml:"manifest>item"` } // Item represents a file stored in the epub. type Item struct { ID string `xml:"id,attr"` HREF string `xml:"href,attr"` MediaType string `xml:"media-type,attr"` f *zip.File } // Spine defines the reading order of the epub documents. type Spine struct { Itemrefs []Itemref `xml:"spine>itemref"` } // Itemref points to an Item. type Itemref struct { IDREF string `xml:"idref,attr"` *Item } // OpenEPUBReader will open the epub file specified by name and return a // ReadCloser. func OpenEPUBReader(name string) (*ReadCloser, error) { f, err := os.Open(name) if err != nil { return nil, err } rc := new(ReadCloser) rc.f = f fi, err := f.Stat() if err != nil { f.Close() return nil, err } z, err := zip.NewReader(f, fi.Size()) if err != nil { return nil, err } if err = rc.init(z); err != nil { return nil, err } return rc, nil } // NewReader returns a new Reader reading from ra, which is assumed to have the // given size in bytes. func NewReader(ra io.ReaderAt, size int64) (*Reader, error) { z, err := zip.NewReader(ra, size) if err != nil { return nil, err } r := new(Reader) if err = r.init(z); err != nil { return nil, err } return r, nil } func (r *Reader) init(z *zip.Reader) error { // Create a file lookup table r.files = make(map[string]*zip.File) for _, f := range z.File { r.files[f.Name] = f } err := r.setContainer() if err != nil { return err } err = r.setPackages() if err != nil { return err } err = r.setItems() if err != nil { return err } return nil } // setContainer unmarshals the epub's container.xml file. func (r *Reader) setContainer() error { f, err := r.files[containerPath].Open() if err != nil { return err } var b bytes.Buffer _, err = io.Copy(&b, f) if err != nil { return err } err = xml.Unmarshal(b.Bytes(), &r.Container) if err != nil { return err } if len(r.Container.Rootfiles) < 1 { return ErrNoRootfile } return nil } // setPackages unmarshal's each of the epub's content.opf files. func (r *Reader) setPackages() error { for _, rf := range r.Container.Rootfiles { if r.files[rf.FullPath] == nil { return ErrBadRootfile } f, err := r.files[rf.FullPath].Open() if err != nil { return err } var b bytes.Buffer _, err = io.Copy(&b, f) if err != nil { return err } err = xml.Unmarshal(b.Bytes(), &rf.Package) if err != nil { return err } } return nil } // setItems associates Itemrefs with their respective Item and Items with // their zip.File. func (r *Reader) setItems() error { itemrefCount := 0 for _, rf := range r.Container.Rootfiles { itemMap := make(map[string]*Item) for i := range rf.Manifest.Items { item := &rf.Manifest.Items[i] itemMap[item.ID] = item abs := path.Join(path.Dir(rf.FullPath), item.HREF) item.f = r.files[abs] } for i := range rf.Spine.Itemrefs { itemref := &rf.Spine.Itemrefs[i] itemref.Item = itemMap[itemref.IDREF] if itemref.Item == nil { return ErrBadItemref } } itemrefCount += len(rf.Spine.Itemrefs) } if itemrefCount < 1 { return ErrNoItemref } return nil } // Open returns a ReadCloser that provides access to the Items's contents. // Multiple items may be read concurrently. func (item *Item) Open() (r io.ReadCloser, err error) { if item.f == nil { return nil, ErrBadManifest } return item.f.Open() } // Close closes the epub file, rendering it unusable for I/O. func (rc *ReadCloser) Close() { rc.f.Close() } // Hehe func (rf *Rootfile) CountWords() int64 { var completeCount int64 for _, item := range rf.Spine.Itemrefs { f, _ := item.Open() tokenizer := html.NewTokenizer(f) completeCount = completeCount + countWords(*tokenizer) } return completeCount } func countWords(tokenizer html.Tokenizer) int64 { var err error var totalWords int64 for { tokenType := tokenizer.Next() token := tokenizer.Token() if tokenType == html.TextToken { currStr := string(token.Data) totalWords = totalWords + int64(len(strings.Fields(currStr))) } else if tokenType == html.ErrorToken { err = tokenizer.Err() } if err == io.EOF { return totalWords } else if err != nil { return 0 } } } /* func main() { rc, err := OpenEPUBReader("test.epub") if err != nil { log.Fatal(err) } rf := rc.Rootfiles[0] totalWords := rf.CountWords() log.Info("WOAH WORDS:", totalWords) } */