EPUB 阅读器：解析 EPUB 文件并提供内容属性访问

/*
Reads an EPUB file and makes the content available via properties
*/
const fs = require("fs"),
	path = require("path"),
	{promisify} = require("util"),
	readFileAsync = promisify(fs.readFile),
	writeFileAsync = promisify(fs.writeFile),
	{DOMParser,XMLSerializer} = require("@xmldom/xmldom"),
	JSZip = require("jszip"),
	{TextExtractor} = require("./text-extractor"),
	{hash,resolvePath} = require("./utils");

// Media types whose ZIP entries are read as base64 rather than as text
const BINARY_MEDIA_TYPES = [
	"image/gif",
	"image/png",
	"image/jpeg",
	"audio/mpeg",
	"audio/mp4"
];

const URL_PREFIX = "https://example.com/";

/*
Parses an EPUB archive and exposes its metadata, manifest, spine, table of
contents, text chunks, stylesheets and images as instance properties.
Call `load()` to populate them; recoverable problems are collected in `errors`.
*/
class EpubReader {

	/*
	app: caller-supplied object, stored as `this.app` (not otherwise used in this file)
	*/
	constructor (app) {
		this.app = app;
		this.metadata = Object.create(null); // Hashmap of metadata items by name
		this.manifest = Object.create(null); // Hashmap by ID of {properties:,id:,href:,media-type:}
		this.spine = []; // Array of IDs of items comprising the publication
		this.chunks = []; // Array of chunks {href:, nodes: [], anchorIds: [], stylesheetIds: []}
		this.toc = []; // Tree of {id:, text:, href:, children: []}
		this.stylesheets = Object.create(null); // Hashmap by ID (hash of the text) of stylesheet text
		this.images = Object.create(null); // Hashmap by filename of {type:, text:}
		this.errors = []; // Array of error messages
	}

	/*
	Record a non-fatal error message
	message: text to append to `this.errors`
	*/
	logError(message) {
		this.errors.push(message);
	}

	/*
	Load an EPUB from a file path
	epubFilepath: path of the EPUB file on disk
	Rejects if the file cannot be read, or if the container file, package file,
	or the package's metadata/manifest/spine elements are missing
	*/
	async load(epubFilepath) {
		// Read the ZIP file
		const epubFileData = await readFileAsync(epubFilepath);
		this.epubHash = hash(epubFileData);
		this.zip = await JSZip.loadAsync(epubFileData);
		// Load the container file
		this.containerFileContents = await readZipFileAsString(this.zip,"META-INF/container.xml");
		this.containerFileDoc = new DOMParser().parseFromString(this.containerFileContents,"text/xml");
		// Load the package file
		this.packageFilePath = findNodeAndGetAttribute(this.containerFileDoc,["container","rootfiles","rootfile"],"full-path");
		if(!this.packageFilePath) {
			throw new Error("EPUB container file does not declare a package file");
		}
		this.packageFileContents = await readZipFileAsString(this.zip,this.packageFilePath);
		this.packageFileDoc = new DOMParser().parseFromString(this.packageFileContents,"text/xml");
		// Read Dublin Core metadata and meta tags
		const nodeMetadata = findRequiredNode(this.packageFileDoc,["package","metadata"]);
		for(const node of Array.from(nodeMetadata.childNodes)) {
			const tagName = (node.tagName || "").toLowerCase();
			if(tagName.startsWith("dc:")) {
				// Collapse runs of whitespace so multi-line values become a single line
				this.metadata[tagName] = node.textContent.replace(/\s+/g," ");
			} else if(tagName === "meta") {
				const property = node.getAttribute("property"),
					name = node.getAttribute("name"),
					content = node.getAttribute("content");
				if(property) {
					// <meta property="...">value</meta>
					this.metadata[property] = node.textContent.replace(/\s+/g," ");
				} else if(name && content) {
					// <meta name="..." content="..."/>
					this.metadata[name] = content;
				}
			}
		}
		// Read manifest
		const nodeManifest = findRequiredNode(this.packageFileDoc,["package","manifest"]);
		for(const node of Array.from(nodeManifest.childNodes)) {
			if((node.tagName || "").toLowerCase() !== "item") {
				continue;
			}
			const properties = node.getAttribute("properties") || "",
				id = node.getAttribute("id"),
				mediaType = node.getAttribute("media-type");
			let href = resolvePath(node.getAttribute("href"),this.packageFilePath);
			// Some books include an extraneous slash in internal URLs
			if(href.startsWith("/")) {
				href = href.slice(1);
			}
			this.manifest[id] = {properties: properties.split(" "), id: id, href: href, "media-type": mediaType};
		}
		// Get the spine node
		this.nodeSpine = findRequiredNode(this.packageFileDoc,["package","spine"]);
		// Read the spine
		for(const node of Array.from(this.nodeSpine.childNodes)) {
			if((node.tagName || "").toLowerCase() === "itemref") {
				this.spine.push(node.getAttribute("idref"));
			}
		}
		// Load the TOC
		await this.loadToc();
		// Read the text chunks and stylesheets
		await this.loadTextChunks();
		// Load the images
		await this.loadImages();
	}

	/*
	Check for a metadata item
	name: metadata key (e.g. a "dc:..." tag name, or a meta property/name)
	Returns true if the key is present
	*/
	hasMetadataItem(name) {
		return name in this.metadata;
	}

	/*
	Get a metadata item
	name: metadata key
	defaultValue: value returned when the key is absent
	*/
	getMetadataItem(name,defaultValue) {
		if(name in this.metadata) {
			return this.metadata[name];
		}
		return defaultValue;
	}

	/*
	Get a manifest item
	id: manifest item ID
	defaultValue: value returned when there is no item with that ID
	*/
	getManifestItem(id,defaultValue) {
		return this.manifest[id] || defaultValue;
	}

	/*
	Get the media type of a manifest item
	href: href of the manifest item
	Returns the media type, or undefined if no manifest item has that href.
	If several items share the href, the last one in manifest order wins.
	*/
	getMediaTypeOfItem(href) {
		let result;
		for(const manifestItem of Object.values(this.manifest)) {
			if(manifestItem.href === href) {
				result = manifestItem["media-type"];
			}
		}
		return result;
	}

	/*
	Load the table of contents from the NCX file named by the spine's "toc" attribute
	Populates `this.toc` with a tree of {id:, text:, href:, children: []}
	If the spine names no TOC item, or the TOC file or its navMap is missing, an
	error is logged and `this.toc` is left unchanged rather than aborting the load
	*/
	async loadToc() {
		const tocId = this.nodeSpine.getAttribute("toc"),
			tocManifestItem = tocId ? this.manifest[tocId] : undefined;
		if(!tocManifestItem) {
			this.logError(`Missing table of contents: spine toc "${tocId || ""}" does not match a manifest item`);
			return;
		}
		this.tocItem = tocManifestItem.href;
		// Get the TOC file
		const tocFile = this.zip.file(this.tocItem);
		if(!tocFile) {
			this.logError(`Missing table of contents file: ${this.tocItem}`);
			return;
		}
		this.tocContents = await tocFile.async("string");
		this.tocDoc = new DOMParser().parseFromString(this.tocContents,"text/xml");
		// Visit each node collecting up the entries
		const visitNodes = nodes => Array.from(nodes)
			.filter(node => node.nodeType === 1 && node.tagName === "navPoint")
			.map(node => visitNode(node));
		const visitNode = node => {
			const href = findNodeAndGetAttribute(node,["content"],"src");
			return {
				id: node.getAttribute("id"),
				text: findNode(node,["navLabel","text"]).textContent,
				// NOTE(review): resolved against the package file path, not the TOC file path — confirm this is intended when they live in different directories
				href: resolvePath(href,this.packageFilePath),
				children: visitNodes(node.childNodes)
			};
		};
		// Start at the root
		const navMap = findNode(this.tocDoc,["ncx","navMap"]);
		if(!navMap) {
			this.logError(`Table of contents has no navMap: ${this.tocItem}`);
			return;
		}
		this.toc = visitNodes(navMap.childNodes);
	}

	/*
	Load the text chunks and stylesheets
	Runs the text extractor over every XHTML item in the spine, appending the
	resulting chunks to `this.chunks` and storing each distinct stylesheet text
	in `this.stylesheets` under an ID derived from its hash
	*/
	async loadTextChunks() {
		// Setup the text extractor
		const textExtractor = new TextExtractor({
			getFile: async fileHref => {
				const file = this.zip.file(fileHref);
				return {
					type: this.getMediaTypeOfItem(fileHref),
					// Files missing from the archive are reported as empty contents
					contents: file ? await file.async("nodebuffer") : ""
				};
			},
			logError: this.logError.bind(this)
		});
		// Extract each HTML file listed in the spine
		for(const spineItem of this.spine) {
			const manifestItem = this.manifest[spineItem];
			if(!manifestItem) {
				// A spine entry pointing at a non-existent manifest ID should not abort the whole load
				this.logError(`Spine item not found in manifest: ${spineItem}`);
				continue;
			}
			if(manifestItem["media-type"] !== "application/xhtml+xml") {
				continue;
			}
			const results = await textExtractor.getPageText(manifestItem.href);
			// Collect the IDs of the stylesheets used in this file
			const stylesheetIds = [];
			for(const stylesheetText of results.stylesheets) {
				// The ID is derived from the stylesheet text, so identical stylesheets share one entry
				const id = hash(stylesheetText,6);
				// Save the id
				stylesheetIds.push(id);
				// Save the stylesheet text if we don't already have this ID
				if(!(id in this.stylesheets)) {
					this.stylesheets[id] = stylesheetText;
				}
			}
			// Copy the chunks, adding the stylesheets
			for(const chunk of results.chunks) {
				chunk.stylesheetIds = stylesheetIds;
				this.chunks.push(chunk);
			}
		}
	}

	/*
	Load all the images
	Stores every manifest item whose media type starts with "image/" in
	`this.images`, keyed by filename; images missing from the archive are logged
	*/
	async loadImages() {
		// Get the image manifest items
		for(const manifestItem of Object.values(this.manifest)) {
			// Tolerate manifest items that have no media-type attribute
			const mediaType = manifestItem["media-type"] || "";
			if(mediaType.split("/")[0] !== "image") {
				continue;
			}
			const file = this.zip.file(manifestItem.href),
				encoding = BINARY_MEDIA_TYPES.includes(mediaType) ? "base64" : "text",
				// Use only the filename, without any directory part
				filename = manifestItem.href.substring(manifestItem.href.lastIndexOf("/") + 1);
			if(file) {
				this.images[filename] = {
					type: manifestItem["media-type"],
					text: await file.async(encoding)
				};
			} else {
				this.logError(`Missing image: ${filename}`);
			}
		}
	}
}

/*
Read a file from a ZIP archive as a string
zip: JSZip instance
filePath: path of the file within the archive
Rejects with an Error if the archive does not contain the file
*/
async function readZipFileAsString(zip,filePath) {
	const file = zip.file(filePath);
	if(!file) {
		throw new Error(`Missing file in EPUB: ${filePath}`);
	}
	return file.async("string");
}

/*
Find an XML node (see findNode) and throw a descriptive Error if it is absent
rootNode: reference to root node
selectors: array of child tag names
*/
function findRequiredNode(rootNode,selectors) {
	const node = findNode(rootNode,selectors);
	if(!node) {
		throw new Error(`Missing element in EPUB: ${selectors.join("/")}`);
	}
	return node;
}

/*
Find an XML node (see findNode) and return one of its attributes
rootNode: reference to root node
selectors: array of child tag names
attributeName: name of the attribute to read
Returns null if the node is not found
*/
function findNodeAndGetAttribute(rootNode,selectors,attributeName) {
	const node = findNode(rootNode,selectors);
	if(node) {
		return node.getAttribute(attributeName);
	}
	return null;
}

/*
Find an XML node identified by a list of child tag names
rootNode: reference to root node
selectors: array of child tag names
Returns the first matching node, or null if any step of the path is missing
*/
function findNode(rootNode,selectors) {
	let node = rootNode;
	// `const` keeps the loop variable local; an undeclared variable here would leak a global
	for(const selector of selectors) {
		node = Array.from(node.childNodes).find(child => !!child.tagName && child.tagName === selector);
		if(!node) {
			return null;
		}
	}
	return node;
}

exports.EpubReader = EpubReader;