
Making an EPUB with Python


Contents:
- Preparation
- Step 1: Analyze the site
- Step 2: Scrape and clean the data / fetching chapter content
- Step 3: Save to EPUB
- Full code

Preparation

- The ebookLib library. If the version on PyPI is too old, clone the repo from GitLab and run `python setup.py install`.
- The zhconv library, used mainly for Traditional-to-Simplified Chinese conversion.
- A novel site that allows scraping: novel-backup.
- A little bit of time.

Getting started

Step 1: Analyze the site

From the site being scraped, first get the index that links every chapter: https://novels.novel-backup.cf/index/1558018541.json

Fetch it and inspect the response (part of the data is omitted):

[ { "name": "41.成套的葡萄酒杯", "id": 7460 }, { "name": "42.烤肉午餐", "id": 7550 } ]

The `id` in each entry is the `yy` part of the chapter-content URL `xx/yy.json` used below.

Next, fetch a single chapter and inspect its content: https://novels.novel-backup.cf/novels/93065.json (part of the data is omitted):

{ "code_id": 1558018541, "title": "第1卷插圖", "create_date": "2020-10-07 20:51:33", "content": "

", "author": "職業量地官", "views": 2896 }

For our purposes, the useful fields are `title`, `content`, and `author`.
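Before writing the scraper class, it helps to poke at both endpoints by hand. A minimal sketch using only `requests` (the index id `1558018541` and the field names come from the samples above; the site must of course still be reachable):

```python
import requests

BASE = 'https://novels.novel-backup.cf'

# the index endpoint returns a list of {"name": ..., "id": ...} entries
chapters = requests.get(f'{BASE}/index/1558018541.json').json()
first = chapters[0]

# each "id" plugs straight into the chapter endpoint
chapter = requests.get(f'{BASE}/novels/{first["id"]}.json').json()
print(chapter['title'], chapter['author'])
print(chapter['content'][:100])  # raw HTML, truncated for display
```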

Step 2: Scrape and clean the data

ebookLib arranges chapters in the order they are passed to `add_item`, so the scraped chapters have to be sorted first. Create a new .py file and define a class `Espider`:

```python
# methods of the Espider class (imports and the class header are in the full code below)

def getJson(self, url):
    html: requests.Response = requests.get(url)
    return html.json()

def getDictList(self, url):
    js: typing.List[dict] = self.getJson(url)
    return js

def getFilter(self, li_list):
    maxx = 0
    id_dicts = []
    for li in li_list:
        idict = li
        idict['name'] = convert(idict['name'], 'zh-hans')
        # titles like "174. ...": grab the leading number
        ll = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', idict['name'])
        if(len(ll) > 0):
            s: str = ll[0]
            num = int(s[:-1])
            idict['num'] = num
            maxx = max(maxx, num)
        else:
            # titles like "第3话 ...": grab the episode number
            ll = re.findall(r'第([1-9]\d*)话', idict['name'])
            if(len(ll) > 0):
                s: str = ll[0]
                num = int(s)
                idict['num'] = num
                maxx = max(num, maxx)
            else:
                # no number in the title: slot it in after the last one seen
                maxx += 1
                idict['num'] = maxx
        id_dicts.append(idict)
    id_dicts.sort(key=lambda it: it['num'])
    tmp_list: typing.List[dict] = []
    for i in range(len(id_dicts)):
        id_dicts[i]['i'] = str(i)  # remember the sorted position
        tmp_list.append(id_dicts[i])
    return tmp_list
```

First the data is fetched and converted into the right shape (`getJson`, `getDictList`). The long `getFilter` boils down to this: take each chapter name in the list, such as `174. 疲勞與真心話` or `第3話 商業公會`, and extract the leading number; for chapters whose name carries no number, assign `num = maxx`, one past the largest number seen so far.
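To see that numbering logic in isolation, here it is pulled out into a small helper (`leading_num` is a name invented for this illustration; it is not part of the original class):

```python
import re
from zhconv import convert

def leading_num(name):
    """Mirror getFilter's number extraction for a single chapter title."""
    name = convert(name, 'zh-hans')  # getFilter simplifies the title first
    m = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', name)
    if m:
        return int(m[0][:-1])        # e.g. '174.' -> 174
    m = re.findall(r'第([1-9]\d*)话', name)
    return int(m[0]) if m else None  # None: getFilter falls back to maxx + 1

print(leading_num('174. 疲勞與真心話'))  # 174
print(leading_num('第3話 商業公會'))    # 3
print(leading_num('後記'))              # None
```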

Fetching chapter content

Images need special handling: each one is first saved to a local folder and only then added to the EPUB file.

```python
def getDict(self, url):
    js: dict = self.getJson(url)
    return js

def saveImg(self, title, src):
    path = 'Images/{}'.format(title)
    if(os.path.exists(path) == False):
        os.mkdir(path)
    # pull the image file name out of the src URL
    s = re.findall(r'65535/(.*?)\[/img\]', src)
    if(len(s) == 0):
        s = re.findall(r'65535/(.*?.png)', src)[0]
    else:
        s = s[0]
    res: requests.Response = requests.get(src, stream=True)
    res.raise_for_status()
    with open("{}/{}".format(path, s), "wb") as f:
        f.write(res.content)
    self.img_list.append({
        'src': "{}/{}".format(path, s),
        'uid': s.split('.')[0]
    })
    return "{}/{}".format(path, s)

def contentCheck(self, title, content: str):
    soup = BeautifulSoup(content, 'lxml')
    for img in soup.findAll('img'):
        s = self.saveImg(title, img['src'])
        img['src'] = s  # point the tag at the local copy
    return str(soup.body)

def getContent(self, id):
    url_s = 'https://novels.novel-backup.cf/novels/'
    url_e = '.json'
    print(url_s + id + url_e)
    js = self.getDict(url_s + id + url_e)
    js['author'] = convert(js['author'], 'zh-hans')
    js['title'] = convert(js['title'], 'zh-hans')
    js['content'] = convert(js['content'], 'zh-hans')
    # the HTML tags in this return string were stripped when the article
    # was scraped; <p>…</p> is a plausible reconstruction
    return '<p>搬运:' + js['author'] + '</p>' + self.contentCheck(js['title'], js['content'])
```

`getDict` fetches the data; `getContent` then takes the fields out of `js`, runs the content through `contentCheck`, and the finished chapter is later stored in a list.
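A quick way to exercise the class on its own (`93065` is the chapter from step 1; the `Images/` folder must already exist if the chapter contains pictures, see the setup note after the full code):

```python
from spider import Espider

es = Espider()
html = es.getContent('93065')  # cleaned, simplified-Chinese chapter HTML
print(html[:200])
print(es.img_list)             # records for any images saved locally
```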

Step 3: Save to EPUB

Create a new ebook.py file and import ebooklib and `Espider`:

```python
import threading
import typing
from ebooklib import epub
from spider import Espider

toc = []
spine = ['nav']
book = epub.EpubBook()
chp_list = []

def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous', role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(
        uid="style_nav", file_name="style/nav.css",
        media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)

def saveChapter():
    # the tags around the foreword text were stripped by the page scraper;
    # <p>…</p> is a plausible reconstruction
    c1 = getChapter(
        '前言',
        '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>',
        '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name, c1.title, c1.title))
    spine.append(c1)
    for it in chp_list:
        # for each chapter: add it to the book, the TOC and the spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name,
                             it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])

def saveImage(img_list: typing.List[dict]):
    for img in img_list:
        image_content = open(img['src'], 'rb').read()
        item = epub.EpubImage(uid=img['uid'], file_name=img['src'],
                              media_type='image/png', content=image_content)
        book.add_item(item)

def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file
    epub.write_epub('epub/' + file_name, book, {})

def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title, file_name='chap_' + id + '.xhtml', lang='zh')
    # the heading tags were stripped by the scraper; the #title rule in the
    # CSS above suggests the title sat in an element with id="title"
    c1.content = '<div id="title">' + title + '</div>' + content
    return c1

def poChapter(it, llen):
    i = int(it['i']) + 1
    c = getChapter(it['name'], es.getContent(str(it['id'])), str(i).zfill(llen))
    chp_list.append({
        'chapter': c,
        'id': i
    })

if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0], llen)
    # create the scraping threads, 4 per batch
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i + j >= len(id_dicts):  # guard: the count may not divide by 4
                break
            threads.append(threading.Thread(
                target=poChapter, args=(id_dicts[i + j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    # threads finish out of order; restore chapter order before saving
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')
```

`init` is adapted from the function given in the ebookLib official documentation. `str(i).zfill(llen)` zero-pads the chapter index so file names sort correctly, e.g. 'chap_002.xhtml'. `threading` is pulled in so the scraping runs on multiple threads, which speeds it up.
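The batch-of-4 fan-out in `__main__` needs a bounds guard because the chapter count rarely divides evenly by 4. A `concurrent.futures` sketch that does the same job with less bookkeeping (an alternative shown for comparison, not part of the original script):

```python
from concurrent.futures import ThreadPoolExecutor

# one poChapter call per chapter, at most 4 running at once;
# the with-block joins every worker before continuing
with ThreadPoolExecutor(max_workers=4) as pool:
    for it in id_dicts:
        pool.submit(poChapter, it, llen)

chp_list.sort(key=lambda it: it['id'])  # restore reading order, as before
```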

Full code

```python
# spider.py
import requests
from bs4 import BeautifulSoup
import typing
import re
import os
from zhconv import convert


class Espider:
    # https://novels.novel-backup.cf/index/1558018541.json
    # https://novels.novel-backup.cf/novels/7460.json
    img_list = []

    def getJson(self, url):
        html: requests.Response = requests.get(url)
        # soup = BeautifulSoup(html.json())
        return html.json()

    def getDict(self, url):
        js: dict = self.getJson(url)
        # print(js)
        return js

    def getDictList(self, url):
        js: typing.List[dict] = self.getJson(url)
        # print(js)
        return js

    def saveImg(self, title, src):
        path = 'Images/{}'.format(title)
        if(os.path.exists(path) == False):
            os.mkdir(path)
        # print(src)
        s = re.findall(r'65535/(.*?)\[/img\]', src)
        # print(s)
        if(len(s) == 0):
            s = re.findall(r'65535/(.*?.png)', src)[0]
        else:
            s = s[0]
        # print(s)
        res: requests.Response = requests.get(src, stream=True)
        res.raise_for_status()
        with open("{}/{}".format(path, s), "wb") as f:
            f.write(res.content)
        self.img_list.append({
            'src': "{}/{}".format(path, s),
            'uid': s.split('.')[0]
        })
        return "{}/{}".format(path, s)

    def contentCheck(self, title, content: str):
        soup = BeautifulSoup(content, 'lxml')
        # print(soup)
        for img in soup.findAll('img'):
            s = self.saveImg(title, img['src'])
            img['src'] = s
        # ''.join(str(it) for it in soup.find_all('p'))
        return str(soup.body)

    def getContent(self, id):
        url_s = 'https://novels.novel-backup.cf/novels/'
        url_e = '.json'
        print(url_s + id + url_e)
        js = self.getDict(url_s + id + url_e)
        js['author'] = convert(js['author'], 'zh-hans')
        js['title'] = convert(js['title'], 'zh-hans')
        js['content'] = convert(js['content'], 'zh-hans')
        # print(js['author'], js['title'], js['content'])
        # the HTML tags in this return string were stripped when the article
        # was scraped; <p>…</p> is a plausible reconstruction
        return '<p>搬运:' + js['author'] + '</p>' + self.contentCheck(js['title'], js['content'])

    def getFilter(self, li_list):
        maxx = 0
        id_dicts = []
        for li in li_list:
            idict = li
            idict['name'] = convert(idict['name'], 'zh-hans')
            ll = re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)', idict['name'])
            if(len(ll) > 0):
                s: str = ll[0]
                num = int(s[:-1])
                idict['num'] = num
                maxx = max(maxx, num)
            else:
                ll = re.findall(r'第([1-9]\d*)话', idict['name'])
                if(len(ll) > 0):
                    s: str = ll[0]
                    num = int(s)
                    idict['num'] = num
                    maxx = max(num, maxx)
                else:
                    maxx += 1
                    idict['num'] = maxx
            id_dicts.append(idict)
        id_dicts.sort(key=lambda it: it['num'])
        tmp_list: typing.List[dict] = []
        for i in range(len(id_dicts)):
            id_dicts[i]['i'] = str(i)
            tmp_list.append(id_dicts[i])
        return tmp_list

    def getIdList(self, li_list):
        id_list: typing.List[str] = [str(it['id']) for it in li_list]
        return id_list


if __name__ == "__main__":
    print("爬取开始")
    # po = pool.Pool(5)
    # li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    es = Espider()
    # li_list = es.getDictList(li_url)
    # print(li_list)
    # id_dicts = es.getFilter(li_list)
    # print(id_dicts)
    print(es.getContent('112353'))
    print(es.getContent('16733'))
    # print(es.img_list)
    print('爬取结束')
```

```python
# ebook.py
import threading
import typing
from ebooklib import epub
from spider import Espider

toc = []
spine = ['nav']
book = epub.EpubBook()
chp_list = []


def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous', role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style)
    # add CSS file
    book.add_item(nav_css)


def saveChapter():
    # the tags around the foreword text were stripped by the page scraper;
    # <p>…</p> is a plausible reconstruction
    c1 = getChapter(
        '前言',
        '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>',
        '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name, c1.title, c1.title))
    spine.append(c1)
    for it in chp_list:
        # For each chapter add chapter to the book, TOC and spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name,
                             it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])
        # print('save c', chapter.file_name)


def saveImage(img_list: typing.List[dict]):
    for img in img_list:
        image_content = open(img['src'], 'rb').read()
        item = epub.EpubImage(uid=img['uid'], file_name=img['src'],
                              media_type='image/png', content=image_content)
        book.add_item(item)


def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file
    epub.write_epub('epub/' + file_name, book, {})


def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title, file_name='chap_' + id + '.xhtml', lang='zh')
    # the heading tags were stripped by the scraper; the #title rule in the
    # CSS suggests the title sat in an element with id="title"
    c1.content = '<div id="title">' + title + '</div>' + content
    print("g", c1.file_name, c1.title, id)
    return c1


def poChapter(it, llen):
    # print("开始进程", it['i'])
    i = int(it['i']) + 1
    c = getChapter(it['name'], es.getContent(str(it['id'])), str(i).zfill(llen))
    chp_list.append({
        'chapter': c,
        'id': i
    })
    # saveChapter(c, it['i'])


if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0], llen)
    # create the scraping threads, 4 per batch
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i + j >= len(id_dicts):  # guard: the count may not divide by 4
                break
            threads.append(threading.Thread(
                target=poChapter, args=(id_dicts[i + j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    # es.img_list.append('Images/第6卷插圖/51154283631_826ee93727_o.png')
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')
```
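One practical note: `saveImg` creates `Images/<title>` with `os.mkdir`, which fails if `Images/` itself is missing, and `saveEpub` writes into `epub/` without creating it. Create both folders once before the first run (a small setup snippet, not part of the original):

```python
import os

os.makedirs('Images', exist_ok=True)  # parent folder used by saveImg
os.makedirs('epub', exist_ok=True)    # output folder used by saveEpub
```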

