python txt小说文本处理,按章节分割小说

#python txt小说文本处理,按章节分割小说| 来源: 网络整理| 查看: 265

初衷

用 JSP 搭建了一个本地的小说网站，网站数据基本靠 Python 在网上抓取。其中有些数据是 txt 文档，没法直接存入数据库，所以需要先对文本按章节进行分割，之后再存入数据库。

准备工作

所需要的包：re、os、pymysql。其中 re 和 os 是 Python 标准库，无需安装；pymysql 需要先在 cmd 中用 pip list 查看是否已安装，如果没有安装，用 pip install 安装，例如：pip install pymysql

源码分享

该源码目前有个 BUG：必须在 TXT 文件的最后一行加入一个章节名字（比如：第100章），不然最后一章会存不上。有能力的可自行完善，因为我处理的文本量不多。

import os
import re

import pymysql


class text_processing(object):
    """Split a plain-text novel into chapters and store each chapter in MySQL.

    A chapter starts at every line matching ``section_re`` (a line beginning
    with "第" and containing "章") and runs until the next such line or the
    end of the file.  The final chapter is flushed at end of file, so the
    input does not need a trailing sentinel heading.
    """

    def __init__(self, source_path='./../novel/已格式化好的文本/', novel_id='25'):
        """Set up parsing rules and database settings.

        Args:
            source_path: Directory that holds the formatted ``.txt`` file.
            novel_id: Value written to the ``novel_id`` column of every
                inserted chapter row.
        """
        # Directory containing the txt file to import.
        self.source_path = source_path
        # Matches a chapter heading such as "第N章 ...".
        # NOTE(review): the pattern is loose -- any line that starts with "第"
        # and contains "章" anywhere is treated as a heading.
        self.section_re = re.compile("^第.*章.*?.*?")
        # The stored title is the heading text before the first occurrence
        # of this character (e.g. "第一章 开端" -> "第一").
        self.split_flag = u'章'
        self.novel_id = novel_id
        # Markup wrapped around every paragraph of chapter content.
        # NOTE(review): the original literals were lost when this listing was
        # extracted; presumably <p>...</p> -- confirm against the site templates.
        self.paragraph_open = '<p>'
        self.paragraph_close = '</p>'
        # Database connection settings.
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration or the environment instead.
        self.host = '192.168.3.33'
        self.user = '192_168_3_33'
        self.passwd = 'iaJLHKPdm3YjDchd'
        self.db = '192_168_3_33'

    def get_text_file_url(self):
        """Return the path of the first ``.txt`` file in ``source_path``.

        Entries are sorted so the choice is deterministic.

        Returns:
            The file path as a string, or ``None`` when the directory does
            not exist or contains no ``.txt`` file.
        """
        try:
            names = sorted(os.listdir(self.source_path))
        except FileNotFoundError:
            return None
        for name in names:
            path = os.path.join(self.source_path, name)
            if name.lower().endswith('.txt') and os.path.isfile(path):
                return path
        return None

    def _split_chapters(self, lines):
        """Yield ``(title, content)`` for every chapter found in *lines*.

        Text that appears before the first heading is kept as the start of
        the first chapter's content.  Input without any heading yields
        nothing.
        """
        title = None
        parts = []
        for raw in lines:
            stripped = raw.strip()
            if self.section_re.match(stripped):
                if title is not None:
                    # A new heading closes the chapter collected so far.
                    yield title, ''.join(parts)
                    parts = []
                title = stripped.split(self.split_flag)[0].strip()
            elif not stripped:
                # Blank source line -> empty paragraph.
                parts.append(self.paragraph_open + self.paragraph_close)
            else:
                parts.append(
                    self.paragraph_open
                    + raw.replace('\n', '')
                    + self.paragraph_close
                    + '\n'
                )
        if title is not None:
            # Flush the last chapter; no sentinel heading is required.
            yield title, ''.join(parts)

    def text_processing(self):
        """Read the source txt file and insert every chapter into the database.

        Does nothing when no source file is available.
        """
        path = self.get_text_file_url()
        if not path or not os.path.exists(path):
            return
        # Placeholders let the driver quote the values, so titles or content
        # containing quotes cannot break (or inject into) the statement.
        insert_sql = (
            "insert into t_novelchapter(title,chapter,novel_id)"
            "values (%s, %s, %s)"
        )
        with open(path, encoding='utf-8') as text_file:
            for title, content in self._split_chapters(text_file):
                print('chapter title: {}'.format(title))
                print('chapter Content {}'.format(content))
                self.save_data(insert_sql, (title, content, self.novel_id))

    def save_data(self, sql, params=None):
        """Open a connection, execute one SQL statement and commit it.

        Args:
            sql: The statement to execute; may contain ``%s`` placeholders.
            params: Optional sequence of values bound to the placeholders.
        """
        conn = pymysql.connect(
            host=self.host,
            user=self.user,
            passwd=self.passwd,
            db=self.db,
            charset='utf8',
        )
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
            print('存入数据库成')
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()

    def run(self):
        """Entry point: process the txt file and store its chapters."""
        self.text_processing()


if __name__ == '__main__':
    text_proces = text_processing()
    text_proces.run()

运行结果

python txt小说文本处理,按章节分割小说运行结果

改善

缺点在源码分享一节已经说过了：必须在 TXT 文件的最后一行加入一个章节名字（比如：第100章），不然最后一章会存不上。有能力的可自行完善，因为我处理的文本量不多。

【本文地址】

公司简介

联系我们