Python Scrapy: crawling Biquge (笔趣阁) novels and storing them in MySQL


Crawling Biquge novels with Scrapy

First create the Scrapy project, then generate the spider:

scrapy startproject Novel
scrapy genspider Downnovel www.qu.la

After analyzing Biquge's page structure with XPath, Downnovel.py looks like the following; start_urls can be swapped for the URL of whichever book you want to crawl.

import scrapy
from Novel.items import NovelItem
import re
import urllib.parse
from copy import deepcopy


class DownnovelSpider(scrapy.Spider):
    name = 'DownNovel'
    allowed_domains = ['www.qu.la']
    start_urls = ['https://www.qu.la/book/1230/']
    current_page = 1  # current page of the chapter list

    def parse(self, response):
        item = NovelItem()
        # grab every chapter on the current chapter-list page
        chapter_list = response.xpath("//div[@class='section-box']/ul[@class='section-list fix']")[1]
        chapters = chapter_list.xpath('./li')
        item['current_page'] = self.current_page
        # walk through the chapters; chapter_id is kept so the pipeline can sort later
        for chapter_id, chapter in enumerate(chapters):
            item['chapter_url'] = chapter.xpath("./a/@href").extract_first()
            item['chapter_name'] = chapter.xpath("./a/text()").extract_first()
            item['chapter_url'] = 'http://www.qu.la' + item['chapter_url']
            yield scrapy.Request(
                url=item['chapter_url'],
                callback=self.parse_chapter,
                meta={'item': deepcopy(item)},
                cb_kwargs={'num': chapter_id + 1}
            )
        # follow the next page of the chapter list until it loops back onto itself
        next_url = response.xpath("//div[@class='listpage']/span[@class='right']/a/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        if next_url != response.url:
            self.current_page += 1
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_chapter(self, response, num):
        item = response.meta['item']
        item['num'] = num  # position of the chapter within its list page
        item['chapter_title'] = response.xpath("//div[@class='reader-main']/h1/text()").extract_first()
        item['chapter_content'] = response.xpath("//div[@class='content']/text()").extract()
        # drop empty fragments and join the paragraphs into one string
        item['chapter_content'] = [i.strip() for i in item['chapter_content'] if i.strip() != '']
        chapter_content = ''.join(item['chapter_content'])
        item['chapter_content'] = re.sub("'',", '', chapter_content)
        yield item
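Besides running scrapy crawl DownNovel from the shell, the spider can also be launched from a small script; the sketch below is only an illustration (not part of the original project), and the import path Novel.spiders.Downnovel is assumed from the genspider command above.

# run_novel.py -- hypothetical helper script placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# import path assumed from "scrapy genspider Downnovel www.qu.la"
from Novel.spiders.Downnovel import DownnovelSpider

if __name__ == '__main__':
    # load the project's settings.py and run a single crawl
    process = CrawlerProcess(get_project_settings())
    process.crawl(DownnovelSpider)
    process.start()  # blocks until the crawl finishes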

items.py is as follows:

import scrapy


class NovelItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter_url = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_title = scrapy.Field()
    chapter_content = scrapy.Field()
    num = scrapy.Field()
    current_page = scrapy.Field()
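As a quick illustration (not from the original post), a NovelItem behaves like a dict but only accepts the fields declared above; assigning an undeclared key raises KeyError:

from Novel.items import NovelItem

item = NovelItem()
item['chapter_title'] = 'Chapter 1'  # declared field, accepted
item['num'] = 1
print(dict(item))                    # {'chapter_title': 'Chapter 1', 'num': 1}
# item['author'] = 'x'               # would raise KeyError: unsupported field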

pipelines.py imports pymysql and writes the chapters to MySQL:

import pymysql


class NovelPipeline(object):
    def open_spider(self, spider):
        """Create a list to collect every item and open the MySQL connection."""
        self.items = []
        self.connection = pymysql.connect(
            host='localhost',    # local MySQL server
            user='root',         # your MySQL user
            passwd='123456',     # your password
            db='novel',          # database name
            charset='utf8mb4',   # encoding
            cursorclass=pymysql.cursors.DictCursor
        )

    def process_item(self, item, spider):
        """Append each downloaded item to the list; at this point they arrive out of order."""
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # sort by the item's current_page and num to restore chapter order
        items = sorted(self.items, key=lambda keys: (keys['current_page'], keys['num']))
        try:
            with self.connection.cursor() as cursor:
                # create the table if it does not exist yet
                sql1 = ('CREATE TABLE IF NOT EXISTS jiushen2('
                        'id INT AUTO_INCREMENT PRIMARY KEY, zjm TEXT, body TEXT)')
                cursor.execute(sql1)
                # insert one row per chapter (parameterized so quotes in the text do not break the SQL)
                for item in items:
                    sql = "INSERT INTO jiushen2(zjm, body) VALUES (%s, %s)"
                    cursor.execute(sql, (item['chapter_title'], item['chapter_content']))
                # commit the inserted rows
                self.connection.commit()
        finally:
            # close the connection
            self.connection.close()
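To check that the chapters really reached MySQL after a crawl, a short read-back script can be used; this is only a sketch assuming the same local credentials and the jiushen2 table created above:

import pymysql

connection = pymysql.connect(
    host='localhost', user='root', passwd='123456',
    db='novel', charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
try:
    with connection.cursor() as cursor:
        # print the first five stored chapters in insertion order
        cursor.execute("SELECT id, zjm FROM jiushen2 ORDER BY id LIMIT 5")
        for row in cursor.fetchall():
            print(row['id'], row['zjm'])
finally:
    connection.close()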

settings.py:

BOT_NAME = 'Novel'

SPIDER_MODULES = ['Novel.spiders']
NEWSPIDER_MODULE = 'Novel.spiders'

LOG_LEVEL = 'WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
DOWNLOAD_DELAY = 1

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Novel.pipelines.NovelPipeline': 300,
}

# The remaining template defaults (concurrency, cookies, middlewares, extensions,
# AutoThrottle, HTTP cache) stay commented out.
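If you would rather keep these knobs next to the spider instead of in the project-wide settings.py, Scrapy also supports per-spider overrides through custom_settings; a minimal sketch (not from the original post), with the parsing logic omitted:

import scrapy

class DownnovelSpider(scrapy.Spider):
    name = 'DownNovel'
    # per-spider overrides take precedence over settings.py
    custom_settings = {
        'ITEM_PIPELINES': {'Novel.pipelines.NovelPipeline': 300},
        'DOWNLOAD_DELAY': 1,
        'ROBOTSTXT_OBEY': False,
    }

    def parse(self, response):
        pass  # chapter-list parsing as shown earlier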

