Scraping 笔趣阁 novels with Scrapy and storing them in MySQL
First create the project with `scrapy startproject Novel`, then generate the spider with `scrapy genspider Downnovel www.qu.la`. After working out the page structure of 笔趣阁 with XPath, Downnovel.py looks like this (replace `start_urls` with the URL of whichever book you want to crawl):

```python
import urllib.parse
from copy import deepcopy

import scrapy

from Novel.items import NovelItem


class DownnovelSpider(scrapy.Spider):
    name = 'DownNovel'
    allowed_domains = ['www.qu.la']
    start_urls = ['https://www.qu.la/book/1230/']
    current_page = 1  # which page of the chapter index we are on

    def parse(self, response):
        item = NovelItem()
        # Grab every chapter listed on the current index page
        chapter_list = response.xpath(
            "//div[@class='section-box']/ul[@class='section-list fix']")[1]
        chapters = chapter_list.xpath('./li')
        item['current_page'] = self.current_page
        # chapter_id is recorded so the pipeline can sort the chapters later
        for chapter_id, chapter in enumerate(chapters):
            item['chapter_name'] = chapter.xpath('./a/text()').extract_first()
            item['chapter_url'] = ('http://www.qu.la'
                                   + chapter.xpath('./a/@href').extract_first())
            yield scrapy.Request(
                url=item['chapter_url'],
                callback=self.parse_chapter,
                meta={'item': deepcopy(item)},
                cb_kwargs={'num': chapter_id + 1},
            )
        # Follow the "next page" link of the chapter index
        next_url = response.xpath(
            "//div[@class='listpage']/span[@class='right']/a/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        if next_url != response.url:  # stop when the link just points back here
            self.current_page += 1
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_chapter(self, response, num):
        item = response.meta['item']
        item['num'] = num  # position of this chapter on its index page
        item['chapter_title'] = response.xpath(
            "//div[@class='reader-main']/h1/text()").extract_first()
        # Join the text nodes of the chapter body, dropping blank fragments
        paragraphs = response.xpath("//div[@class='content']/text()").extract()
        item['chapter_content'] = ''.join(p.strip() for p in paragraphs if p.strip())
        yield item
```

items.py:

```python
import scrapy


class NovelItem(scrapy.Item):
    chapter_url = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_title = scrapy.Field()
    chapter_content = scrapy.Field()
    num = scrapy.Field()           # position of the chapter on its index page
    current_page = scrapy.Field()  # which index page the chapter came from
```
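One detail worth pausing on: `parse` fills the same `NovelItem` over and over inside the loop while Scrapy schedules the chapter requests asynchronously, so without `deepcopy` every `parse_chapter` callback would read whatever values the final iteration wrote. A minimal standalone sketch of the difference, using plain dicts and made-up chapter names:

```python
from copy import deepcopy

item = {}
shared, copied = [], []
for name in ['Chapter 1', 'Chapter 2']:
    item['chapter_name'] = name
    shared.append(item)            # every reference points at the same dict
    copied.append(deepcopy(item))  # each callback gets its own snapshot

print([d['chapter_name'] for d in shared])  # ['Chapter 2', 'Chapter 2']
print([d['chapter_name'] for d in copied])  # ['Chapter 1', 'Chapter 2']
```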
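Relatedly, the `current_page` and `num` fields exist because concurrent downloads mean items reach the pipeline in scrambled order; the pipeline below buffers everything and sorts on the pair `(current_page, num)` before writing. A quick illustration with invented items:

```python
# Hypothetical arrival order after a concurrent crawl
items = [
    {'current_page': 2, 'num': 1, 'chapter_title': 'Chapter 31'},
    {'current_page': 1, 'num': 3, 'chapter_title': 'Chapter 3'},
    {'current_page': 1, 'num': 1, 'chapter_title': 'Chapter 1'},
]
# Same sort key the pipeline uses: page first, then position on the page
items.sort(key=lambda it: (it['current_page'], it['num']))
print([it['chapter_title'] for it in items])
# ['Chapter 1', 'Chapter 3', 'Chapter 31']
```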
pipelines.py uses pymysql to buffer every item and write them all out, in order, when the spider closes:

```python
import pymysql


class NovelPipeline(object):
    def open_spider(self, spider):
        """Open the MySQL connection and create a buffer for the items."""
        self.items = []
        self.connection = pymysql.connect(
            host='localhost',   # local MySQL server
            user='root',        # your MySQL user name
            passwd='123456',    # your password
            db='novel',         # the database name (must already exist)
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )

    def process_item(self, item, spider):
        """Buffer each item as it arrives (in scrambled order)."""
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # Restore reading order by sorting on (current_page, num)
        items = sorted(self.items, key=lambda it: (it['current_page'], it['num']))
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(
                    'CREATE TABLE IF NOT EXISTS jiushen2('
                    'id INT AUTO_INCREMENT PRIMARY KEY, zjm TEXT, body TEXT)'
                )
                # Parameterized queries let pymysql escape any quotes
                # inside the chapter text
                for item in items:
                    cursor.execute(
                        'INSERT INTO jiushen2(zjm, body) VALUES (%s, %s)',
                        (item['chapter_title'], item['chapter_content']),
                    )
            # Commit the inserted records
            self.connection.commit()
        finally:
            # Close the connection
            self.connection.close()
```

Finally, settings.py. Most of the generated template stays commented out; the settings that matter here are a browser user agent, ignoring robots.txt, a one-second download delay, a quieter log level, and registering the pipeline:

```python
BOT_NAME = 'Novel'

SPIDER_MODULES = ['Novel.spiders']
NEWSPIDER_MODULE = 'Novel.spiders'

# Identify ourselves with a normal browser user agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/87.0.4280.66 Safari/537.36')

# Do not obey robots.txt rules
ROBOTSTXT_OBEY = False

# Only show warnings and errors in the console
LOG_LEVEL = 'WARNING'

# Wait a second between requests to the same site
DOWNLOAD_DELAY = 1

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Configure item pipelines
ITEM_PIPELINES = {
    'Novel.pipelines.NovelPipeline': 300,
}
```
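Start the crawl with `scrapy crawl DownNovel`. Once it finishes, a short read-back script can confirm the chapters were stored in reading order; this is a minimal sketch that assumes the same local MySQL credentials used in the pipeline above:

```python
import pymysql

# Assumes the same local MySQL setup as the pipeline above
connection = pymysql.connect(host='localhost', user='root', passwd='123456',
                             db='novel', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # id auto-increments in insertion order, so ordering by it should
        # reproduce the reading order restored in close_spider
        cursor.execute('SELECT id, zjm FROM jiushen2 ORDER BY id LIMIT 5')
        for row_id, title in cursor.fetchall():
            print(row_id, title)
finally:
    connection.close()
```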