# 【日常】Data-table scrapers for the China Statistical Yearbook and the China Financial Yearbook

Preface

Finals season has kept me busy lately, so I am just posting a resource that may be useful: all the Excel tables of the China Financial Yearbook (1986–2019) and the China Statistical Yearbook (1981–2020). The data were scraped from CNKI (the exact source pages are linked in the main text below). So far nobody seems to have shared the complete run of yearbook data from 1986 to the present online; what circulates is mostly individual years, and usually behind a paywall.

Link: https://pan.baidu.com/s/13fjrInmjjxaNQRgS_Jv91w  Extraction code: k5ir

If the data is all you need, help yourself to the link above; the rambling in the afterword can safely be skipped.

Contents

Preface
1 Excel data scraper for the China Statistical Yearbook and the China Financial Yearbook
2 Details of the scraper and how to process the downloaded Excel tables with a script
Afterword (some recent thoughts)

1 Excel data scraper for the China Statistical Yearbook and the China Financial Yearbook

The fish has already been handed out above; what follows is the fishing. Fair warning: the fishing is not easy to learn, and the best way to see where the pitfalls are is to run the crawl yourself once. Section 2 also briefly explains some details of the scraper.

I am in the middle of the hardest finals ever, so here is simply the scraper script that downloads every Excel table of the China Statistical Yearbook and the China Financial Yearbook across the years:

```python
# -*- coding: utf-8 -*-
# @author: caoyang
# @email: [email protected]

import os
import re
import time
import requests

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup


def get_cookie(url):
    # Drive a headless Firefox to the page once and serialize its cookies into
    # a "name=value; ..." string that can be put into a requests Cookie header.
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    cookies = driver.get_cookies()
    driver.quit()

    def _cookie_to_string(cookies):
        string = ''
        for cookie in cookies:
            string += '{}={}; '.format(cookie['name'], cookie['value'])
        return string.strip()

    return _cookie_to_string(cookies)


def download_chinese_statistical_yearbook(ybcode='N2020100004',
                                          year='2020',
                                          save_root='csyb',
                                          is_initial=True,
                                          ignore_caj=True):
    with open('system_csyb.log', 'w') as f:
        pass
    headers = {'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
    query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
    excel_url = 'https://data.cnki.net/{}'.format
    caj_url = 'https://data.cnki.net/download/GetCajUrl'
    regex = r'<[^>]+>'  # strips HTML tags from the catalogue table cells
    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
    compiler = re.compile(regex, re.S)
    regular_interval = 15   # pause between two downloads (seconds)
    reset_interval = 300    # pause before retrying after a failure (seconds)

    if not os.path.exists(save_root):
        os.mkdir(save_root)
    # year = ybcode[1:5]
    target_path = os.path.join(save_root, year)
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    with open(os.path.join(target_path, 'log.txt'), 'w') as f:
        pass

    # Fetch the first catalogue page, which also reveals the total number of pages.
    formdata = {
        'ybcode': ybcode,
        'entrycode': '',
        'page': '1',
        'pagerow': '20',
    }
    response = requests.post(query_url, data=formdata, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    span = soup.find('span', class_='s_p_listl')
    for link in span.find_all('a'):
        onclick = link.attrs.get('onclick')
        if onclick is not None:
            lindex = onclick.find('\'')
            rindex = onclick.find('\'', lindex + 1)
            n_pages = int(onclick[lindex + 1: rindex])
            break

    with open('system_csyb.log', 'a') as f:
        f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('正在处理{}年...'.format(year))
    with open('system_csyb.log', 'a') as f:
        f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('共计{}页'.format(n_pages))

    for page in range(1, n_pages + 1):
        with open('system_csyb.log', 'a') as f:
            f.write(' - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
        print(' - 第{}页..'.format(page))
        if page != 1:  # page 1 has already been fetched above
            formdata = {
                'ybcode': ybcode,
                'entrycode': '',
                'page': str(page),
                'pagerow': '20',
            }
            while True:
                try:
                    response = requests.post(query_url, data=formdata, headers=headers)
                    break
                except:
                    with open('system_csyb.log', 'a') as f:
                        f.write(' 页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                    print(' 页面访问失败...')
                    time.sleep(reset_interval)
            html = response.text
            soup = BeautifulSoup(html, 'lxml')

        table = soup.find('table')
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            assert len(tds) == 3
            title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
            page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
            for _link in tds[2].find_all('a'):
                href = _link.attrs['href']
                if href.startswith('/download/excel'):  # excel
                    filecode = href[href.find('=') + 1:]
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(excel_url(href), headers=_headers)
                            print(' ' + str(response.status_code))
                            try:
                                # An expired cookie makes CNKI return its home page instead of
                                # the Excel file; detect that via the HTML <title> and refresh.
                                html = response.text
                                soup = BeautifulSoup(html, 'lxml')
                                if str(soup.find('title').string) == '中国经济社会大数据研究平台':
                                    with open('system_csyb.log', 'a') as f:
                                        f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                    print(' 重置cookie...')
                                    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                                else:
                                    break
                            except:
                                break  # no parsable <title>: this is the real (binary) Excel content
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
                else:  # caj
                    if ignore_caj:
                        continue
                    filecode = _link.attrs['fn']
                    pagerange = _link.attrs['pg']
                    disk = _link.attrs['disk']
                    _formdata = {
                        'filecode': filecode,
                        'pagerange': pagerange,
                        'disk': disk,
                    }
                    # Step 1: POST to obtain the real resource URL of the caj file.
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}的资源链接...'.format(title))
                            response = requests.post(caj_url, headers=_headers, data=_formdata)
                            break
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    resource_url = response.json()['url']
                    # Step 2: GET the caj file itself.
                    while True:
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(resource_url, headers=headers)
                            if str(response.status_code) == '200':
                                break
                            else:
                                with open('system_csyb.log', 'a') as f:
                                    f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                print(' 重置cookie...')
                                time.sleep(reset_interval)
                                cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(regular_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))

    # Find urls of year: collect the ybcodes of all other available volumes.
    if is_initial:
        url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
        response = requests.get(url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        div = soup.find('div', class_='s_year clearfix')
        links = []
        ybcodes = []
        for link in div.find_all('a'):
            class_ = link.attrs.get('class')
            if class_ is None:  # not current
                href = link.attrs.get('href')
                ybcode = href.split('/')[-1].split('?')[0]
                links.append(href)
                ybcodes.append(ybcode)
        # Only the ybcode is written per line; the __main__ block below expects
        # "<ybcode> <year>" per line, so the year column has to be filled in afterwards.
        with open('ybcode_csyb.txt', 'w') as f:
            for ybcode in ybcodes:
                f.write(f'{ybcode}\n')
        # for ybcode in ybcodes:
        #     download_chinese_statistical_yearbook(ybcode=ybcode, is_initial=False)


def download_chinese_financial_yearbook(ybcode='N2020070552',
                                        year='2019',
                                        save_root='cfyb',
                                        is_initial=True,
                                        ignore_caj=True):
    # Identical in structure to download_chinese_statistical_yearbook; only the
    # default arguments and the log / ybcode file names differ.
    with open('system_cfyb.log', 'w') as f:
        pass
    headers = {'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
    query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
    excel_url = 'https://data.cnki.net/{}'.format
    caj_url = 'https://data.cnki.net/download/GetCajUrl'
    regex = r'<[^>]+>'
    # (Several hard-coded cookie strings from the author's own CNKI sessions originally
    #  stood here; every one of them was overwritten by the get_cookie call below.)
    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
    compiler = re.compile(regex, re.S)
    regular_interval = 15
    reset_interval = 300

    if not os.path.exists(save_root):
        os.mkdir(save_root)
    # year = ybcode[1:5]
    target_path = os.path.join(save_root, year)
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    with open(os.path.join(target_path, 'log.txt'), 'w') as f:
        pass

    formdata = {
        'ybcode': ybcode,
        'entrycode': '',
        'page': '1',
        'pagerow': '20',
    }
    response = requests.post(query_url, data=formdata, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    span = soup.find('span', class_='s_p_listl')
    for link in span.find_all('a'):
        onclick = link.attrs.get('onclick')
        if onclick is not None:
            lindex = onclick.find('\'')
            rindex = onclick.find('\'', lindex + 1)
            n_pages = int(onclick[lindex + 1: rindex])
            break

    with open('system_cfyb.log', 'a') as f:
        f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('正在处理{}年...'.format(year))
    with open('system_cfyb.log', 'a') as f:
        f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('共计{}页'.format(n_pages))

    for page in range(1, n_pages + 1):
        with open('system_cfyb.log', 'a') as f:
            f.write(' - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
        print(' - 第{}页..'.format(page))
        if page != 1:  # page 1 has already been fetched above
            formdata = {
                'ybcode': ybcode,
                'entrycode': '',
                'page': str(page),
                'pagerow': '20',
            }
            while True:
                try:
                    response = requests.post(query_url, data=formdata, headers=headers)
                    break
                except:
                    with open('system_cfyb.log', 'a') as f:
                        f.write(' 页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                    print(' 页面访问失败...')
                    time.sleep(reset_interval)
            html = response.text
            soup = BeautifulSoup(html, 'lxml')

        table = soup.find('table')
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            assert len(tds) == 3
            title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
            page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
            for _link in tds[2].find_all('a'):
                href = _link.attrs['href']
                if href.startswith('/download/excel'):  # excel
                    filecode = href[href.find('=') + 1:]
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(excel_url(href), headers=_headers)
                            print(' ' + str(response.status_code))
                            try:
                                html = response.text
                                soup = BeautifulSoup(html, 'lxml')
                                if str(soup.find('title').string) == '中国经济社会大数据研究平台':
                                    with open('system_cfyb.log', 'a') as f:
                                        f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                    print(' 重置cookie...')
                                    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                                else:
                                    break
                            except:
                                break
                        except:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
                else:  # caj
                    if ignore_caj:
                        continue
                    filecode = _link.attrs['fn']
                    pagerange = _link.attrs['pg']
                    disk = _link.attrs['disk']
                    _formdata = {
                        'filecode': filecode,
                        'pagerange': pagerange,
                        'disk': disk,
                    }
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}的资源链接...'.format(title))
                            response = requests.post(caj_url, headers=_headers, data=_formdata)
                            break
                        except:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    resource_url = response.json()['url']
                    while True:
                        try:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(resource_url, headers=headers)
                            if str(response.status_code) == '200':
                                break
                            else:
                                with open('system_cfyb.log', 'a') as f:
                                    f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                print(' 重置cookie...')
                                time.sleep(reset_interval)
                                cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                        except:
                            with open('system_cfyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(regular_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))

    # Find urls of year
    if is_initial:
        url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
        response = requests.get(url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        div = soup.find('div', class_='s_year clearfix')
        links = []
        ybcodes = []
        for link in div.find_all('a'):
            class_ = link.attrs.get('class')
            if class_ is None:  # not current
                href = link.attrs.get('href')
                ybcode = href.split('/')[-1].split('?')[0]
                links.append(href)
                ybcodes.append(ybcode)
        with open('ybcode_cfyb.txt', 'w') as f:
            for ybcode in ybcodes:
                f.write(f'{ybcode}\n')
        for ybcode in ybcodes:
            download_chinese_financial_yearbook(ybcode=ybcode, is_initial=False)


if __name__ == '__main__':
    '''
    with open('ybcode_csyb.txt', 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        ybcode, year = line.split()
        # if int(year) > 1999:
        #     continue
        download_chinese_statistical_yearbook(ybcode=ybcode, year=year, is_initial=False)
    '''
    with open('ybcode_cfyb.txt', 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        ybcode, year = line.split()
        # if int(year) > 1994:
        #     continue
        download_chinese_financial_yearbook(ybcode=ybcode, year=year, save_root='cfyb', is_initial=False, ignore_caj=True)
```
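To make the intended workflow explicit, here is a minimal driver for the statistical-yearbook half, assuming it is appended to (or imports from) the script above. Note that `is_initial=True` only writes the ybcodes to `ybcode_csyb.txt`, while the `__main__` block expects one `"<ybcode> <year>"` pair per line, so the year column has to be added in a separate step:

```python
# Phase 1: crawl the newest volume once and collect the ybcodes of all other years
# (this writes ybcode_csyb.txt with one code per line).
download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', is_initial=True)

# Phase 2: after appending the year to each line ("<ybcode> <year>"),
# download every volume listed in the file.
with open('ybcode_csyb.txt', 'r') as f:
    for line in f.read().splitlines():
        ybcode, year = line.split()
        download_chinese_statistical_yearbook(ybcode=ybcode, year=year, is_initial=False)
```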

The data come from CNKI; these are the source pages the scraper targets:

China Financial Yearbook @ CNKI
China Statistical Yearbook @ CNKI

My campus network has CNKI download privileges, so this scraper is of little use to anyone without such access. Note that before each crawl the script drives a browser through Selenium to fetch fresh cookies, which carry the download permission. CNKI cookies expire quickly, and once they do, every "download" just returns the HTML of the CNKI home page, so the script has to check whether the cookies are still valid and call Selenium again to refresh them whenever they are not. Also, crawl as slowly as you can: the code sleeps 15 seconds between two downloads. Taking the China Statistical Yearbook as an example, there are 39 volumes as of this writing, each with roughly 600 Excel tables, so a full crawl takes about two to three days.
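Distilled from the long script above, the cookie check amounts to the following minimal sketch. The helper names and the standalone structure are mine; the title comparison and the Selenium-based refresh via `get_cookie` (defined in the script) are exactly what the full script does:

```python
import requests
from bs4 import BeautifulSoup

def cookie_expired(response):
    # An expired cookie makes CNKI return its platform home page instead of the
    # Excel file, so check whether the response parses to that page's <title>.
    try:
        soup = BeautifulSoup(response.text, 'lxml')
        return str(soup.find('title').string) == '中国经济社会大数据研究平台'
    except Exception:
        return False  # no parsable <title>: treat it as real (binary) file content

def fetch_with_valid_cookie(href, headers, cookies, ybcode):
    # Hypothetical wrapper: retry with a freshly scraped cookie until real content arrives.
    while True:
        _headers = dict(headers, Cookie=cookies)
        response = requests.get('https://data.cnki.net/{}'.format(href), headers=_headers)
        if not cookie_expired(response):
            return response, cookies
        cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
```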

2 Details of the scraper and how to process the downloaded Excel tables with a script

Since time is short I will not go through the overall design of the scraper; instead, here are a few details (pitfalls) worth writing down:

Setting ignore_caj=True in the code skips the caj links. Downloading the caj files is actually somewhat more involved: you first have to send a POST request to obtain the resource URL, whereas the Excel download link is written directly in the page source. There are two reasons for skipping the caj files. First, they are not very useful, since all the relevant data are already in the Excel tables. The more important reason is that the POST request for the caj resource URL easily invalidates the cookie, forcing frequent cookie refreshes and wasting a lot of time; the Excel download links, by contrast, are very stable, and a single cookie usually survives a whole day of downloading.
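In outline, the caj branch of the script boils down to these two requests. This is a stripped-down sketch with the retry and cookie-refresh logic omitted; the function name is mine, while the endpoint and form fields are the ones the script above uses:

```python
import requests

def download_caj(link_tag, headers, cookies, save_path):
    # link_tag is the BeautifulSoup <a> element; fn / pg / disk are its attributes.
    formdata = {
        'filecode': link_tag.attrs['fn'],
        'pagerange': link_tag.attrs['pg'],
        'disk': link_tag.attrs['disk'],
    }
    _headers = dict(headers, Cookie=cookies)
    # Step 1: POST to obtain the real resource URL.
    resp = requests.post('https://data.cnki.net/download/GetCajUrl', headers=_headers, data=formdata)
    resource_url = resp.json()['url']
    # Step 2: GET the resource itself and save it.
    resp = requests.get(resource_url, headers=headers)
    with open(save_path, 'wb') as f:
        f.write(resp.content)
```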

The scraper proper uses only requests; Selenium and the browser driver are used solely to refresh cookies, so the resource consumption is fairly small.

Note that the pages linked above build part of their content through embedded tags, so the download links cannot simply be read out of the page source; you need to watch the network requests (packet capture or the browser's developer tools) to find the URLs that those tags load.

One more catch: all the Excel tables obtained from this source are protected. In fact this only means they cannot be edited; Office or WPS can still open and read them normally (in read-only mode). To remove the protection you would need to enter a password under the Review menu:

Figure 1

In practice this does not really matter, because there is no need to modify these tables anyway. Reading them with pandas.read_excel or xlrd.open_workbook, however, raises an error:

```
xlrd.biffh.XLRDError: Workbook is encrypted
```

I have not tested how openpyxl.load_workbook handles these protected workbooks, because openpyxl only reads .xlsx files anyway. The yearbooks contain far too many Excel tables to collect any category of data by copying values manually, so a scripted approach is a must. The workable solution I finally found is the DispatchEx method in the win32com.client module:

```python
from win32com.client import DispatchEx

excel = DispatchEx('Excel.Application')
demo = excel.Workbooks.Open('N2018070031000595.xls')
sheet = demo.WorkSheets(1)
print(sheet.Cells(1, 1).Value)
demo.Close(True)
```

Under the hood this drives the Office application itself to do the reading, and an Excel process shows up in the Task Manager, so calling demo.Close to shut it down is very important; otherwise the machine easily runs out of memory from too many open Excel instances. The drawback is that the module offers no ready-made structured data handling, so values can only be pulled out cell by cell through sheet.Cells, which makes the code rather verbose.
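Since a leaked Excel process is the main failure mode here, it may be safer to wrap the COM calls in try/finally and quit the application explicitly. A sketch under that assumption (the helper name and the row/column cut-offs are mine; the calls are the standard Excel COM object model):

```python
import os
from win32com.client import DispatchEx

def read_protected_xls(path, max_rows=200, max_cols=30):
    # Read raw cell values from a protected .xls via the Excel COM interface.
    excel = DispatchEx('Excel.Application')
    workbook = None
    try:
        # Excel resolves relative paths against its own directory, so pass an absolute path.
        workbook = excel.Workbooks.Open(os.path.abspath(path))
        sheet = workbook.Worksheets(1)
        rows = []
        for i in range(1, max_rows + 1):
            row = [sheet.Cells(i, j).Value for j in range(1, max_cols + 1)]
            if all(value is None for value in row):
                break  # stop at the first completely empty row
            rows.append(row)
        return rows
    finally:
        if workbook is not None:
            workbook.Close(False)  # close without saving
        excel.Quit()               # make sure the background Excel process goes away
```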

My main task is extracting the provincial financial and economic tables: per-province loan balances, deposit balances, gross regional product, price indices and so on. The script is as follows:

```python
# -*- coding: utf-8 -*-
# @author: caoyang
# @email: [email protected]

import re
import os
import sys
import time
import xlrd
import numpy
import pandas
import openpyxl

from win32com.client import DispatchEx

# Global constants
CURRENT_DIR = os.getcwd()
CFYB_DIR = 'cfyb'
CSYB_DIR = 'csyb'
TEMP_DIR = 'temp'
LOG_FILE = 'log.txt'
PROVINCE = [
    '北京', '天津', '上海', '重庆',
    '河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽', '福建', '江西',
    '山东', '河南', '湖北', '湖南', '广东', '海南', '四川', '贵州', '云南', '陕西',
    '甘肃', '青海', '台湾', '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门',
]
INT_COMPILER = re.compile(r'[^\d]')        # regex for integer values (strip non-digits)
FLOAT_COMPILER = re.compile(r'[^\d | .]')  # regex for floating-point values


# Extract provincial financial and economic data, centred on loan balances.
def get_loan_by_province():

    def _get_province_by_title(_title):
        # Determine which province a table belongs to from its title.
        for _province in PROVINCE:
            if _province in _title:
                return _province

    def _format_cell_value(_cell_value, dtype=str):
        # Normalize a raw cell value to the requested type.
        if str(_cell_value) == 'None':
            return None
        if dtype == int:
            return INT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
        if dtype == float:
            return FLOAT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
        return str(_cell_value).replace(' ', '')

    def _get_dataframe_from_sheet(_sheet):
        _data_dict = {
            'year': [],                # year
            'gdp': [],                 # gross (regional) domestic product
            'cpi': [],                 # price index
            'deposit': [],             # total deposit balance
            'individual_deposit': [],  # household deposit balance
            'unit_deposit': [],        # corporate (unit) deposit balance
            'finance_deposit': [],     # fiscal deposit balance
            'loan': [],                # total loan balance
            'short_loan': [],          # short-term loan balance
            'long_loan': [],           # long-term loan balance
        }
        _flags = {  # one flag per field in _data_dict, recording whether it has been found
            'year': True,
            'gdp': True,
            'cpi': True,
            'deposit': True,
            'individual_deposit': True,
            'unit_deposit': True,
            'finance_deposit': True,
            'loan': True,
            'short_loan': True,
            'long_loan': True,
        }
        _row = 0
        _MAX_ROW = 100
        while _row  # ... (the source post is cut off at this point; the rest of the script is not available)
```

