爬虫过程中解决 html 乱码和获取的文本乱码问题：`response1 = requests.get(url=detail_url, headers=headers)` 后 `responseText1 = response1.text` 获取的 html 中有乱码，xpath 解析出来的文本当然也有乱码。解决办法：`responseText1 = response1.text.encode('iso-8859-1')`（用 utf-8 不行，要用 iso-8859-1）。
# coding=utf-8
import requests
from lxml import etree
import pandas as pd
import time
import csv
# Request headers: present a desktop Chrome user-agent so the site serves
# the normal (non-bot) listing pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
# Column names for the output CSV.
header = ['company', 'position', 'salary', 'address', 'experience', 'education', 'number_people', 'date', 'welfare', 'position_type']
# Create (truncate) the output file and write the header row once.
with open('./beijing.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(header)
# Crawl the 51job Beijing listing pages, visit every detail page, and append
# one CSV row per job posting to ./beijing.csv (header written above).
page = 1
while True:
    print('爬取第{}页'.format(page))
    # Listing-page URL. BUG FIX: the original formatted str(1) here, so every
    # iteration re-fetched page 1 forever — use the current page counter.
    list_url = 'https://search.51job.com/list/010000,000000,7500,38,9,99,%2B,2,{}.html'.format(page)
    response = requests.get(url=list_url, headers=headers)
    html_str = etree.HTML(response.text)
    # Links to every detail page on this listing page.
    detailUrl_list = html_str.xpath("//p[@class='t1 ']/span/a/@href")
    print(detailUrl_list)
    if not detailUrl_list:
        # No more results — stop instead of looping forever (the original
        # `while 1` had no exit condition).
        break
    # Fetch each detail page and extract the fields with xpath.
    for detail_url in detailUrl_list:
        response1 = requests.get(url=detail_url, headers=headers)
        # The site is GBK-encoded but requests guesses ISO-8859-1 for .text.
        # Parsing the raw bytes lets lxml honour the page's declared charset —
        # equivalent to the old .text.encode('iso-8859-1') round-trip,
        # without the extra decode/encode.
        html_str1 = etree.HTML(response1.content)
        # 职位 (job title) — None when the node is missing.
        position_list = html_str1.xpath("//div[@class='cn']/h1/@title")
        position = position_list[0] if position_list else None
        # 公司 (company name)
        company_list = html_str1.xpath("//p[@class='cname']/a/@title")
        company = company_list[0] if company_list else None
        # 薪资 (salary)
        salary_list = html_str1.xpath("//div[@class='cn']/strong/text()")
        salary = salary_list[0] if salary_list else None
        # 基本信息: address / experience / education / headcount / post date,
        # pipe-separated in one <p>. Pad with None so a short field list keeps
        # the leading values instead of discarding the whole record (the
        # original bare except reset all five to None on any IndexError).
        other_list = html_str1.xpath("//p[@class='msg ltype']//text()")
        fields = ''.join(other_list).replace('|', '').split()
        address, experience, education, number_people, date = (fields + [None] * 5)[:5]
        print(address, experience, education, number_people, date)
        # 福利待遇: the original try/except was dead code — ','.join on a list
        # never raises — so its fallback string was unreachable. Apply the
        # intended fallback when the node list is empty instead.
        welfare_list = html_str1.xpath("//div[@class='t1']/span/text()")
        welfare = ','.join(welfare_list) if welfare_list else '未公布福利待遇'
        # 职能类别, same dead-except fix as above.
        position_type_list = html_str1.xpath("//p[@class='fp']/a/text()")
        position_type = ','.join(position_type_list) if position_type_list else '暂无信息'
        # Append one data row. BUG FIX: pd.DataFrame(columns=row).to_csv(...)
        # wrote each record as a *header* line (and NaN-ified None values);
        # use the same csv writer as the real header for a proper data row.
        row = (company, position, salary, address, experience, education, number_people, date, welfare, position_type)
        with open('./beijing.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(row)
    # Throttle request rate (seconds); adjust as needed.
    time.sleep(1)
    page += 1