爬虫过程中解决 html 乱码和获取的文本乱码问题：`response1 = requests.get(url=detail_url, headers=headers)` 后 `responseText1 = response1.text` 获取的 html 中有乱码，xpath 解析出来的文本当然也有乱码。解决办法：`responseText1 = response1.text.encode('iso-8859-1')`（用 utf-8 不行，要用 iso-8859-1）。
# coding=utf-8
import requests
from lxml import etree
import pandas as pd
import time
import csv
# Request headers: present a desktop Chrome user-agent so the site serves
# the normal (non-bot) listing pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
}
# Column names for the output CSV.
header = ['company', 'position', 'salary', 'address', 'experience', 'education', 'number_people', 'date', 'welfare', 'position_type']
# Create (truncate) the output file and write the header row once.
with open('./beijing.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(header)
# Crawl the 51job Beijing listing pages, visit every detail page, and append
# one CSV row per job posting to ./beijing.csv (header written above).
page = 1
while True:
    print('爬取第{}页'.format(page))
    # Listing-page URL. BUG FIX: the original formatted str(1) here, so every
    # iteration re-fetched page 1 forever — use the current page counter.
    list_url = 'https://search.51job.com/list/010000,000000,7500,38,9,99,%2B,2,{}.html'.format(page)
    response = requests.get(url=list_url, headers=headers)
    html_str = etree.HTML(response.text)
    # Links to every detail page on this listing page.
    detailUrl_list = html_str.xpath("//p[@class='t1 ']/span/a/@href")
    print(detailUrl_list)
    if not detailUrl_list:
        # No more results — stop instead of looping forever (the original
        # `while 1` had no exit condition).
        break
    # Fetch each detail page and extract the fields with xpath.
    for detail_url in detailUrl_list:
        response1 = requests.get(url=detail_url, headers=headers)
        # The site is GBK-encoded but requests guesses ISO-8859-1 for .text.
        # Parsing the raw bytes lets lxml honour the page's declared charset —
        # equivalent to the old .text.encode('iso-8859-1') round-trip,
        # without the extra decode/encode.
        html_str1 = etree.HTML(response1.content)
        # 职位 (job title) — None when the node is missing.
        position_list = html_str1.xpath("//div[@class='cn']/h1/@title")
        position = position_list[0] if position_list else None
        # 公司 (company name)
        company_list = html_str1.xpath("//p[@class='cname']/a/@title")
        company = company_list[0] if company_list else None
        # 薪资 (salary)
        salary_list = html_str1.xpath("//div[@class='cn']/strong/text()")
        salary = salary_list[0] if salary_list else None
        # 基本信息: address / experience / education / headcount / post date,
        # pipe-separated in one <p>. Pad with None so a short field list keeps
        # the leading values instead of discarding the whole record (the
        # original bare except reset all five to None on any IndexError).
        other_list = html_str1.xpath("//p[@class='msg ltype']//text()")
        fields = ''.join(other_list).replace('|', '').split()
        address, experience, education, number_people, date = (fields + [None] * 5)[:5]
        print(address, experience, education, number_people, date)
        # 福利待遇: the original try/except was dead code — ','.join on a list
        # never raises — so its fallback string was unreachable. Apply the
        # intended fallback when the node list is empty instead.
        welfare_list = html_str1.xpath("//div[@class='t1']/span/text()")
        welfare = ','.join(welfare_list) if welfare_list else '未公布福利待遇'
        # 职能类别, same dead-except fix as above.
        position_type_list = html_str1.xpath("//p[@class='fp']/a/text()")
        position_type = ','.join(position_type_list) if position_type_list else '暂无信息'
        # Append one data row. BUG FIX: pd.DataFrame(columns=row).to_csv(...)
        # wrote each record as a *header* line (and NaN-ified None values);
        # use the same csv writer as the real header for a proper data row.
        row = (company, position, salary, address, experience, education, number_people, date, welfare, position_type)
        with open('./beijing.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(row)
    # Throttle request rate (seconds); adjust as needed.
    time.sleep(1)
    page += 1