12.27更新 您所在的位置:网站首页 孔夫子旧书网免费下载网址大全 12.27更新

12.27更新

2024-07-16 15:45| 来源: 网络整理| 查看: 265

"""Scrape book listings from kongfz.com search results with headless Chrome,
follow each listing to its shop page to collect review counts, and stream
the rows into a per-keyword CSV file."""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import csv
from bs4 import BeautifulSoup


def set_user_agent(driver):
    """Build and return a NEW Chrome driver with a randomly chosen User-Agent.

    NOTE(review): the ``driver`` argument is ignored (it was ignored in the
    original too — the function always constructs a fresh driver). The
    parameter is kept only for backward compatibility with existing callers.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/87.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/87.0",
    ]
    options = Options()
    options.add_argument(f'user-agent={random.choice(user_agents)}')
    return webdriver.Chrome(options=options)


def get_product_info(driver, div, csv_writer):
    """Extract one listing's data from a search-result ``div`` (a BeautifulSoup
    Tag), open its detail page in a new tab to read the shop's review counts,
    write the combined row to ``csv_writer`` and return it as a dict.

    Returns None when the new tab cannot be opened or the review page fails
    to load. Side effects: prints progress, opens/closes a browser tab,
    appends one CSV row on success.
    """
    name = div.find("div", class_="title").get_text().strip()
    price = div.find("span", class_="bold").get_text().strip()
    print("书名:" + name)
    print("价格: " + price)

    # Two listing layouts exist: one with a dedicated ISBN block, one with
    # plain "normal-text" spans for author / publisher / date.
    isbn_info = div.find("div", class_="zl-isbn-info")
    if isbn_info is not None:
        spans = isbn_info.find_all("span", class_="text")
        author = spans[0].get_text().strip().replace("/", "")
        publisher = spans[1].get_text().strip().replace("/", "")
        publish_date = spans[2].get_text().strip().replace("/", "")
    else:
        normal = div.find_all("span", class_="normal-text")
        author = normal[0].get_text().strip()
        publisher = normal[1].get_text().strip()
        publish_date = normal[2].get_text().strip()
    print("作者:" + author)
    print("出版社:" + publisher)
    print("出版年份:" + publish_date)

    shop_name = div.find("div", class_="text").find("span").get_text().strip()
    print("商店名称: ", shop_name)

    link = div.find("div", class_="title").find("a").get("href")
    if "http" not in link:
        link = "https:" + link

    # Snapshot the handles BEFORE opening the tab so the "did a tab open?"
    # check below can actually detect failure. (The original compared the
    # handle list against itself, which could never fail.) The URL is passed
    # as a script argument instead of being interpolated into the JS string,
    # which would break on links containing quotes.
    old_handles = driver.window_handles
    driver.execute_script('window.open(arguments[0], "_blank");', link)
    time.sleep(3)
    new_handles = driver.window_handles
    if len(new_handles) == len(old_handles):
        print("无法打开新标签页")
        return None
    driver.switch_to.window(new_handles[-1])

    info = None
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "li[data-page='shop-remark-page']")))
        # Click the "shop reviews" tab via JS (avoids overlay-interception
        # issues with a native click). find_element_by_css_selector was
        # removed in Selenium 4; use the By API.
        remark_link = driver.find_element(
            By.CSS_SELECTOR, "li[data-page='shop-remark-page']")
        driver.execute_script("arguments[0].click();", remark_link)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.total-count")))
        time.sleep(3)

        remark_page_soup = BeautifulSoup(driver.page_source, "html.parser")

        total_node = remark_page_soup.find("span", class_="total-count")
        comment_num = total_node.get_text() if total_node is not None else '未找到'
        print("评论数量:" + comment_num)

        # Default first so the info dict below never hits a NameError when
        # the good-review node is absent (the original only assigned
        # good_count inside the found branch).
        good_count = '未找到'
        good_comment = remark_page_soup.find("li", {"data-lev": "good"})
        if good_comment is not None:
            good_node = good_comment.find("span", class_="good-count")
            if good_node is not None:
                good_count = good_node.get_text()
                print("好评数:" + good_count)
            else:
                print("未找到好评数")
        else:
            print("未找到好评")

        common_node = remark_page_soup.find("span", class_="common-count")
        common_comment = common_node.get_text() if common_node is not None else '未找到'
        print("中评数:" + common_comment)

        bad_node = remark_page_soup.find("span", class_="bad-count")
        bad_comment = bad_node.get_text() if bad_node is not None else '未找到'
        print("差评数:" + bad_comment)
        print("-----------------------")

        info = {
            "书名": name,
            "价格": price,
            "作者": author,
            "出版社": publisher,
            "出版年份": publish_date,
            "商店名称": shop_name,
            "评论数量": comment_num,
            "好评数": good_count,
            "中评数": common_comment,
            "差评数": bad_comment,
        }
        # Stream the row out immediately. (The original also re-read and
        # printed the whole CSV after every row — a quadratic debug
        # leftover, removed.)
        csv_writer.writerow(info)
    except Exception as e:
        print("获取评论信息失败:", str(e))
        info = None

    # Close the detail tab and return to the results tab in all cases.
    driver.close()
    driver.switch_to.window(old_handles[0])
    return info


# ---- browser setup: headless Chrome ---------------------------------------
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=chrome_options)

# ---- scrape parameters ----------------------------------------------------
keyword = "人工智能"
page_num = 1       # page to start from
total_pages = 200  # total number of result pages to crawl

# ---- CSV output: header first, then one row per listing --------------------
filename = f"{keyword}.csv"
fields = ["书名", "价格", "作者", "出版社", "出版年份", "商店名称",
          "评论数量", "好评数", "中评数", "差评数"]
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=fields)
    csv_writer.writeheader()

    # NOTE(review): the original source was truncated mid-statement at
    # "while page_num"; the pagination loop below is a minimal
    # reconstruction (search URL, result-item selector, and delay are
    # educated guesses) and MUST be verified against the original script.
    while page_num <= total_pages:
        url = f"https://search.kongfz.com/product_result/?key={keyword}&page={page_num}"
        driver.get(url)
        time.sleep(random.uniform(2, 4))  # polite delay between result pages
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for item_div in soup.find_all("div", class_="item-info"):
            get_product_info(driver, item_div, csv_writer)
        page_num += 1

driver.quit()


【本文地址】

公司简介

联系我们

今日新闻

    推荐新闻

    专题文章
      Copyright 2018-2019 实验室设备网 版权所有