Jun 2, 2024
My girlfriend's best friend does industry research at a brokerage. Yesterday she mentioned in our group chat that every time she looks up data on a web page she has to copy/paste it by hand, which is very inefficient, so she wanted to learn web scraping to fetch the data automatically 🤓.
Although I had never touched scraping before, after taking on the client's request 🤓 my first idea was to fetch the page structure directly with an ordinary scraping library and then pull the data out of the key DOM nodes. But it turned out the site requires logging in, and the list pages are paginated, with a "next page" button that has to be clicked to move through them.
A plain scraping library can't handle that, so after some research I landed on Selenium and ChromeDriver.
What are Selenium and ChromeDriver?
Selenium is a powerful toolkit for automating web browsers, with support for multiple programming languages and browsers. ChromeDriver is a standalone executable that controls the Chrome browser and lets Selenium WebDriver talk to Chrome.
With Selenium and ChromeDriver you can script all kinds of web operations, such as testing and data scraping. For our scenario, that means we can scrape any of the data on https://my.sci99.com/channel/myattention.aspx.
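Before the full script, here is a minimal sketch of how the two pieces fit together: launch a headless Chrome through ChromeDriver and load a page. The chromedriver path here is a placeholder, not the real location; point it at wherever your driver lives, and make sure the driver version matches your installed Chrome.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Assumed path to the chromedriver executable -- replace with your own location
service = Service("/path/to/chromedriver")
options = Options()
options.add_argument('--headless')  # run Chrome without a visible window

driver = webdriver.Chrome(service=service, options=options)
driver.get("https://my.sci99.com/channel/myattention.aspx")
print(driver.title)  # confirm the page actually loaded
driver.quit()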
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import os


# Log in with the site account
def login(driver, username, password):
    login_url = 'https://mixoil.chem99.com/include/loginframe.aspx'
    driver.get(login_url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "chemname")))
    username_input = driver.find_element(By.ID, 'chemname')
    password_input = driver.find_element(By.ID, 'chempwd')
    username_input.send_keys(username)
    password_input.send_keys(password)
    login_button = driver.find_element(By.ID, 'Btn_Login')
    login_button.click()
    WebDriverWait(driver, 20).until(EC.url_contains('mixoil.chem99.com'))


# Collect all target URLs, following the "next page" button for pagination
def fetch_target_urls(driver, url, max_pages=None):
    current_page = 1
    target_urls = []
    # Load the list page once; pagination is click-driven and does not change the URL
    driver.get(url)
    while True:
        # Wait for the page to finish loading
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == 'complete'
        )
        # Extract the target URLs on the current page
        page_content = driver.page_source
        soup = BeautifulSoup(page_content, 'html.parser')
        dl = soup.find('dl', {'id': 'dl_SC_News'})
        if dl:
            for dd in dl.find_all('dd'):
                a_tag = dd.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    target_urls.append(a_tag['href'])
        # Stop if the maximum number of pages has been reached
        if max_pages and current_page >= max_pages:
            break
        # Try to find and click the "next page" (下一页) button
        try:
            next_page_button = driver.find_element(By.XPATH, "//a[contains(text(), '下一页')]")
            next_page_button.click()
            current_page += 1
            time.sleep(2)  # wait for the new page to load
        except Exception as e:
            print(f"Unable to paginate further: {e}")
            break  # stop paginating
    return target_urls


# Crawl a single detail page and print the contents of its table
def crawl_page(driver, url):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == 'complete'
    )
    page_content = driver.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    table = soup.find('div', {'id': 'Panel_News'}).find('table')
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all(['td', 'th'])
        cell_data = [cell.get_text(strip=True) for cell in cells]
        print(cell_data)
    print("----------------------divider----------------------")
    print("\n")


# Locate chromedriver on the desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
chrome_driver_path = os.path.join(desktop_path, "chromedriver")

# Make sure chromedriver actually exists
if not os.path.exists(chrome_driver_path):
    raise FileNotFoundError(f"chromedriver not found at {chrome_driver_path}")

# Initialize the Chrome driver
service = Service(chrome_driver_path)
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')

# Start ChromeDriver
driver = webdriver.Chrome(service=service, options=options)

# Log in
login(driver, 'a', 'b')

# # Hard-coded target URL list (earlier approach, kept for reference)
# target_urls = [
#     'https://mixoil.chem99.com/news/47244656.html',
#     "https://mixoil.chem99.com/news/47358920.html"
#     # add other URLs
# ]

# Collect the list of target URLs
attention_url = "https://my.sci99.com/channel/myattention.aspx"
target_urls = fetch_target_urls(driver, attention_url, max_pages=8)

# Crawl each target page
for target_url in target_urls:
    crawl_page(driver, target_url)

# Close the browser
driver.quit()
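To run this you need the selenium and beautifulsoup4 packages installed, plus a chromedriver binary on the desktop whose major version matches your installed Chrome. The 'a' and 'b' arguments to login() are placeholders for the real account credentials.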