Jun 2, 2024
My girlfriend's best friend does industry research at a brokerage. Yesterday she mentioned in our group chat that every time she looks up data on a web page she has to copy/paste it by hand, which is very inefficient, so she wanted to learn web scraping to fetch the data automatically 🤓.
Although I had never touched scraping before, after taking on the client's request 🤓 my first idea was to fetch the page structure directly with an ordinary scraping library and then pull the data out of the key DOM nodes. But it turned out the site requires logging in, and the list pages are paginated, with a "next page" button that has to be clicked to move through them.
A plain scraping library can't handle that, so after some research I landed on Selenium and ChromeDriver.
What are Selenium and ChromeDriver?
Selenium is a powerful toolkit for automating web browsers, with support for multiple programming languages and browsers. ChromeDriver is a standalone executable that controls the Chrome browser and lets Selenium WebDriver talk to Chrome.
With Selenium and ChromeDriver you can script all kinds of web operations, such as testing and data scraping. For our scenario, that means we can scrape any of the data on https://my.sci99.com/channel/myattention.aspx.
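Before the full script, here is a minimal sketch of how the two pieces fit together: launch a headless Chrome through ChromeDriver and load a page. The chromedriver path here is a placeholder, not the real location; point it at wherever your driver lives, and make sure the driver version matches your installed Chrome.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Assumed path to the chromedriver executable -- replace with your own location
service = Service("/path/to/chromedriver")
options = Options()
options.add_argument('--headless')  # run Chrome without a visible window

driver = webdriver.Chrome(service=service, options=options)
driver.get("https://my.sci99.com/channel/myattention.aspx")
print(driver.title)  # confirm the page actually loaded
driver.quit()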
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import os


# Log in with the site account
def login(driver, username, password):
    login_url = 'https://mixoil.chem99.com/include/loginframe.aspx'
    driver.get(login_url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "chemname")))
    username_input = driver.find_element(By.ID, 'chemname')
    password_input = driver.find_element(By.ID, 'chempwd')
    username_input.send_keys(username)
    password_input.send_keys(password)
    login_button = driver.find_element(By.ID, 'Btn_Login')
    login_button.click()
    WebDriverWait(driver, 20).until(EC.url_contains('mixoil.chem99.com'))


# Collect all target URLs, following the "next page" button for pagination
def fetch_target_urls(driver, url, max_pages=None):
    current_page = 1
    target_urls = []
    # Load the list page once; pagination is click-driven and does not change the URL
    driver.get(url)
    while True:
        # Wait for the page to finish loading
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState") == 'complete'
        )
        # Extract the target URLs on the current page
        page_content = driver.page_source
        soup = BeautifulSoup(page_content, 'html.parser')
        dl = soup.find('dl', {'id': 'dl_SC_News'})
        if dl:
            for dd in dl.find_all('dd'):
                a_tag = dd.find('a')
                if a_tag and 'href' in a_tag.attrs:
                    target_urls.append(a_tag['href'])
        # Stop if the maximum number of pages has been reached
        if max_pages and current_page >= max_pages:
            break
        # Try to find and click the "next page" (下一页) button
        try:
            next_page_button = driver.find_element(By.XPATH, "//a[contains(text(), '下一页')]")
            next_page_button.click()
            current_page += 1
            time.sleep(2)  # wait for the new page to load
        except Exception as e:
            print(f"Unable to paginate further: {e}")
            break  # stop paginating
    return target_urls


# Crawl a single detail page and print the contents of its table
def crawl_page(driver, url):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == 'complete'
    )
    page_content = driver.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    table = soup.find('div', {'id': 'Panel_News'}).find('table')
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all(['td', 'th'])
        cell_data = [cell.get_text(strip=True) for cell in cells]
        print(cell_data)
    print("----------------------divider----------------------")
    print("\n")


# Locate chromedriver on the desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
chrome_driver_path = os.path.join(desktop_path, "chromedriver")

# Make sure chromedriver actually exists
if not os.path.exists(chrome_driver_path):
    raise FileNotFoundError(f"chromedriver not found at {chrome_driver_path}")

# Initialize the Chrome driver
service = Service(chrome_driver_path)
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')

# Start ChromeDriver
driver = webdriver.Chrome(service=service, options=options)

# Log in
login(driver, 'a', 'b')

# # Hard-coded target URL list (earlier approach, kept for reference)
# target_urls = [
#     'https://mixoil.chem99.com/news/47244656.html',
#     "https://mixoil.chem99.com/news/47358920.html"
#     # add other URLs
# ]

# Collect the list of target URLs
attention_url = "https://my.sci99.com/channel/myattention.aspx"
target_urls = fetch_target_urls(driver, attention_url, max_pages=8)

# Crawl each target page
for target_url in target_urls:
    crawl_page(driver, target_url)

# Close the browser
driver.quit()
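To run this you need the selenium and beautifulsoup4 packages installed, plus a chromedriver binary on the desktop whose major version matches your installed Chrome. The 'a' and 'b' arguments to login() are placeholders for the real account credentials.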