爬取一为导航模板脚本

小柯2022-12-192025-05-18
一为导航模板脚本

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 初始化WebDriver
driver = webdriver.Edge()  # 使用Chrome浏览器，你也可以选择其他浏览器

# 打开网页
url = "https://nav.xnjun.top/"
driver.get(url)

# 记录初始页面的URL
initial_url = url

# 获取选择卡标签
tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")

# 初始化数据列表
data_list = []

# 遍历选择卡标签
for tab_index, tab_label in enumerate(tab_labels):
    tab_name = tab_label.text
    if tab_name in ["最新网址", "热门网址", "大家喜欢"]:
        print("跳过操作，tab_name:", tab_name)
        continue
    print("点击选择卡：", tab_name)
    # 点击选择卡
    tab_label.click()
    # 等待选择卡内容加载
    wait = WebDriverWait(driver, 10)
    tab_content = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tab-content")))

    # 创建新的BeautifulSoup解析器
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # 找到包含">>更多MORE+"链接的<div>元素
    tab_div = tab_label.find_element(By.XPATH,
                                     "./ancestor::div[contains(@class, 'd-flex flex-fill flex-tab align-items-center')]")

    # 检查是否有">>更多MORE+"链接
    more_link = tab_div.find_element(By.XPATH, ".//a[@class='btn-move tab-move text-xs ml-2']")
    style = more_link.get_attribute("style")

    # 判断是否需要跳过操作
    if style is None or "display: none;" not in style:
        print("点击>>更多MORE+链接")
        # 使用driver.get()方法打开">>更多MORE+"链接的URL
        more_link_url = more_link.get_attribute("href")
        driver.get(more_link_url)
        # 等待新页面加载完成
        wait.until(EC.url_changes(url))
        # 循环爬取所有页的信息
        while True:
            new_soup = BeautifulSoup(driver.page_source, "html.parser")
            new_category_elem = new_soup.find("h4", class_="text-gray text-lg mb-4")
            if new_category_elem:
                new_category = new_category_elem.text.strip()
            else:
                new_category = ""
            url_cards = new_soup.find_all("div", class_="url-card col-sm-6 col-md-4 col-xl-3")
            for url_card in url_cards:
                new_site_name = url_card.find("strong").text.strip()
                new_site_url = url_card.find("a")["data-url"]
                new_site_description = url_card.find("p", class_="overflowClip_1").text.strip()
                data_list.append([new_category, new_site_name, new_site_url, new_site_description])
                # 创建DataFrame
                # df = pd.DataFrame(data_list, columns=["分类", "名称", "网址", "介绍"])
                # # 保存为Excel文件
                # df.to_excel("网址信息页内.xlsx", index=False)

            page_links = new_soup.find_all("a", class_="page-numbers")
            current_page_elem = new_soup.find("span", class_="page-numbers current")
            if current_page_elem:
                current_page = int(current_page_elem.text)
                print("当前页:", current_page)
            else:
                print("无法获取当前页数")
                break
            # 获取已经点击过的页数
            clicked_pages = set()

            for page_link in page_links:
                page_number = int(page_link.text)
                if page_number > current_page and page_number not in clicked_pages:
                    # 点击页数链接
                    page_link_url = page_link["href"]
                    driver.get(page_link_url)
                    clicked_pages.add(page_number)
                    # 等待新页面加载完成
                    wait.until(EC.url_changes(url))
            if current_page > page_number:
                # 返回到首页
                driver.back()
                break

        # 返回到首页
        driver.back()
    else:
        print(">>更多MORE+链接已隐藏，跳过该步骤")
    # 等待一段时间，确保内容加载完成
    time.sleep(1)  # 可根据需要调整等待时间
# 创建新的BeautifulSoup解析器
soup = BeautifulSoup(driver.page_source, "html.parser")

# 找到包含网址信息的<div>元素
url_divs = soup.find_all("div", class_=["url-body", "default"])
# url_divs = soup.find_all("div", class_=["ajax-list-body"])
# 遍历每个<div>元素
for div in url_divs:
    # 获取分类（父级）
    category_div = div.find_previous("div", class_=["tab-pane", "active"])
    # category_div = div.find_previous("div", class_="tab-pane active")
    if category_div:
        category_id = category_div["id"]
    else:
        category_id = None

    # 获取名称
    site_name = div.find("strong").text.strip()

    # 获取网址
    site_url = div.find("a")["data-url"]

    # 获取介绍
    site_description = div.find("p", class_="overflowClip_1").text.strip()

    # 添加数据到列表
    data_list.append([category_id, site_name, site_url, site_description])

# 等待一段时间，确保内容加载完成
time.sleep(3)  # 可根据需要调整等待时间

# 创建DataFrame
df = pd.DataFrame(data_list, columns=["分类", "名称", "网址", "介绍"])

# 保存为Excel文件
df.to_excel("S:/Code/Python/NavPy/网址信息11.xlsx", index=False)

# 关闭WebDriver
driver.quit()