import time import re import random import pandas as pd from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import ( TimeoutException, NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException )
def init_driver():
    """Create and return a headless Microsoft Edge WebDriver."""
    edge_opts = webdriver.EdgeOptions()
    edge_opts.add_argument('--headless')
    return webdriver.Edge(options=edge_opts)
def wait_for(driver, by, value, timeout=10):
    """Block until the element located by (by, value) is present in the DOM.

    Returns the located element; raises TimeoutException after `timeout` seconds.
    """
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(EC.presence_of_element_located((by, value)))
def parse_page(driver, seen_urls, tab_name):
    """Extract site entries from the currently rendered page.

    Parses ``driver.page_source`` for ``div.url-body.default`` cards and
    collects one row per previously unseen site.

    Args:
        driver: Selenium WebDriver whose current page is parsed.
        seen_urls: set of already-collected URLs; mutated in place so
            duplicates across tabs/pages are skipped.
        tab_name: category label recorded in each output row.

    Returns:
        list of ``[tab_name, name, site_url, desc]`` rows.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_divs = soup.find_all("div", class_="url-body default")
    result = []
    for div in url_divs:
        link_tag = div.find("a")
        # The target URL lives in the anchor's data-url attribute, not href.
        site_url = link_tag["data-url"] if link_tag and "data-url" in link_tag.attrs else ""
        if not site_url or site_url in seen_urls:
            continue
        seen_urls.add(site_url)
        # Fix: look up <strong> once instead of twice per card.
        name_tag = div.find("strong")
        name = name_tag.text.strip() if name_tag else ""
        desc_tag = div.find("p", class_="overflowClip_1")
        desc = desc_tag.text.strip() if desc_tag else ""
        result.append([tab_name, name, site_url, desc])
    return result
def get_max_page(driver):
    """Return the largest page number in the pagination nav, or 1 if absent.

    Scans the ``posts-nav`` element for numeric ``page-numbers`` links.
    """
    try:
        page_nav = driver.find_element(By.CLASS_NAME, "posts-nav")
        page_links = page_nav.find_elements(By.CLASS_NAME, "page-numbers")
        numbers = [int(link.text.strip()) for link in page_links if link.text.strip().isdigit()]
        return max(numbers) if numbers else 1
    # Fix: the original bare `except:` swallowed everything, including
    # KeyboardInterrupt/SystemExit. Catch only the expected selenium
    # failures: no pagination nav on the page, or the DOM changed under us.
    except (NoSuchElementException, StaleElementReferenceException):
        return 1
def collect_paginated_data(driver, seen_urls, tab_name):
    """Walk every pagination page of the current listing and gather its rows.

    Starts from page 1 (already loaded), clicks through to each subsequent
    page number, and accumulates the rows parse_page extracts. A page whose
    navigation click fails is skipped with a logged message.
    """
    collected = []
    last_page = get_max_page(driver)
    for page in range(1, last_page + 1):
        if page > 1:
            # Navigate via a JS click on the page-number link, then wait
            # for the pagination nav to confirm the new page rendered.
            try:
                target = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, f"//a[@class='page-numbers' and text()='{page}']")
                    )
                )
                driver.execute_script("arguments[0].click();", target)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "posts-nav"))
                )
            except Exception as e:
                print(f"[!] 第 {page} 页点击失败:{e}")
                continue
        print(f" ➤ 正在采集第 {page} 页")
        collected.extend(parse_page(driver, seen_urls, tab_name))
        # Randomized pause between pages to avoid hammering the server.
        time.sleep(random.uniform(1.5, 3.5))
    return collected
def process_tab(driver, tab_index, tab_element, seen_urls, initial_url):
    """Collect all entries for one navigation tab.

    Clicks the tab, then — if a visible MORE link exists — follows it to a
    paginated listing and collects every page before navigating back to
    `initial_url`; otherwise scrapes the tab's inline content directly.

    Returns a list of [tab_name, name, url, desc] rows ([] on click failure).
    NOTE(review): after the final driver.get(initial_url), `tab_element` is
    stale; the caller must re-locate tab elements before reuse.
    """
    tab_name = tab_element.text.strip()
    print(f"\n[+] 正在处理选项卡 [{tab_index+1}]:{tab_name}")
    try:
        tab_element.click()
        wait_for(driver, By.CLASS_NAME, "tab-content")
        time.sleep(random.uniform(1.5, 3.0))
    except Exception as e:
        # Tab could not be activated — skip it entirely.
        print(f"[!] tab 点击失败:{e}")
        return []
    try:
        # Climb to the tab's enclosing flex container to find its MORE link.
        tab_div = tab_element.find_element(By.XPATH, "./ancestor::div[contains(@class, 'd-flex flex-fill flex-tab align-items-center')]")
        more_link = tab_div.find_element(By.XPATH, ".//a[@class='btn-move tab-move text-xs ml-2']")
        style = more_link.get_attribute("style")
        # Only follow the MORE link when it is not hidden via inline style.
        if style is None or "display: none;" not in style:
            href = more_link.get_attribute("href")
            print(" ➤ 点击 MORE 链接进入分页页面:", href)
            driver.get(href)
            wait_for(driver, By.CLASS_NAME, "posts-nav")
            time.sleep(random.uniform(1.5, 2.5))
            data = collect_paginated_data(driver, seen_urls, tab_name)
            # Return to the main page so the caller can process the next tab.
            driver.get(initial_url)
            wait_for(driver, By.CLASS_NAME, "nav-link")
            return data
    except NoSuchElementException:
        # No MORE link for this tab — fall through to inline scraping.
        pass
    except Exception as e:
        print(f"[!] MORE 链接处理异常:{e}")
    return parse_page(driver, seen_urls, tab_name)
def main():
    """Entry point: crawl every tab on aigc.cn and save the rows to Excel.

    Opens the site headless, iterates the nav tabs, aggregates the scraped
    rows, and writes them to 网址信息.xlsx. The driver is always quit, and
    any top-level error is reported rather than propagated.
    """
    driver = init_driver()
    url = "https://aigc.cn"
    seen_urls = set()
    all_data = []
    try:
        driver.get(url)
        wait_for(driver, By.CLASS_NAME, "nav-link")
        initial_url = driver.current_url
        tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
        for i in range(len(tab_labels)):
            # Re-locate the tabs every iteration: process_tab may navigate
            # away and back, which invalidates previously found elements.
            tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
            # Fix: the re-found list can be shorter than the originally
            # measured count after a reload; stop instead of IndexError.
            if i >= len(tab_labels):
                break
            tab_element = tab_labels[i]
            if not tab_element.text.strip():
                continue
            tab_data = process_tab(driver, i, tab_element, seen_urls, initial_url)
            all_data.extend(tab_data)
            time.sleep(random.uniform(2, 4))
        df = pd.DataFrame(all_data, columns=["分类", "名称", "网址", "介绍"])
        df.to_excel("网址信息.xlsx", index=False)
        print("\n✅ 数据抓取完毕,已保存为:网址信息.xlsx")
    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f"\n[❌] 程序发生错误:{e}")
    finally:
        driver.quit()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()