import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome()
url = "https://aitoolbox.cn/" driver.get(url)
initial_url = url
tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
data_list = []
clicked_pages = set()
current_page = 1
for tab_index in range(len(tab_labels)): tab_label = driver.find_elements(By.CLASS_NAME, "nav-link")[tab_index] tab_name = tab_label.text if tab_name in ["Midjourney新手入门", "我的导航", "最近使用", "热门网址", "最新网址", "Prompt提示词", "AI写作工具"]: print("跳过操作,tab_name:", tab_name) continue print("点击选择卡:", tab_name) tab_label.click() wait = WebDriverWait(driver, 10) tab_content = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tab-content")))
soup = BeautifulSoup(driver.page_source, "html.parser")
tab_div = tab_label.find_element(By.XPATH, "./ancestor::div[contains(@class, 'd-flex flex-fill flex-tab align-items-center')]")
more_link = tab_div.find_element(By.XPATH, ".//a[@class='btn-move tab-move text-xs ml-2']") style = more_link.get_attribute("style")
if style is None or "display: none;" not in style: print("点击>>更多MORE+链接以展开所有页面链接") more_link_url = more_link.get_attribute("href") driver.get(more_link_url) wait.until(EC.url_changes(url))
page_nav = driver.find_element(By.CLASS_NAME, "posts-nav") page_links = page_nav.find_elements(By.CLASS_NAME, "page-numbers")
if "…" in page_nav.text: page_numbers = [int(match) for match in re.findall(r'\d+', page_nav.text)] else: page_numbers = [int(page_link.text.strip()) for page_link in page_links]
if page_numbers: max_page_number = max(page_numbers) print("最大页面数:", max_page_number) else: max_page_number = 1
for page_number in range(2, max_page_number + 1): driver.refresh() time.sleep(3) page_link = driver.find_element(By.XPATH, f"//a[@class='page-numbers' and contains(text(), '{page_number}')]") if page_number not in clicked_pages: page_link.click() WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "page-numbers"))) clicked_pages.add(page_number) print(f"已访问第 {page_number} 页")
driver.get(initial_url)
soup = BeautifulSoup(driver.page_source, "html.parser")
url_divs = soup.find_all("div", class_="url-body default")
for div in url_divs: category_div = div.find_previous("div", class_="tab-pane active") if category_div: category_id = category_div["id"] else: category_id = None
site_name = div.find("strong").text.strip()
site_url = div.find("a")["data-url"]
site_description = div.find("p", class_="overflowClip_1").text.strip()
data_list.append([category_id, site_name, site_url, site_description])
time.sleep(3)
df = pd.DataFrame(data_list, columns=["分类", "名称", "网址", "介绍"])
df.to_excel("网址信息.xlsx", index=False)
driver.quit()