import time import re import random import pandas as pd from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import ( TimeoutException, NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException )
def init_driver():
    """Create and return a headless Microsoft Edge WebDriver."""
    edge_opts = webdriver.EdgeOptions()
    edge_opts.add_argument('--headless')
    return webdriver.Edge(options=edge_opts)
def wait_for(driver, by, value, timeout=10):
    """Block until the element located by (by, value) is present in the DOM.

    Returns the located element; raises TimeoutException after `timeout` seconds.
    """
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(EC.presence_of_element_located((by, value)))
def parse_page(driver, seen_urls, tab_name):
    """Extract site entries from the currently rendered page.

    Parses ``driver.page_source`` for ``div.url-body.default`` cards and
    collects one row per previously unseen site.

    Args:
        driver: Selenium WebDriver whose current page is parsed.
        seen_urls: set of already-collected URLs; mutated in place so
            duplicates across tabs/pages are skipped.
        tab_name: category label recorded in each output row.

    Returns:
        list of ``[tab_name, name, site_url, desc]`` rows.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_divs = soup.find_all("div", class_="url-body default")
    result = []
    for div in url_divs:
        link_tag = div.find("a")
        # The target URL lives in the anchor's data-url attribute, not href.
        site_url = link_tag["data-url"] if link_tag and "data-url" in link_tag.attrs else ""
        if not site_url or site_url in seen_urls:
            continue
        seen_urls.add(site_url)
        # Fix: look up <strong> once instead of twice per card.
        name_tag = div.find("strong")
        name = name_tag.text.strip() if name_tag else ""
        desc_tag = div.find("p", class_="overflowClip_1")
        desc = desc_tag.text.strip() if desc_tag else ""
        result.append([tab_name, name, site_url, desc])
    return result
def get_max_page(driver):
    """Return the largest page number in the pagination nav, or 1 if absent.

    Scans the ``posts-nav`` element for numeric ``page-numbers`` links.
    """
    try:
        page_nav = driver.find_element(By.CLASS_NAME, "posts-nav")
        page_links = page_nav.find_elements(By.CLASS_NAME, "page-numbers")
        numbers = [int(link.text.strip()) for link in page_links if link.text.strip().isdigit()]
        return max(numbers) if numbers else 1
    # Fix: the original bare `except:` swallowed everything, including
    # KeyboardInterrupt/SystemExit. Catch only the expected selenium
    # failures: no pagination nav on the page, or the DOM changed under us.
    except (NoSuchElementException, StaleElementReferenceException):
        return 1
def collect_paginated_data(driver, seen_urls, tab_name):
    """Walk every pagination page of the current listing and gather its rows.

    Starts from page 1 (already loaded), clicks through to each subsequent
    page number, and accumulates the rows parse_page extracts. A page whose
    navigation click fails is skipped with a logged message.
    """
    collected = []
    last_page = get_max_page(driver)
    for page in range(1, last_page + 1):
        if page > 1:
            # Navigate via a JS click on the page-number link, then wait
            # for the pagination nav to confirm the new page rendered.
            try:
                target = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, f"//a[@class='page-numbers' and text()='{page}']")
                    )
                )
                driver.execute_script("arguments[0].click();", target)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "posts-nav"))
                )
            except Exception as e:
                print(f"[!] 第 {page} 页点击失败:{e}")
                continue
        print(f" ➤ 正在采集第 {page} 页")
        collected.extend(parse_page(driver, seen_urls, tab_name))
        # Randomized pause between pages to avoid hammering the server.
        time.sleep(random.uniform(1.5, 3.5))
    return collected
def process_tab(driver, tab_index, tab_element, seen_urls, initial_url):
    """Collect all entries for one navigation tab.

    Clicks the tab, then — if a visible MORE link exists — follows it to a
    paginated listing and collects every page before navigating back to
    `initial_url`; otherwise scrapes the tab's inline content directly.

    Returns a list of [tab_name, name, url, desc] rows ([] on click failure).
    NOTE(review): after the final driver.get(initial_url), `tab_element` is
    stale; the caller must re-locate tab elements before reuse.
    """
    tab_name = tab_element.text.strip()
    print(f"\n[+] 正在处理选项卡 [{tab_index+1}]:{tab_name}")
    try:
        tab_element.click()
        wait_for(driver, By.CLASS_NAME, "tab-content")
        time.sleep(random.uniform(1.5, 3.0))
    except Exception as e:
        # Tab could not be activated — skip it entirely.
        print(f"[!] tab 点击失败:{e}")
        return []
    try:
        # Climb to the tab's enclosing flex container to find its MORE link.
        tab_div = tab_element.find_element(By.XPATH, "./ancestor::div[contains(@class, 'd-flex flex-fill flex-tab align-items-center')]")
        more_link = tab_div.find_element(By.XPATH, ".//a[@class='btn-move tab-move text-xs ml-2']")
        style = more_link.get_attribute("style")
        # Only follow the MORE link when it is not hidden via inline style.
        if style is None or "display: none;" not in style:
            href = more_link.get_attribute("href")
            print(" ➤ 点击 MORE 链接进入分页页面:", href)
            driver.get(href)
            wait_for(driver, By.CLASS_NAME, "posts-nav")
            time.sleep(random.uniform(1.5, 2.5))
            data = collect_paginated_data(driver, seen_urls, tab_name)
            # Return to the main page so the caller can process the next tab.
            driver.get(initial_url)
            wait_for(driver, By.CLASS_NAME, "nav-link")
            return data
    except NoSuchElementException:
        # No MORE link for this tab — fall through to inline scraping.
        pass
    except Exception as e:
        print(f"[!] MORE 链接处理异常:{e}")
    return parse_page(driver, seen_urls, tab_name)
def main():
    """Entry point: crawl every tab on aigc.cn and save the rows to Excel.

    Opens the site headless, iterates the nav tabs, aggregates the scraped
    rows, and writes them to 网址信息.xlsx. The driver is always quit, and
    any top-level error is reported rather than propagated.
    """
    driver = init_driver()
    url = "https://aigc.cn"
    seen_urls = set()
    all_data = []
    try:
        driver.get(url)
        wait_for(driver, By.CLASS_NAME, "nav-link")
        initial_url = driver.current_url
        tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
        for i in range(len(tab_labels)):
            # Re-locate the tabs every iteration: process_tab may navigate
            # away and back, which invalidates previously found elements.
            tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
            # Fix: the re-found list can be shorter than the originally
            # measured count after a reload; stop instead of IndexError.
            if i >= len(tab_labels):
                break
            tab_element = tab_labels[i]
            if not tab_element.text.strip():
                continue
            tab_data = process_tab(driver, i, tab_element, seen_urls, initial_url)
            all_data.extend(tab_data)
            time.sleep(random.uniform(2, 4))
        df = pd.DataFrame(all_data, columns=["分类", "名称", "网址", "介绍"])
        df.to_excel("网址信息.xlsx", index=False)
        print("\n✅ 数据抓取完毕,已保存为:网址信息.xlsx")
    except Exception as e:
        # Top-level boundary: report and fall through to cleanup.
        print(f"\n[❌] 程序发生错误:{e}")
    finally:
        driver.quit()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()