import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
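# Scrape the link directory at https://nav.xnjun.top/: walk every category tab,
# follow each tab's ">>MORE+" link through its paginated listing, and export the
# collected (category, name, URL, description) rows to an Excel workbook.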
driver = webdriver.Edge()
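# Load the start page and enumerate the category tabs in the top navigation bar.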
url = "https://nav.xnjun.top/" driver.get(url)
initial_url = url  # remembered so the script can return here after following ">>MORE+" pages
tab_labels = driver.find_elements(By.CLASS_NAME, "nav-link")
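# Rows of [category, site name, site URL, description] collected for the final DataFrame.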
data_list = []
# The tab elements go stale once we navigate away and back, so re-locate them by index on each pass.
for tab_index in range(len(tab_labels)):
    tab_label = driver.find_elements(By.CLASS_NAME, "nav-link")[tab_index]
    tab_name = tab_label.text
    if tab_name in ["最新网址", "热门网址", "大家喜欢"]:  # the "latest" / "popular" / "everyone likes" aggregate tabs
        print("Skipping tab:", tab_name)
        continue
    print("Clicking tab:", tab_name)
    tab_label.click()
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "tab-content")))
    # Each tab header carries a ">>MORE+" link to a paginated listing of the full category.
    tab_div = tab_label.find_element(
        By.XPATH,
        "./ancestor::div[contains(@class, 'd-flex flex-fill flex-tab align-items-center')]",
    )
    try:
        more_link = tab_div.find_element(By.XPATH, ".//a[@class='btn-move tab-move text-xs ml-2']")
        style = more_link.get_attribute("style")
    except NoSuchElementException:
        more_link, style = None, None

    if more_link is not None and (style is None or "display: none;" not in style):
        print("Following the >>MORE+ link")
        more_link_url = more_link.get_attribute("href")
        prev_url = driver.current_url
        driver.get(more_link_url)
        wait.until(EC.url_changes(prev_url))

        clicked_pages = set()  # page numbers already visited
        while True:
            new_soup = BeautifulSoup(driver.page_source, "html.parser")
            new_category_elem = new_soup.find("h4", class_="text-gray text-lg mb-4")
            new_category = new_category_elem.text.strip() if new_category_elem else ""

            # Collect every site card on the current listing page.
            url_cards = new_soup.find_all("div", class_="url-card col-sm-6 col-md-4 col-xl-3")
            for url_card in url_cards:
                new_site_name = url_card.find("strong").text.strip()
                new_site_url = url_card.find("a")["data-url"]
                new_site_description = url_card.find("p", class_="overflowClip_1").text.strip()
                data_list.append([new_category, new_site_name, new_site_url, new_site_description])

            # Find the current page number and the next unvisited, higher-numbered page.
            page_links = new_soup.find_all("a", class_="page-numbers")
            current_page_elem = new_soup.find("span", class_="page-numbers current")
            if current_page_elem:
                current_page = int(current_page_elem.text)
                print("Current page:", current_page)
            else:
                print("Could not determine the current page")
                break

            next_link = None
            for page_link in page_links:
                try:
                    page_number = int(page_link.text)
                except ValueError:
                    continue  # skip non-numeric links such as next/previous arrows
                if page_number > current_page and page_number not in clicked_pages:
                    next_link = page_link
                    break
            if next_link is None:
                break  # no unvisited higher page left; pagination is done

            prev_url = driver.current_url
            driver.get(next_link["href"])
            clicked_pages.add(page_number)
            try:
                wait.until(EC.url_changes(prev_url))
            except TimeoutException:
                break

        # Return to the start page so the remaining tabs can be processed.
        driver.get(initial_url)
    else:
        print("The >>MORE+ link is hidden; skipping it")
    time.sleep(1)
    # Scrape the cards currently visible on the page, using the id of the
    # enclosing active tab pane as the category.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    url_divs = soup.find_all("div", class_=["url-body", "default"])
    for div in url_divs:
        category_div = div.find_previous("div", class_=["tab-pane", "active"])
        category_id = category_div["id"] if category_div else None
        site_name = div.find("strong").text.strip()
        site_url = div.find("a")["data-url"]
        site_description = div.find("p", class_="overflowClip_1").text.strip()
        data_list.append([category_id, site_name, site_url, site_description])
    time.sleep(3)
# Build the DataFrame and export it; the columns are Category / Name / URL / Description.
df = pd.DataFrame(data_list, columns=["分类", "名称", "网址", "介绍"])
df.to_excel("S:/Code/Python/NavPy/网址信息11.xlsx", index=False)
driver.quit()