# 代码拉取完成,页面将自动刷新  -- stray code-hosting-site banner; commented out so the file parses
import os
import re
import time
import pathlib
import requests
from urllib.parse import urljoin, urlparse, urlsplit, unquote, urlunsplit
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# ====== Configuration ======
start_url = "http://www.weather.com.cn"  # page whose images are harvested
chrome_driver_path = r"D:\HuaweiMoveData\Users\33659\Desktop\python\chromedriver-win64\chromedriver.exe"  # local chromedriver binary
save_dir = pathlib.Path("images单线程")  # output directory for downloaded images
scroll_max_rounds = 19 # scroll at most 19 times (original note: "crawl at most 19 pages")
request_timeout = 20 # per-image download timeout (seconds)
max_images = 119 # download at most 119 images
save_dir.mkdir(parents=True, exist_ok=True)  # ensure the output directory exists before any download
def is_jpg_png(url: str) -> bool:
    """Return True when *url*'s path component names a .jpg/.jpeg/.png file.

    Empty values and inline ``data:image`` URIs are rejected; query strings
    and fragments are ignored because only the path suffix is inspected.
    """
    if not url:
        return False
    cleaned = url.strip().strip('\'"')
    if cleaned.lower().startswith("data:image"):
        return False
    suffix_source = urlsplit(cleaned).path.lower()
    return suffix_source.endswith((".jpg", ".jpeg", ".png"))
def filename_from_url(url: str) -> str:
    """Derive a file name from *url*'s path (percent-decoded), or "image" if empty."""
    raw_basename = os.path.basename(urlsplit(url).path)
    decoded = unquote(raw_basename)
    return decoded if decoded else "image"
def ensure_unique_path(base_dir: pathlib.Path, name: str) -> pathlib.Path:
    """Return a non-existing path under *base_dir* for *name*.

    If ``base_dir/name`` is free it is returned as-is; otherwise a numeric
    suffix is inserted before the extension — ``name(1).ext``, ``name(2).ext``,
    ... — until an unused path is found.
    """
    direct = base_dir / name
    if not direct.exists():
        return direct
    stem, ext = os.path.splitext(name)
    counter = 1
    while True:
        numbered = base_dir / f"{stem}({counter}){ext}"
        if not numbered.exists():
            return numbered
        counter += 1
def normalize_url(u: str, base: str) -> str:
    """Clean a raw href / CSS url() value and resolve it against *base*.

    Strips surrounding whitespace and quotes, unwraps a CSS ``url(...)``
    wrapper, forces ``http:`` onto protocol-relative (``//host/...``) URLs,
    and resolves relative paths with urljoin. Empty input yields "".
    """
    if not u:
        return ""
    cleaned = u.strip().strip('\'"')
    if cleaned.startswith("url(") and cleaned.endswith(")"):
        cleaned = cleaned[4:-1].strip('\'"')
    if cleaned.startswith("//"):
        return "http:" + cleaned
    return urljoin(base, cleaned)
def download_image(url: str, referer: str):
    """Download one image from *url* into the module-level ``save_dir``.

    Streams the response to disk in 8 KiB chunks; when the URL path lacks an
    extension, one is inferred from the Content-Type header. Collisions are
    resolved via ensure_unique_path. Returns True on success, False on any
    failure (the error is printed, never raised, so the crawl continues).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari",
            "Referer": referer,
            "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        }
        # Context manager guarantees the streamed connection is released even
        # when raise_for_status() or the file write raises — the original
        # leaked the pooled connection on every error path.
        with requests.get(url, headers=headers, timeout=request_timeout, stream=True) as resp:
            resp.raise_for_status()
            name = filename_from_url(url)
            if not os.path.splitext(name)[1]:
                # URL path carried no extension: fall back to Content-Type.
                ct = resp.headers.get("Content-Type", "").lower()
                if "png" in ct:
                    name += ".png"
                elif "jpeg" in ct or "jpg" in ct:
                    name += ".jpg"
            path = ensure_unique_path(save_dir, name)
            with open(path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        print(f"[OK] {url} -> {path}")
        return True
    except Exception as e:
        # Broad on purpose: any network, HTTP, or filesystem error just marks
        # this one image as failed.
        print(f"[FAIL] {url} - {e}")
        return False
# ====== Browser setup ======
chrome_options = Options()
# chrome_options.add_argument("--headless=new")  # uncomment to run without a visible window
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")  # avoids /dev/shm exhaustion in containers
service = Service(chrome_driver_path)
# Launches a real Chrome instance; quit in the finally block below.
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    driver.get(start_url)
    # ====== Scroll at most `scroll_max_rounds` (19) times ======
    # Keep scrolling to the bottom until the page height stops growing
    # (no more lazy-loaded content) or the round limit is reached.
    last_height = driver.execute_script("return document.body.scrollHeight;")
    rounds = 0
    while rounds < scroll_max_rounds:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.0)  # give lazy-loaded content a moment to appear
        new_height = driver.execute_script("return document.body.scrollHeight;")
        rounds += 1
        if new_height == last_height:
            break  # page stopped growing — nothing more to load
        last_height = new_height
    # Collect candidate image URLs from three sources, de-duplicated in a set.
    img_urls = set()
    # 1) <img> elements, including common lazy-load attributes.
    img_attrs = ["src", "data-src", "data-original", "data-url", "src2"]
    for attr in img_attrs:
        try:
            elems = driver.find_elements(By.CSS_SELECTOR, f"img[{attr}]")
        except Exception:
            elems = []
        for el in elems:
            try:
                val = el.get_attribute(attr)
                full = normalize_url(val, start_url)
                if is_jpg_png(full):
                    img_urls.add(full)
            except Exception:
                pass  # stale element etc. — skip this candidate
    # 2) CSS background-image on every element (expensive full scan).
    elems = driver.find_elements(By.CSS_SELECTOR, "*")
    for el in elems:
        try:
            bg = el.value_of_css_property("background-image")
            if not bg or bg == "none":
                continue
            for m in re.findall(r'url\((.*?)\)', bg):
                full = normalize_url(m, start_url)
                if is_jpg_png(full):
                    img_urls.add(full)
        except Exception:
            pass  # stale element etc. — skip this candidate
    # 3) Regex over the raw page source, catching markup Selenium missed.
    html = driver.page_source
    for m in re.findall(
        r'<img[^>]+?(?:src|data-src|data-original|data-url|src2)\s*=\s*["\']([^"\']+)["\']',
        html,
        flags=re.IGNORECASE
    ):
        full = normalize_url(m, start_url)
        if is_jpg_png(full):
            img_urls.add(full)
    print(f"共发现候选图片 {len(img_urls)} 张(仅当前首页、JPG/PNG)")
    # ====== Download at most `max_images` (119) images ======
    ok = 0
    for url in sorted(img_urls):
        if ok >= max_images:  # stop once the cap is reached
            print(f"已达到最大下载数量 {max_images} 张,停止下载。")
            break
        if download_image(url, referer=start_url):
            ok += 1
    print(f"下载完成:成功 {ok} / 发现 {len(img_urls)}")
finally:
    driver.quit()  # always close the browser, even on errors above
# --- stray code-hosting-site review boilerplate below; commented out so the file is valid Python ---
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。