代码拉取完成,页面将自动刷新
import hashlib
import sys
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from urllib.parse import urlparse
import requests
from PyQt5.QtGui import QStandardItemModel, QStandardItem, QPixmap, QIcon
from bs4 import BeautifulSoup
from PyQt5.QtWidgets import QWidget, QApplication, QFileDialog
from ImageParserForm import Ui_Form
import threading
from pathlib import Path
import os
# Browser-like HTTP header values shared by the page and image requests
# issued in this module.
header = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36'
    ),
}
def download(args):
    """Download one image into its album folder and list it in the UI.

    Intended to be submitted to the parser's thread pool. Whatever happens
    (success, skip, or error) the parser is told the image is finished so
    its pending-image counter stays balanced.

    Args:
        args: ``(src, parser)`` pair. ``src`` is the absolute image URL.
            ``parser`` must provide ``get_dir()``, ``down_and_notify(src)``
            and a ``ui`` attribute exposing ``addItem(item)``.
    """
    src, parser = args[0], args[1]
    try:
        digest = hashlib.md5()
        # Images that share the same URL "directory" land in one album
        # folder, named by the MD5 of that directory part.
        digest.update(src[0:src.rindex('/')].encode("utf-8"))
        album = digest.hexdigest()
        album_dir = Path(parser.get_dir()) / album
        # parents=True: the download root itself may not exist yet.
        album_dir.mkdir(parents=True, exist_ok=True)
        # The same digest object keeps accumulating (directory + full URL);
        # this is preserved so file names match those of earlier runs.
        digest.update(src.encode("utf-8"))
        filename = digest.hexdigest() + os.path.splitext(src)[1]
        target = album_dir / filename
        if not target.is_file():
            print('download:%s' % src)
            # `headers=` must be passed by keyword: the second positional
            # argument of requests.get is `params` (the query string).
            resp = requests.get(src, headers=header, timeout=30)
            # Do not store an HTTP error page as if it were an image.
            resp.raise_for_status()
            with open(target, 'wb') as f:
                f.write(resp.content)
            # NOTE(review): these Qt objects are created on a worker thread;
            # confirm this is safe for the Qt version in use or marshal the
            # update to the GUI thread.
            img = QPixmap()
            img.loadFromData(resp.content)
            item = QStandardItem(src)
            item.setIcon(QIcon(img))
            parser.ui.addItem(item)
    except Exception as e:
        # Worker-thread boundary: report and carry on with other images.
        print(e)
    finally:
        parser.down_and_notify(src)
class ImageParser(threading.Thread):
    """Background crawler that walks one site and downloads its images.

    The thread reads the start URL and target directory from ``ui``, then
    processes a queue of page URLs. For every page it submits the images it
    finds to a small thread pool, probes for numbered sibling pages, and
    queues same-site links. After each page it waits until all submitted
    downloads have reported back through ``down_and_notify``.

    NOTE(review): ``ui`` is a Qt widget that is read and updated from this
    non-GUI thread; confirm that is acceptable or route it through signals.
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, ui):
        """Create an idle parser bound to *ui* (see ``start_parse`` for use)."""
        super().__init__()
        self.ui = ui
        self.dir = None
        self.pool = ThreadPoolExecutor(max_workers=3)
        self.loadingStatus = False
        self.urlQue = Queue()
        # Every page URL that was queued or already probed.
        self.urlSet = set()
        # Image URLs whose download finished (successfully or not).
        self.doneImgSet = set()
        # Image URLs submitted to the pool that have not reported back yet.
        self.pendingImgSet = set()
        self.domain = None
        # Number of submitted downloads still outstanding.
        self.imgNum = 0
        # Guards imgNum and the image sets; signalled when imgNum reaches 0.
        self.condition = threading.Condition()

    def _fetch(self, url):
        """GET *url* with the shared request headers and a timeout."""
        # `headers=` by keyword: the second positional argument of
        # requests.get is `params`, which would put them in the query string.
        return requests.get(url, headers=header, timeout=self.REQUEST_TIMEOUT)

    def parse(self, url):
        """Fetch one page, then collect its images, siblings and links."""
        print('parse:%s' % url)
        resp = self._fetch(url)
        if resp.status_code != 200:
            return
        soup = self.getSoup(resp)
        self.parse_imgs(soup)
        self.try_links(url)
        self.parse_links(soup, url)

    def getSoup(self, req):
        """Return a BeautifulSoup tree for the response *req*.

        When requests reports ISO-8859-1 (its fallback when the server
        declares no charset), the encoding declared inside the document, or
        else the detected one, is used to decode the body.
        """
        if req.encoding != 'ISO-8859-1':
            return BeautifulSoup(req.text, 'html.parser')
        declared = requests.utils.get_encodings_from_content(req.text)
        encoding = declared[0] if declared else req.apparent_encoding
        try:
            html = req.content.decode(encoding or 'utf-8', 'replace')
        except LookupError:
            # The page declared an encoding name Python does not know.
            html = req.content.decode('utf-8', 'replace')
        # Hand over text rather than re-encoded bytes so the parser does not
        # have to guess the encoding a second time.
        return BeautifulSoup(html, 'html.parser')

    def _probe_pages(self, url_for, index, step):
        """Fetch consecutive numbered pages until one is missing.

        Starting at *index* and moving by *step*, every page that answers
        with HTTP 200 has its images collected and is recorded in ``urlSet``.
        Probing stops at a non-200 answer, at an already known URL, when the
        index drops to zero, or when the crawl has been stopped.
        """
        while index > 0 and self.loadingStatus:
            candidate = url_for(index)
            if candidate in self.urlSet:
                break
            resp = self._fetch(candidate)
            if resp.status_code != 200:
                break
            self.parse_imgs(self.getSoup(resp))
            self.urlSet.add(candidate)
            index += step

    def try_links(self, url):
        """Guess and crawl paginated siblings of an ``.html`` page.

        Tries ``<stem>_N.html`` and ``<stem>/N.html`` from N = 2 upwards and,
        when the file name itself is a number, the neighbouring numbers in
        both directions.
        """
        parts = os.path.splitext(url)
        stem, ext = parts
        if not ext.endswith('.html'):
            return
        self.try_url('%s_%s.html', parts)
        self.try_url('%s/%s.html', parts)
        page = stem.split('/')[-1]
        if not page.isdigit():
            return
        prefix = stem[0:-len(page)]
        current = int(page)

        def numbered(index):
            return '%s%s.html' % (prefix, index)

        self._probe_pages(numbered, current - 1, -1)
        self._probe_pages(numbered, current + 1, 1)

    def try_url(self, pt, s):
        """Probe pages built from pattern *pt*, counting up from 2.

        Args:
            pt: ``%``-format pattern taking ``(stem, number)``.
            s: ``os.path.splitext`` result of the current page URL.
        """
        self._probe_pages(lambda index: pt % (s[0], index), 2, 1)

    def parse_links(self, doc, base_url=None):
        """Queue every not yet seen same-site link found in *doc*.

        Args:
            doc: parsed page exposing ``find_all``.
            base_url: URL of the page *doc* came from; relative links are
                resolved against it. Defaults to the site root.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urljoin

        root = '%s://%s/' % (self.domain.scheme, self.domain.netloc)
        base = base_url or root
        for link in doc.find_all('a'):
            href = link.attrs.get('href')
            if not href:
                continue
            try:
                target = urljoin(base, href.strip())
                target_parts = urlparse(target)
            except ValueError:
                # Malformed URL in the page; skip just this link.
                continue
            # Drops javascript:, mailto:, tel: and similar pseudo-links.
            if target_parts.scheme not in ('http', 'https'):
                continue
            if self.domain.netloc not in target_parts.netloc:
                continue
            # Query strings and fragments do not identify a different page
            # for this crawler. A trailing '/' is not normalized, so 'a/' and
            # 'a' count as different pages.
            target = target.split('#')[0].split('?')[0]
            if target not in self.urlSet:
                print(target)
                self.urlSet.add(target)
                self.urlQue.put(target)

    def parse_imgs(self, doc):
        """Submit every usable ``<img src>`` in *doc* for download.

        Absolute, protocol-relative (``//host/...``) and root-relative
        (``/path``) sources are handled; other relative sources are skipped.
        Nothing is submitted once the crawl has been stopped.
        """
        if not self.loadingStatus:
            return
        root = '%s://%s' % (self.domain.scheme, self.domain.netloc)
        for img in doc.find_all('img'):
            raw = img.attrs.get('src')
            if not raw:
                continue
            src = raw.split('?')[0]
            if src.startswith('//'):
                src = '%s:%s' % (self.domain.scheme, src)
            elif src.startswith('/'):
                src = root + src
            elif not src.startswith(('http://', 'https://')):
                continue
            self.add_img(src)

    def down_and_notify(self, src):
        """Record that the download of *src* ended and wake the crawler."""
        with self.condition:
            print("down img:[%s]%s" % (self.imgNum, src))
            self.imgNum -= 1
            self.pendingImgSet.discard(src)
            self.doneImgSet.add(src)
            if self.imgNum <= 0:
                self.condition.notify_all()

    def add_img(self, src):
        """Submit *src* to the download pool unless it is done or pending."""
        with self.condition:
            if src in self.doneImgSet or src in self.pendingImgSet:
                return
            self.pendingImgSet.add(src)
            self.imgNum += 1
            self.pool.submit(download, (src, self))
            print("add img:[%s]%s" % (self.imgNum, src))

    def run(self):
        """Thread entry point: crawl, keeping the UI buttons in sync."""
        self.ui.set_loading_status(True)
        try:
            self.start_parse()
        finally:
            # Re-enable the start button even if the crawl setup failed.
            self.ui.set_loading_status(False)

    def start_parse(self):
        """Crawl from the URL in the UI until the queue is empty or stopped."""
        url = self.ui.get_url()
        self.dir = self.ui.get_dir()
        Path(self.dir).mkdir(parents=True, exist_ok=True)
        self.domain = urlparse(url)
        # Remember the start page so a link back to it is not crawled again.
        self.urlSet.add(url)
        self.urlQue.put(url)
        self.set_loading_status(True)
        while not self.urlQue.empty() and self.loadingStatus:
            try:
                self.parse(self.urlQue.get())
            except Exception as e:
                # One broken page must not end the whole crawl.
                print(e)
            with self.condition:
                # Loop, not `if`: re-check the state after every wake-up.
                while self.loadingStatus and self.imgNum > 0:
                    self.condition.wait()
        self.set_loading_status(False)
        # No further submissions happen; let idle workers exit. Downloads
        # that are already queued still run.
        self.pool.shutdown(wait=False)
        print('finish parse')

    def set_loading_status(self, status):
        """Set the running flag checked by the crawl loops."""
        self.loadingStatus = status

    def stop_parse(self):
        """Ask the crawl to stop and wake it if it is waiting on downloads."""
        self.set_loading_status(False)
        with self.condition:
            self.condition.notify_all()

    def get_dir(self):
        """Return the download root directory (``None`` before the crawl)."""
        return self.dir
class ImageParserWin(QWidget, Ui_Form):
    """Main window: collects the URL and target directory, starts and stops
    the crawler, and lists the downloaded images."""

    def __init__(self):
        """Build the form, wire the buttons and set the initial state."""
        super().__init__()
        self.setupUi(self)
        self.dirLineEdit.setText('D:/pyimages')
        self.choosePushButton.clicked.connect(self.choose_dir)
        self.startPushButton.clicked.connect(self.start)
        self.stopPushButton.clicked.connect(self.stop)
        self.stopPushButton.setEnabled(False)
        self.model = QStandardItemModel()
        self.listView.setModel(self.model)
        self.parser = None
        # Serializes appendRow calls made through addItem.
        self.lock = threading.Lock()

    def get_url(self):
        """Return the start URL typed into the address field."""
        return self.addrLineEdit.text()

    def get_dir(self):
        """Return the download directory shown in the directory field."""
        return self.dirLineEdit.text()

    def choose_dir(self):
        """Let the user pick the download directory."""
        chosen = QFileDialog.getExistingDirectory(None, '存放目录', ".")
        # The dialog returns an empty string when cancelled; keep the
        # current directory in that case instead of blanking the field.
        if chosen:
            self.dirLineEdit.setText(chosen)

    def start(self):
        """Start a new crawl unless one is still running."""
        # The start button is only disabled once the crawler thread runs,
        # so a quick second click could otherwise launch two crawlers.
        if self.parser is not None and self.parser.is_alive():
            return
        self.parser = ImageParser(self)
        self.parser.start()

    def set_loading_status(self, status):
        """Enable Stop and disable Start while *status* is true, else the reverse."""
        self.startPushButton.setEnabled(not status)
        self.stopPushButton.setEnabled(status)

    def stop(self):
        """Ask the current crawl, if any, to stop."""
        if self.parser is not None:
            self.parser.stop_parse()

    def addItem(self, item):
        """Append *item* to the image list model under the window's lock."""
        with self.lock:
            self.model.appendRow(item)

    def closeEvent(self, QCloseEvent):
        """Stop the crawl when the window is closed."""
        self.stop()
def main():
    """Create the Qt application, show the main window and run the event
    loop; return the loop's exit code."""
    application = QApplication(sys.argv)
    window = ImageParserWin()
    window.show()
    return application.exec_()


if __name__ == '__main__':
    sys.exit(main())
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。