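"""Flask demo: crawl a target page with requests + BeautifulSoup and return
its title, links, images, text content, and forms as JSON. Images whose CSS
class contains "profile" are downloaded into static/resume_images and served
back via /get-image/<filename>."""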
from flask import Flask, request, jsonify, send_from_directory, render_template
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'static/resume_images'

# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/crawl-ip-site', methods=['GET'])
def crawl_ip_site():
    # Get URL from query parameter or use default
    url = request.args.get('url', 'http://119.23.145.136')
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Encoding handling: try common encodings for Chinese pages and accept
        # a decoding once it yields frequent Chinese characters ('的', '是', '我').
        # Note that iso-8859-1 decodes any byte sequence without error, so it
        # also serves as the last-resort fallback in this list.
        encodings_to_try = ['utf-8', 'gbk', 'gb2312', 'big5', 'iso-8859-1']
        decoded_text = None
        for encoding in encodings_to_try:
            try:
                decoded_text = response.content.decode(encoding)
                if '的' in decoded_text or '是' in decoded_text or '我' in decoded_text:
                    break
            except UnicodeDecodeError:
                continue
        if decoded_text is None:
            decoded_text = response.text
        soup = BeautifulSoup(decoded_text, 'html.parser')

        # Initialize data structure to store crawled content
        crawled_data = {
            "page_title": "",
            "links": [],
            "images": [],
            "text_content": [],
            "forms": []
        }

        # Get page title
        title = soup.find('title')
        crawled_data['page_title'] = title.get_text().strip() if title else "No Title Found"

        # Extract all links
        for link in soup.find_all('a', href=True):
            href = link['href']
            absolute_url = urljoin(url, href)
            crawled_data['links'].append({
                "text": link.get_text().strip(),
                "url": absolute_url
            })

        # Extract all images
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                absolute_src = urljoin(url, src)
                img_data = {
                    "src": absolute_src,
                    "alt": img.get('alt', ''),
                    "class": img.get('class', [])
                }
                # Download image if it's a profile image
                if 'profile' in img.get('class', []):
                    img_filename = f"profile_{datetime.now().strftime('%Y%m%d%H%M%S')}.jpg"
                    img_path = os.path.join(app.config['UPLOAD_FOLDER'], img_filename)
                    try:
                        img_response = requests.get(absolute_src, headers=headers, timeout=5)
                        if img_response.status_code == 200:
                            with open(img_path, 'wb') as f:
                                f.write(img_response.content)
                            img_data['downloaded'] = True
                            img_data['local_path'] = img_path
                        else:
                            img_data['downloaded'] = False
                    except Exception as e:
                        img_data['error'] = str(e)
                crawled_data['images'].append(img_data)

        # Extract text content from paragraphs and headings
        for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text().strip()
            if text:
                crawled_data['text_content'].append({
                    "tag": element.name,
                    "text": text
                })

        # Extract form information
        for form in soup.find_all('form'):
            form_data = {
                "action": form.get('action', ''),
                "method": form.get('method', 'get').lower(),
                "inputs": []
            }
            for input_tag in form.find_all('input'):
                form_data['inputs'].append({
                    "name": input_tag.get('name', ''),
                    "type": input_tag.get('type', 'text'),
                    "value": input_tag.get('value', '')
                })
            crawled_data['forms'].append(form_data)

        return jsonify(crawled_data)
    except requests.exceptions.RequestException as e:
        return jsonify({"error": f"Request failed: {str(e)}"}), 500
    except Exception as e:
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500
# Route to serve downloaded images
@app.route('/get-image/<filename>')
def get_image(filename):
    return send_from_directory(app.config['UPLOAD_FOLDER'], filename)


if __name__ == '__main__':
    app.run(debug=True)
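# Example usage -- a minimal sketch, assuming the Flask dev server runs on its
# default port 5000 and that the target URL below is purely illustrative:
#
#   curl "http://127.0.0.1:5000/crawl-ip-site?url=http://example.com"
#
# The response is a JSON object with the keys page_title, links, images,
# text_content, and forms, as assembled in crawl_ip_site() above.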