1 Star 0 Fork 137

听蝉鸣/myProfile

forked from goodfeng/myProfile 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
but.py 5.00 KB
一键复制 编辑 原始数据 按行查看 历史
听蝉鸣 提交于 2个月前 . 11111
from flask import Flask, request, jsonify, send_from_directory, render_template
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
# Flask application instance; routes below are registered against it.
app = Flask(__name__)
# Directory where downloaded profile images are written; files here are
# served back to clients by the /get-image/<filename> route.
app.config['UPLOAD_FOLDER'] = 'static/resume_images'
# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
@app.route('/')
def index():
    """Render and return the landing page (templates/index.html)."""
    return render_template('index.html')
def _decode_response(response):
    """Best-effort decode of a response body that is expected to be Chinese.

    Tries common Chinese encodings and accepts the first one that decodes
    cleanly AND contains high-frequency Chinese characters (a plausibility
    heuristic). Falls back to requests' own charset handling.

    Note: latin-1/iso-8859-1 is deliberately NOT in the trial list — it
    decodes any byte sequence without error, which would make the fallback
    unreachable and guarantee mojibake for non-Chinese pages.
    """
    for encoding in ('utf-8', 'gbk', 'gb2312', 'big5'):
        try:
            text = response.content.decode(encoding)
        except UnicodeDecodeError:
            continue
        if '的' in text or '是' in text or '我' in text:
            return text
    # No candidate looked like valid Chinese text; defer to requests'
    # header/charset based decoding.
    return response.text


def _extract_links(soup, base_url):
    """Return [{'text', 'url'}] for every <a href>, hrefs made absolute."""
    return [
        {
            "text": link.get_text().strip(),
            "url": urljoin(base_url, link['href']),
        }
        for link in soup.find_all('a', href=True)
    ]


def _extract_images(soup, base_url, headers):
    """Return metadata for every <img>; images whose class list contains
    'profile' are additionally downloaded into UPLOAD_FOLDER.

    Download failures are recorded per-image (best effort), never raised.
    """
    images = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if not src:
            continue
        absolute_src = urljoin(base_url, src)
        img_data = {
            "src": absolute_src,
            "alt": img.get('alt', ''),
            "class": img.get('class', []),
        }
        if 'profile' in img.get('class', []):
            # Timestamped name avoids clobbering earlier downloads.
            img_filename = f"profile_{datetime.now().strftime('%Y%m%d%H%M%S')}.jpg"
            img_path = os.path.join(app.config['UPLOAD_FOLDER'], img_filename)
            try:
                img_response = requests.get(absolute_src, headers=headers, timeout=5)
                if img_response.status_code == 200:
                    with open(img_path, 'wb') as f:
                        f.write(img_response.content)
                    img_data['downloaded'] = True
                    img_data['local_path'] = img_path
                else:
                    img_data['downloaded'] = False
            except Exception as e:
                # Best effort: record the error and keep crawling.
                img_data['error'] = str(e)
        images.append(img_data)
    return images


def _extract_text(soup):
    """Return [{'tag', 'text'}] for every non-empty paragraph/heading."""
    results = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text().strip()
        if text:
            results.append({"tag": element.name, "text": text})
    return results


def _extract_forms(soup):
    """Return action/method/inputs metadata for every <form> on the page."""
    forms = []
    for form in soup.find_all('form'):
        form_data = {
            "action": form.get('action', ''),
            "method": form.get('method', 'get').lower(),
            "inputs": [
                {
                    "name": input_tag.get('name', ''),
                    "type": input_tag.get('type', 'text'),
                    "value": input_tag.get('value', ''),
                }
                for input_tag in form.find_all('input')
            ],
        }
        forms.append(form_data)
    return forms


@app.route('/crawl-ip-site', methods=['GET'])
def crawl_ip_site():
    """Crawl a page and return its title, links, images, text and forms as JSON.

    Query params:
        url: page to crawl (defaults to http://119.23.145.136).

    Returns:
        200 with the extracted data on success;
        500 with a JSON error message on network failure or any other error.
    """
    # Get URL from query parameter or use default
    url = request.args.get('url', 'http://119.23.145.136')
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(_decode_response(response), 'html.parser')
        title = soup.find('title')
        crawled_data = {
            "page_title": title.get_text().strip() if title else "No Title Found",
            "links": _extract_links(soup, url),
            "images": _extract_images(soup, url, headers),
            "text_content": _extract_text(soup),
            "forms": _extract_forms(soup),
        }
        return jsonify(crawled_data)
    except requests.exceptions.RequestException as e:
        return jsonify({"error": f"请求失败: {str(e)}"}), 500
    except Exception as e:
        return jsonify({"error": f"发生错误: {str(e)}"}), 500
# Route to serve downloaded images
# Route to serve downloaded images
@app.route('/get-image/<filename>')
def get_image(filename):
    """Serve a previously downloaded image from the upload folder.

    send_from_directory refuses paths that escape UPLOAD_FOLDER, so the
    user-supplied filename cannot traverse outside it.
    """
    return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger and reloader —
    # fine for local development, must not be used in production.
    app.run(debug=True)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/listen-to-can/my-profile.git
git@gitee.com:listen-to-can/my-profile.git
listen-to-can
my-profile
myProfile
master

搜索帮助