代码拉取完成,页面将自动刷新
from flask import Flask, render_template, request, jsonify, session,send_from_directory
import requests
from bs4 import BeautifulSoup
import json,os
from spider import *
app = Flask(__name__)
app.secret_key = 'your_secret_key_here' # 用于session加密
# 你的原始简历数据
with open('resume_data_zhou/resume.json', 'r', encoding='utf-8') as f:
MY_RESUME = json.load(f)
# 设置静态文件路由
@app.route('/resume_data/images/<path:filename>')
def serve_resume_images(filename):
return send_from_directory('resume_data/images', filename)
@app.route('/resume_data_zhou/images/<path:filename>')
def serve_resume_images_zhou(filename):
return send_from_directory('resume_data_zhou/images', filename)
@app.route('/')
def home():
""" 默认显示自己的简历或爬取到的简历 """
# 从session中获取爬取的数据,如果没有则显示自己的简历
crawled_data = session.get('crawled_resume', None)
return render_template('resume.html',
resume=crawled_data if crawled_data else MY_RESUME,
is_crawled=bool(crawled_data))
@app.route('/reset', methods=['POST'])
def reset_resume():
""" 清除爬取的简历数据 """
session.pop('crawled_resume', None)
return jsonify({"success": True})
@app.route('/crawl', methods=['POST'])
def crawl_resume():
""" 爬取目标URL的简历 """
target_url = request.form.get('url')
try:
# 调用爬虫函数
crawled_data = scrape_resume(target_url)
# 将爬取的数据存入session
session['crawled_resume'] = crawled_data
return jsonify({"success": True, "redirect": "/"})
except Exception as e:
return jsonify({"error": str(e)}), 500
def scrape_resume(url):
# 创建保存图片的文件夹
if not os.path.exists('resume_data'):
os.makedirs('resume_data')
if not os.path.exists('resume_data/images'):
os.makedirs('resume_data/images')
try:
# 发送HTTP请求
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
# 解析HTML内容
soup = BeautifulSoup(response.text, 'html.parser')
# 提取的数据字典
resume_data = {
'basic_info': {},
'education': [],
'skills': [],
'projects': [],
'awards': [],
'self_evaluation': ''
}
# 1. 提取基本信息
header = soup.find('header')
if header:
# 提取个人照片
profile_img = header.find('img', class_='profile-img')
if profile_img and profile_img.get('src'):
img_url = urljoin(url, profile_img['src'])
download_image(img_url, 'profile.jpg')
resume_data['basic_info']['photo'] = 'resume_data/images/profile.jpg'
# 提取姓名和求职意向
name = header.find('h1').get_text(strip=True) if header.find('h1') else ""
job_intention = header.find('p').get_text(strip=True) if header.find('p') else ""
# 提取联系方式
contact_info = {}
contact_div = header.find('div', class_='contact-info')
if contact_div:
for p in contact_div.find_all('p'):
text = p.get_text(strip=True)
if '电话:' in text:
contact_info['phone'] = text.replace('电话:', '').strip()
elif '邮箱:' in text:
contact_info['email'] = text.replace('邮箱:', '').strip()
elif 'Gitee:' in text:
contact_info['gitee'] = p.find('a')['href'] if p.find('a') else text.replace('Gitee:', '').strip()
resume_data['basic_info'].update({
'name': name,
'job_intention': job_intention,
'contact': contact_info
})
# 2. 提取教育背景
education_section = soup.find('section', string='教育背景') or soup.find('h2', string='教育背景').find_parent('section')
if education_section:
content = education_section.find('div', class_='content')
if content:
education = {
'school': content.find('p').get_text(strip=True) if content.find('p') else "",
'period': content.find_all('p')[1].get_text(strip=True) if len(content.find_all('p')) > 1 else "",
'courses': content.find_all('p')[2].get_text(strip=True) if len(content.find_all('p')) > 2 else ""
}
resume_data['education'].append(education)
# 3. 提取技能专长
skills_section = soup.find('section', string='技能专长') or soup.find('h2', string='技能专长').find_parent('section')
if skills_section:
content = skills_section.find('div', class_='content')
if content and content.find('ul'):
skills = [li.get_text(strip=True) for li in content.find('ul').find_all('li')]
resume_data['skills'] = skills
# 4. 提取项目经验
projects_section = soup.find('section', string='项目经验') or soup.find('h2', string='项目经验').find_parent('section')
if projects_section:
for content in projects_section.find_all('div', class_='content'):
project = {
'name': content.find('h3').get_text(strip=True) if content.find('h3') else "",
'period': "",
'description': "",
'responsibility': "",
'tech_stack': ""
}
paragraphs = content.find_all('p')
for p in paragraphs:
text = p.get_text(strip=True)
if '项目时间:' in text:
project['period'] = text.replace('项目时间:', '').strip()
elif '项目描述:' in text:
project['description'] = text.replace('项目描述:', '').strip()
elif '个人职责:' in text:
project['responsibility'] = text.replace('个人职责:', '').strip()
elif '技术栈:' in text:
project['tech_stack'] = text.replace('技术栈:', '').strip()
resume_data['projects'].append(project)
# 5. 提取获奖情况
awards_section = soup.find('section', string='获奖情况') or soup.find('h2', string='获奖情况').find_parent('section')
if awards_section:
content = awards_section.find('div', class_='content')
if content and content.find('ul'):
awards = [li.get_text(strip=True) for li in content.find('ul').find_all('li')]
resume_data['awards'] = awards
# 6. 提取自我评价
evaluation_section = soup.find('section', string='自我评价') or soup.find('h2', string='自我评价').find_parent('section')
if evaluation_section:
content = evaluation_section.find('div', class_='content')
if content:
resume_data['self_evaluation'] = content.get_text(strip=True)
# 保存提取的数据到文件
save_data(resume_data)
print("爬取完成!数据已保存到 resume_data 文件夹")
return resume_data
except requests.exceptions.RequestException as e:
print(f"请求出错: {e}")
except Exception as e:
print(f"发生错误: {e}")
if __name__ == '__main__':
app.run(debug=True)
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。