master

分支 (1)

管理

管理

master

profile-spider
/
app.py

from flask import Flask, render_template, request, jsonify, session,send_from_directory
import requests
from bs4 import BeautifulSoup
import json,os
from spider import *

app = Flask(__name__)

app.secret_key = 'your_secret_key_here'  # 用于session加密

# 你的原始简历数据
with open('resume_data_zhou/resume.json', 'r', encoding='utf-8') as f:
    MY_RESUME = json.load(f)

# 设置静态文件路由
@app.route('/resume_data/images/<path:filename>')
def serve_resume_images(filename):
    return send_from_directory('resume_data/images', filename)

@app.route('/resume_data_zhou/images/<path:filename>')
def serve_resume_images_zhou(filename):
    return send_from_directory('resume_data_zhou/images', filename)

@app.route('/')
def home():
    """ 默认显示自己的简历或爬取到的简历 """
    # 从session中获取爬取的数据，如果没有则显示自己的简历
    crawled_data = session.get('crawled_resume', None)
    return render_template('resume.html',
                         resume=crawled_data if crawled_data else MY_RESUME,
                         is_crawled=bool(crawled_data))

@app.route('/reset', methods=['POST'])
def reset_resume():
    """ 清除爬取的简历数据 """
    session.pop('crawled_resume', None)
    return jsonify({"success": True})

@app.route('/crawl', methods=['POST'])
def crawl_resume():
    """ 爬取目标URL的简历 """
    target_url = request.form.get('url')

    try:
        # 调用爬虫函数
        crawled_data = scrape_resume(target_url)
        # 将爬取的数据存入session
        session['crawled_resume'] = crawled_data
        return jsonify({"success": True, "redirect": "/"})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

def scrape_resume(url):
    # 创建保存图片的文件夹
    if not os.path.exists('resume_data'):
        os.makedirs('resume_data')
    if not os.path.exists('resume_data/images'):
        os.makedirs('resume_data/images')

    try:
        # 发送HTTP请求
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'

        # 解析HTML内容
        soup = BeautifulSoup(response.text, 'html.parser')

        # 提取的数据字典
        resume_data = {
            'basic_info': {},
            'education': [],
            'skills': [],
            'projects': [],
            'awards': [],
            'self_evaluation': ''
        }

        # 1. 提取基本信息
        header = soup.find('header')
        if header:
            # 提取个人照片
            profile_img = header.find('img', class_='profile-img')
            if profile_img and profile_img.get('src'):
                img_url = urljoin(url, profile_img['src'])
                download_image(img_url, 'profile.jpg')
                resume_data['basic_info']['photo'] = 'resume_data/images/profile.jpg'

            # 提取姓名和求职意向
            name = header.find('h1').get_text(strip=True) if header.find('h1') else ""
            job_intention = header.find('p').get_text(strip=True) if header.find('p') else ""

            # 提取联系方式
            contact_info = {}
            contact_div = header.find('div', class_='contact-info')
            if contact_div:
                for p in contact_div.find_all('p'):
                    text = p.get_text(strip=True)
                    if '电话:' in text:
                        contact_info['phone'] = text.replace('电话:', '').strip()
                    elif '邮箱:' in text:
                        contact_info['email'] = text.replace('邮箱:', '').strip()
                    elif 'Gitee:' in text:
                        contact_info['gitee'] = p.find('a')['href'] if p.find('a') else text.replace('Gitee:', '').strip()

            resume_data['basic_info'].update({
                'name': name,
                'job_intention': job_intention,
                'contact': contact_info
            })

        # 2. 提取教育背景
        education_section = soup.find('section', string='教育背景') or soup.find('h2', string='教育背景').find_parent('section')
        if education_section:
            content = education_section.find('div', class_='content')
            if content:
                education = {
                    'school': content.find('p').get_text(strip=True) if content.find('p') else "",
                    'period': content.find_all('p')[1].get_text(strip=True) if len(content.find_all('p')) > 1 else "",
                    'courses': content.find_all('p')[2].get_text(strip=True) if len(content.find_all('p')) > 2 else ""
                }
                resume_data['education'].append(education)

        # 3. 提取技能专长
        skills_section = soup.find('section', string='技能专长') or soup.find('h2', string='技能专长').find_parent('section')
        if skills_section:
            content = skills_section.find('div', class_='content')
            if content and content.find('ul'):
                skills = [li.get_text(strip=True) for li in content.find('ul').find_all('li')]
                resume_data['skills'] = skills

        # 4. 提取项目经验
        projects_section = soup.find('section', string='项目经验') or soup.find('h2', string='项目经验').find_parent('section')
        if projects_section:
            for content in projects_section.find_all('div', class_='content'):
                project = {
                    'name': content.find('h3').get_text(strip=True) if content.find('h3') else "",
                    'period': "",
                    'description': "",
                    'responsibility': "",
                    'tech_stack': ""
                }

                paragraphs = content.find_all('p')
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if '项目时间:' in text:
                        project['period'] = text.replace('项目时间:', '').strip()
                    elif '项目描述:' in text:
                        project['description'] = text.replace('项目描述:', '').strip()
                    elif '个人职责:' in text:
                        project['responsibility'] = text.replace('个人职责:', '').strip()
                    elif '技术栈:' in text:
                        project['tech_stack'] = text.replace('技术栈:', '').strip()

                resume_data['projects'].append(project)

        # 5. 提取获奖情况
        awards_section = soup.find('section', string='获奖情况') or soup.find('h2', string='获奖情况').find_parent('section')
        if awards_section:
            content = awards_section.find('div', class_='content')
            if content and content.find('ul'):
                awards = [li.get_text(strip=True) for li in content.find('ul').find_all('li')]
                resume_data['awards'] = awards

        # 6. 提取自我评价
        evaluation_section = soup.find('section', string='自我评价') or soup.find('h2', string='自我评价').find_parent('section')
        if evaluation_section:
            content = evaluation_section.find('div', class_='content')
            if content:
                resume_data['self_evaluation'] = content.get_text(strip=True)


        # 保存提取的数据到文件
        save_data(resume_data)

        print("爬取完成！数据已保存到 resume_data 文件夹")
        return resume_data

    except requests.exceptions.RequestException as e:
        print(f"请求出错: {e}")
    except Exception as e:
        print(f"发生错误: {e}")

if __name__ == '__main__':
    app.run(debug=True)