1 Star 0 Fork 137

听蝉鸣/myProfile

forked from goodfeng/myProfile 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
but.py 5.00 KB
一键复制 编辑 原始数据 按行查看 历史
听蝉鸣 提交于 2个月前 . 11111
from flask import Flask, request, jsonify, send_from_directory, render_template
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
# Flask application instance; routes below are registered against it.
app = Flask(__name__)
# Directory where downloaded profile images are written; files here are
# served back to clients by the /get-image/<filename> route.
app.config['UPLOAD_FOLDER'] = 'static/resume_images'
# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
@app.route('/')
def index():
    """Render and return the landing page (templates/index.html)."""
    return render_template('index.html')
def _decode_response(response):
    """Best-effort decode of a response body that is expected to be Chinese.

    Tries common Chinese encodings and accepts the first one that decodes
    cleanly AND contains high-frequency Chinese characters (a plausibility
    heuristic). Falls back to requests' own charset handling.

    Note: latin-1/iso-8859-1 is deliberately NOT in the trial list — it
    decodes any byte sequence without error, which would make the fallback
    unreachable and guarantee mojibake for non-Chinese pages.
    """
    for encoding in ('utf-8', 'gbk', 'gb2312', 'big5'):
        try:
            text = response.content.decode(encoding)
        except UnicodeDecodeError:
            continue
        if '的' in text or '是' in text or '我' in text:
            return text
    # No candidate looked like valid Chinese text; defer to requests'
    # header/charset based decoding.
    return response.text


def _extract_links(soup, base_url):
    """Return [{'text', 'url'}] for every <a href>, hrefs made absolute."""
    return [
        {
            "text": link.get_text().strip(),
            "url": urljoin(base_url, link['href']),
        }
        for link in soup.find_all('a', href=True)
    ]


def _extract_images(soup, base_url, headers):
    """Return metadata for every <img>; images whose class list contains
    'profile' are additionally downloaded into UPLOAD_FOLDER.

    Download failures are recorded per-image (best effort), never raised.
    """
    images = []
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if not src:
            continue
        absolute_src = urljoin(base_url, src)
        img_data = {
            "src": absolute_src,
            "alt": img.get('alt', ''),
            "class": img.get('class', []),
        }
        if 'profile' in img.get('class', []):
            # Timestamped name avoids clobbering earlier downloads.
            img_filename = f"profile_{datetime.now().strftime('%Y%m%d%H%M%S')}.jpg"
            img_path = os.path.join(app.config['UPLOAD_FOLDER'], img_filename)
            try:
                img_response = requests.get(absolute_src, headers=headers, timeout=5)
                if img_response.status_code == 200:
                    with open(img_path, 'wb') as f:
                        f.write(img_response.content)
                    img_data['downloaded'] = True
                    img_data['local_path'] = img_path
                else:
                    img_data['downloaded'] = False
            except Exception as e:
                # Best effort: record the error and keep crawling.
                img_data['error'] = str(e)
        images.append(img_data)
    return images


def _extract_text(soup):
    """Return [{'tag', 'text'}] for every non-empty paragraph/heading."""
    results = []
    for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.get_text().strip()
        if text:
            results.append({"tag": element.name, "text": text})
    return results


def _extract_forms(soup):
    """Return action/method/inputs metadata for every <form> on the page."""
    forms = []
    for form in soup.find_all('form'):
        form_data = {
            "action": form.get('action', ''),
            "method": form.get('method', 'get').lower(),
            "inputs": [
                {
                    "name": input_tag.get('name', ''),
                    "type": input_tag.get('type', 'text'),
                    "value": input_tag.get('value', ''),
                }
                for input_tag in form.find_all('input')
            ],
        }
        forms.append(form_data)
    return forms


@app.route('/crawl-ip-site', methods=['GET'])
def crawl_ip_site():
    """Crawl a page and return its title, links, images, text and forms as JSON.

    Query params:
        url: page to crawl (defaults to http://119.23.145.136).

    Returns:
        200 with the extracted data on success;
        500 with a JSON error message on network failure or any other error.
    """
    # Get URL from query parameter or use default
    url = request.args.get('url', 'http://119.23.145.136')
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(_decode_response(response), 'html.parser')
        title = soup.find('title')
        crawled_data = {
            "page_title": title.get_text().strip() if title else "No Title Found",
            "links": _extract_links(soup, url),
            "images": _extract_images(soup, url, headers),
            "text_content": _extract_text(soup),
            "forms": _extract_forms(soup),
        }
        return jsonify(crawled_data)
    except requests.exceptions.RequestException as e:
        return jsonify({"error": f"请求失败: {str(e)}"}), 500
    except Exception as e:
        return jsonify({"error": f"发生错误: {str(e)}"}), 500
# Route to serve downloaded images
# Route to serve downloaded images
@app.route('/get-image/<filename>')
def get_image(filename):
    """Serve a previously downloaded image from the upload folder.

    send_from_directory refuses paths that escape UPLOAD_FOLDER, so the
    user-supplied filename cannot traverse outside it.
    """
    return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger and reloader —
    # fine for local development, must not be used in production.
    app.run(debug=True)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/listen-to-can/my-profile.git
git@gitee.com:listen-to-can/my-profile.git
listen-to-can
my-profile
myProfile
master

搜索帮助