数据采集融合第二次代码

### 作业①：中国气象网城市天气预报数据采集与存储

#### 1. 作业代码
```python
import requests
import sqlite3
from bs4 import BeautifulSoup
import logging

# 配置日志记录
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 获取指定城市的天气数据
def fetch_weather_data(city_code, timeout=10):
    """Fetch the forecast entries for one city from weather.com.cn.

    Args:
        city_code: City identifier substituted into the forecast page URL
            (``http://www.weather.com.cn/weather/<city_code>.shtml``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``(date, weather, temperature)`` string tuples, one per
        ``<li>`` of the forecast list, where ``temperature`` is
        ``"<high>/<low>"``. An empty list is returned when the request fails
        or the forecast list cannot be found in the page.
    """
    base_url = 'http://www.weather.com.cn/weather/'
    full_url = f'{base_url}{city_code}.shtml'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/112'
    }

    # Only the network call sits inside the try block so that parsing
    # problems are not mistaken for request failures.
    try:
        response = requests.get(full_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error fetching weather data for city code {city_code}: {e}")
        return []

    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    forecast_container = soup.find('ul', class_='t clearfix')
    if forecast_container is None:
        # Without this guard a changed page layout would raise AttributeError.
        logging.error(f"Forecast list not found for city code {city_code}")
        return []

    weather_forecast = []
    for day in forecast_container.find_all('li'):
        date_tag = day.find('h1')
        weather_tag = day.find('p', class_='wea')
        if date_tag is None or weather_tag is None:
            # Skip list items that are not forecast entries.
            continue

        high_tag = day.find('span')
        low_tag = day.find('i')
        temp_high = high_tag.text.strip() if high_tag else ''
        temp_low = low_tag.text.strip() if low_tag else ''

        weather_forecast.append(
            (date_tag.text.strip(), weather_tag.text.strip(), f"{temp_high}/{temp_low}")
        )

    return weather_forecast

# 创建数据库和表
def setup_database(db_path='weathers.db'):
    """Open the SQLite database and make sure the forecast table exists.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'weathers.db'``; ``':memory:'`` gives a throw-away database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the table cannot be created. The connection is
            closed before the error is re-raised.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
        CREATE TABLE IF NOT EXISTS weather_forecast (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            city TEXT,
            date TEXT,
            weather TEXT,
            temperature TEXT
        )
    ''')
        conn.commit()
    except sqlite3.Error:
        # Do not leak the connection when setup fails.
        conn.close()
        raise
    return conn

# 保存天气数据到数据库
def save_weather_data(city, weather_data, conn):
    """Insert one city's forecast entries into the ``weather_forecast`` table.

    Args:
        city: City name stored in the ``city`` column of every inserted row.
        weather_data: Iterable of ``(date, weather, temperature)`` tuples.
        conn: Open ``sqlite3.Connection`` whose database already contains
            the ``weather_forecast`` table.

    The rows are written in a single transaction: it is committed when all
    inserts succeed and rolled back if any of them raises.
    """
    rows = [(city, date, weather, temp) for date, weather, temp in weather_data]
    # Using the connection as a context manager commits on success and
    # rolls back on error.
    with conn:
        conn.executemany(
            "INSERT INTO weather_forecast (city, date, weather, temperature) VALUES (?,?,?,?)",
            rows,
        )

# 显示数据库中的天气数据
def display_weather_data(conn):
    """Print every row of the ``weather_forecast`` table as aligned columns.

    Args:
        conn: Open ``sqlite3.Connection`` whose database contains the
            ``weather_forecast`` table.

    A header line is printed first, followed by one line per stored row
    with the columns id, city, date, weather and temperature.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM weather_forecast")
    rows = cursor.fetchall()

    print(f"{'序号':<5} {'地区':<10} {'日期':<15} {'天气信息':<20} {'温度':<15}")
    for row in rows:
        print(f"{row[0]:<5} {row[1]:<10} {row[2]:<15} {row[3]:<20} {row[4]:<15}")

def main():
    """Scrape the forecast for each configured city, store it and print it.

    For every entry in the city table the forecast is fetched and, when
    non-empty, saved to the database; a warning is logged otherwise. All
    stored rows are printed at the end. The database connection is closed
    even if one of the steps raises.
    """
    # City names mapped to the codes used in the forecast page URL.
    city_codes = {
        '北京': '101010100',
        '上海': '101020100',
        '福州': '101230101',
        '天津': '101030100'
    }

    conn = setup_database()
    try:
        # Fetch and persist the forecast of each city.
        for city, city_code in city_codes.items():
            weather_data = fetch_weather_data(city_code)
            if weather_data:
                save_weather_data(city, weather_data, conn)
            else:
                logging.warning(f"No weather data for city {city}")

        # Show everything currently stored in the database.
        display_weather_data(conn)
    finally:
        conn.close()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
```

#### 2. 心得体会
体会到良好编程实践（如模块化设计、错误处理、日志记录和数据验证）对构建健壮、可维护应用程序的重要性。模块化设计便于理解与扩展代码；错误处理让程序能优雅应对异常；日志记录辅助监控和调试；数据验证保障数据准确完整。

### 作业②：股票相关信息爬取与存储

#### 1. 作业代码
```python
import requests
import re
import sqlite3

# 用get方法访问服务器并提取页面数据
def getHtml(page, cmd, timeout=10):
    """Request one page of the stock list and extract the raw "diff" payload.

    Args:
        page: Page number to request.
        cmd: Query-string fragment that is spliced into the request URL
            unchanged.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with the text found between the brackets of every
        ``"diff":[...]`` occurrence in the response body; the list is empty
        when the response contains no such array.
    """
    # The requested page goes into `pn`; `np` is kept at the constant 1 the
    # URL was captured with. Previously `page` was written into `np`, so `pn`
    # stayed at 1 for every call.
    # NOTE(review): parameter meanings are inferred from the captured URL -
    # confirm against the live API.
    url = ("http://66.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409097606620255823_1696662149317&pn=" + str(page) +
           "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&" + cmd +
           "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696662149318")
    r = requests.get(url, timeout=timeout)
    # Non-greedy capture of everything between the brackets of "diff":[...].
    pat = r'"diff":\[(.*?)\]'
    return re.findall(pat, r.text, re.S)

def _to_number(raw, cast, default):
    """Convert *raw* with *cast*, returning *default* when it is not numeric."""
    try:
        return cast(raw)
    except (TypeError, ValueError):
        # Covers None and non-numeric strings (for example a "-" placeholder,
        # if the API sends one).
        return default


def _parse_stock_items(diff_text):
    """Decode the comma-separated JSON objects captured from the "diff" array."""
    import json

    # getHtml returns the array contents without the enclosing brackets.
    return json.loads("[" + diff_text + "]")


def getOnePageStock(cmd, page):
    """Fetch one page of stock quotes, store them in SQLite and print the table.

    Args:
        cmd: Query-string fragment forwarded to ``getHtml``.
        page: Page number forwarded to ``getHtml``.

    The quotes are appended to the ``stock_info`` table of ``stock_data.db``
    (created on first use); afterwards every row of that table is printed,
    tab-separated, below a line with the column names. Nothing is stored
    when the response contains no "diff" array.

    Raises:
        ValueError: If the captured "diff" text is not valid JSON.
    """
    data = getHtml(page, cmd)
    if not data:
        print("No stock data found in the response")
        return

    # Pick the fields that are stored, in the order of the INSERT below.
    rows = [
        (
            item.get('f12', 'N/A'),                 # stock_code
            item.get('f14', 'N/A'),                 # stock_name
            _to_number(item.get('f2'), float, 0.0),  # stock_price
            _to_number(item.get('f3'), float, 0.0),  # price_change_percent
            _to_number(item.get('f4'), float, 0.0),  # price_change
            _to_number(item.get('f5'), int, 0),      # volume
            _to_number(item.get('f6'), float, 0.0),  # turnover
            _to_number(item.get('f7'), float, 0.0),  # amplitude
            _to_number(item.get('f15'), float, 0.0),  # highest
            _to_number(item.get('f16'), float, 0.0),  # lowest
            _to_number(item.get('f17'), float, 0.0),  # open_price
            _to_number(item.get('f18'), float, 0.0),  # last_close
        )
        for item in _parse_stock_items(data[0])
    ]

    # Connecting creates the database file if it does not exist yet.
    conn = sqlite3.connect('stock_data.db')
    try:
        cursor = conn.cursor()

        cursor.execute('''CREATE TABLE IF NOT EXISTS stock_info (
                        id INTEGER PRIMARY KEY,
                        stock_code TEXT,
                        stock_name TEXT,
                        stock_price REAL,
                        price_change REAL,
                        price_change_percent REAL,
                        volume INTEGER,
                        turnover REAL,
                        amplitude REAL,
                        highest REAL,
                        lowest REAL,
                        open_price REAL,
                        last_close REAL
                    )''')

        cursor.executemany(
            "INSERT INTO stock_info (stock_code, stock_name, stock_price, price_change_percent, price_change, volume, turnover, amplitude, highest, lowest, open_price, last_close) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
            rows)
        conn.commit()

        # Read back everything stored so far and print it with a header.
        cursor.execute("SELECT * FROM stock_info")
        stocks = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]

        print("\t".join(columns))
        for stock in stocks:
            print("\t".join(map(str, stock)))
    finally:
        # Close the connection even when parsing or SQL fails.
        conn.close()

# Page number forwarded to getHtml.
page = 1

# Fetch, store and print one page of quotes; the first argument is a
# query-string fragment that getHtml splices into the request URL unchanged.
getOnePageStock("fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048", page)
```

#### 2. 心得体会
体会到网络数据抓取和解析的复杂性与挑战性。学会通过浏览器F12调试工具监控网络请求以识别API端点；明白要仔细处理API返回的JSON数据中结构化与非结构化部分，利用正则表达式和字符串操作提取关键信息；认识到请求参数选择对获取正确数据的重要性，可按需定制数据抓取任务来提高效率和准确性。

### 作业③：中国大学2021主榜院校信息爬取与存储

#### 1. 作业代码
```python
import urllib.request
from bs4 import BeautifulSoup
import sqlite3

# 目标网页URL
your_url = 'https://www.shanghairanking.cn/rankings/bcur/2021'

# Download the page; the context manager closes the HTTP response once read.
with urllib.request.urlopen(your_url) as response:
    html = response.read()

# Parse the HTML and locate the table that holds the ranking rows.
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'rk-table'})
if table is None:
    # Fail with a clear message instead of an AttributeError further down.
    raise SystemExit("Ranking table (class 'rk-table') not found in the page")

# Collect the first five cells of each data row:
# rank, school name, province/city, school type, total score.
ranking_rows = []
for row in table.find_all('tr')[1:]:   # skip the header row
    cols = row.find_all('td')
    if len(cols) < 5:
        # Ignore rows that do not carry a full ranking record.
        continue
    ranking_rows.append(tuple(col.text.strip() for col in cols[:5]))

# Connecting creates the database file if it does not exist yet.
conn = sqlite3.connect('schools_rank.db')
try:
    cursor = conn.cursor()

    cursor.execute('''CREATE TABLE IF NOT EXISTS university_ranking
                    (rank TEXT, school_name TEXT, province_city TEXT, school_type TEXT, total_score TEXT)''')

    cursor.executemany('''INSERT INTO university_ranking (rank, school_name, province_city, school_type, total_score)
                       VALUES (?,?,?,?,?)''', ranking_rows)
    conn.commit()

    # Read back every stored record and print it without embedded newlines.
    cursor.execute("SELECT * FROM university_ranking")
    all_records = cursor.fetchall()
    for record in all_records:
        cleaned_record = tuple(field.replace('\n', '') for field in record)
        print(cleaned_record)
finally:
    # Close the connection even when one of the steps above fails.
    conn.close()

print("大学排名数据已保存到数据库")
```

#### 2. 心得体会
体会到网络数据抓取和解析的复杂性与挑战性。学会分析目标网站网络请求以识别API端点来理解数据来源和结构；明白要用`BeautifulSoup`库解析HTML内容提取关键信息；认识到将数据存储到数据库实现数据持久化的重要性，便于后续数据分析和应用。

张诚坤/数据采集融合

内容风险标识

评论 (0)

张诚坤/数据采集融合

内容风险标识