1 Star 0 Fork 0

傅钰/数据采集

加入 Gitee
与超过 1400万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
rank.py 2.82 KB
一键复制 编辑 原始数据 按行查看 历史
傅钰 提交于 2024-10-17 17:59 +08:00 . 四个实验
import requests
from bs4 import BeautifulSoup
import sqlite3
def get_display_length(s):
    """Return the display width of *s* in terminal columns.

    CJK Unified Ideographs (U+4E00..U+9FFF) render two columns wide in
    monospaced fonts, so they count as 2; every other character counts as 1.
    """
    # Idiomatic sum over a generator replaces the manual accumulator loop
    # (the original also ended with a stray C-style semicolon).
    return sum(2 if '\u4e00' <= char <= '\u9fff' else 1 for char in s)
def format_str(s, target_length):
    """Right-pad *s* with spaces until its display width is *target_length*.

    Uses get_display_length so double-width CJK characters are accounted
    for; if *s* is already wider, it is returned unchanged (no truncation).
    """
    padding = target_length - get_display_length(s)
    return s + ' ' * padding
def insert_into_database(data):
    """Persist university ranking rows into a local SQLite database.

    Parameters:
        data: iterable of 5-tuples (rank, name, province, school_type, score),
              all stored as TEXT in table ``universities`` of ``universities.db``.

    Database errors are caught and reported, not propagated.
    """
    # Bug fix: initialize conn so the `finally` block does not raise a
    # NameError (masking the real error) when sqlite3.connect() itself fails.
    conn = None
    try:
        print("正在将数据插入到数据库...")
        conn = sqlite3.connect('universities.db')
        cursor = conn.cursor()
        # 创建表格
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS universities (
            rank TEXT,
            name TEXT,
            province TEXT,
            school_type TEXT,
            score TEXT
        )
        ''')
        cursor.executemany('INSERT INTO universities VALUES (?, ?, ?, ?, ?)', data)
        conn.commit()
        print("数据插入成功!")
    except sqlite3.Error as e:
        print(f"数据库操作失败: {e}")
    finally:
        if conn is not None:
            conn.close()
def scrape_data():
    """Scrape the 2020 ShanghaiRanking (软科) Best Chinese Universities table.

    Prints an aligned table of (rank, name, province, type, score) to stdout
    and returns the same rows as a list of 5-tuples of strings.
    Returns an empty list on any request failure or unexpected page layout.
    """
    url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
    try:
        # Fix: a timeout prevents the script from hanging forever on a dead
        # server, and raise_for_status() turns HTTP error pages (404/500/...)
        # into RequestException instead of silently parsing them.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            # Page layout changed or an unexpected document came back;
            # the original would have crashed with AttributeError here.
            print("网页请求失败: 未找到排名表格")
            return []
        rows = table.find_all('tr')
        # Column display widths (CJK-aware padding via format_str).
        rank_width = 6
        name_width = 30
        province_width = 10
        type_width = 8
        score_width = 8
        print(f"{'排名':<{rank_width}} {'学校名称':<{name_width}} {'省市':<{province_width}} {'学校类型':<{type_width}} {'总分':<{score_width}}")
        university_data = []
        # 从表格中提取信息
        for row in rows[1:]:  # 忽略第一行(表头)
            cols = row.find_all('td')
            if len(cols) < 5:
                # Skip malformed/short rows instead of raising IndexError.
                continue
            rank = cols[0].text.strip()
            school_name_tag = cols[1].find('span', class_='name-cn')
            school_name = school_name_tag.text.strip() if school_name_tag else '未知'
            province = cols[2].text.strip()
            school_type = cols[3].text.strip()
            score = cols[4].text.strip()
            print(f"{rank:<{rank_width}} {format_str(school_name, name_width)} {format_str(province, province_width)} {format_str(school_type, type_width)} {score:<{score_width}}")
            # 将数据存入列表,稍后插入数据库
            university_data.append((rank, school_name, province, school_type, score))
        return university_data
    except requests.RequestException as e:
        print(f"网页请求失败: {e}")
        return []
if __name__ == "__main__":
    # Scrape first; only touch the database when something came back.
    scraped = scrape_data()
    if not scraped:
        print("没有数据可插入到数据库。")
    else:
        insert_into_database(scraped)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/see-no-green-hills/data-acquisition.git
git@gitee.com:see-no-green-hills/data-acquisition.git
see-no-green-hills
data-acquisition
数据采集
master

搜索帮助