# (Web-page notice captured with the code: "Code pull complete; the page will refresh automatically.")
import sqlite3
import unicodedata

import requests
from bs4 import BeautifulSoup
def get_display_length(s):
    """Return the number of terminal columns needed to display ``s``.

    Characters whose Unicode East Asian Width property is Wide (``W``) or
    Fullwidth (``F``) are counted as two columns; every other character is
    counted as one. This covers the CJK ideographs as well as full-width
    punctuation such as ``（`` and ``）``, which would otherwise be
    under-counted and break column alignment.

    Args:
        s: The string to measure.

    Returns:
        The display width of ``s`` as an int (0 for an empty string).
    """
    return sum(
        2 if unicodedata.east_asian_width(char) in ('W', 'F') else 1
        for char in s
    )
def format_str(s, target_length):
    """Right-pad ``s`` with spaces up to ``target_length`` display columns.

    The current width is measured with ``get_display_length``. When ``s`` is
    already at least ``target_length`` wide the pad count is zero or negative,
    which produces an empty pad, so ``s`` is returned unchanged.

    Args:
        s: The text to pad.
        target_length: Desired display width in columns.

    Returns:
        ``s`` followed by the spaces needed to reach ``target_length``.
    """
    pad_count = target_length - get_display_length(s)
    padding = ' ' * pad_count
    return s + padding
def insert_into_database(data, db_path='universities.db'):
    """Store scraped university rows in a SQLite database.

    Creates the ``universities`` table if it does not exist, inserts every
    row of ``data`` and commits. Any ``sqlite3.Error`` is reported on stdout
    instead of being raised.

    Args:
        data: Iterable of 5-item rows in the order
            ``(rank, name, province, school_type, score)``.
        db_path: Path of the SQLite database file. Defaults to
            ``'universities.db'``.
    """
    # Start as None so the ``finally`` clause is safe even when
    # ``sqlite3.connect`` itself fails and no connection was ever created.
    conn = None
    try:
        print("正在将数据插入到数据库...")
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        # Create the table on first use.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS universities (
                rank TEXT,
                name TEXT,
                province TEXT,
                school_type TEXT,
                score TEXT
            )
        ''')
        cursor.executemany('INSERT INTO universities VALUES (?, ?, ?, ?, ?)', data)
        conn.commit()
        print("数据插入成功!")
    except sqlite3.Error as e:
        print(f"数据库操作失败: {e}")
    finally:
        if conn is not None:
            conn.close()
def scrape_data(url='http://www.shanghairanking.cn/rankings/bcur/2020', timeout=10):
    """Download the ranking page, print its table and return the rows.

    The first ``<table>`` in the page is read. Its first ``<tr>`` is treated
    as the header and skipped. Each remaining row is printed as an aligned
    line and collected for later insertion into the database.

    Args:
        url: Page to download. Defaults to the URL this script was written
            for.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the script.

    Returns:
        A list of ``(rank, school_name, province, school_type, score)`` string
        tuples. The list is empty when the request fails or the page contains
        no table.
    """
    rank_width = 6
    name_width = 30
    province_width = 10
    type_width = 8
    score_width = 8

    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as request failures instead of trying to
        # parse an error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        html = response.text
    except requests.RequestException as e:
        print(f"网页请求失败: {e}")
        return []

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        # ``find`` returns None when the page has no <table>.
        print("未在页面中找到排名表格。")
        return []

    # Pad the header by display width, like the data rows, so the columns
    # line up even though the titles are double-width characters.
    print(
        f"{format_str('排名', rank_width)} "
        f"{format_str('学校名称', name_width)} "
        f"{format_str('省市', province_width)} "
        f"{format_str('学校类型', type_width)} "
        f"{format_str('总分', score_width)}"
    )

    university_data = []
    # Extract the fields from every row after the header row.
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 5:
            # Not a complete data row; skip it rather than fail on indexing.
            continue
        rank = cols[0].text.strip()
        school_name_tag = cols[1].find('span', class_='name-cn')
        school_name = school_name_tag.text.strip() if school_name_tag else '未知'
        province = cols[2].text.strip()
        school_type = cols[3].text.strip()
        score = cols[4].text.strip()
        print(
            f"{format_str(rank, rank_width)} "
            f"{format_str(school_name, name_width)} "
            f"{format_str(province, province_width)} "
            f"{format_str(school_type, type_width)} "
            f"{format_str(score, score_width)}"
        )
        # Keep the row so the caller can insert it into the database.
        university_data.append((rank, school_name, province, school_type, score))
    return university_data
if __name__ == "__main__":
    # Scrape first; the database is only touched when rows came back.
    data = scrape_data()
    if not data:
        print("没有数据可插入到数据库。")
    else:
        insert_into_database(data)
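# (Web-page notice captured with the code: "This section may contain content that is unsuitable for display, so the page does not show it. You can review and revise it using the relevant editing features.")
# (Web-page notice, continued: "If you confirm the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, or content that violates national laws and regulations, you can click submit to appeal and we will handle it as soon as possible.")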