1 Star 0 Fork 0

王绮雯 / WebScraping_2

Create your Gitee Account
Explore and code with more than 6 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Without the author's permission, this code is for learning only and cannot be used for other purposes.
Clone or Download
work3.py 3.10 KB
Copy Edit Web IDE Raw Blame History
王绮雯 authored 2021-10-13 10:11 . 第二次实验-作业3
import sqlite3
import requests
import re
import pandas as pd
import urllib.request
from bs4 import UnicodeDammit, BeautifulSoup
# HTTP request headers attached to the page request built in getHtml.
# The User-Agent value is a desktop Chrome browser identification string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
}
# Convert a half-width character to a full-width character
# (based on classmate Lin's approach)
def chartrans(i):
    """Return the full-width counterpart of the single character *i*.

    Printable ASCII characters (U+0021 to U+007E) are shifted by 65248
    (0xFEE0), which lands them on the matching full-width form.  The ASCII
    space has no counterpart at that offset, so it is mapped to the
    ideographic space U+3000 instead.  Every other character (for example
    one that is already full-width, or CJK text) is returned unchanged
    rather than being shifted to an unrelated code point.

    Args:
        i: A string of length one.

    Returns:
        A string of length one.

    Raises:
        TypeError: If *i* is not a single character (raised by ``ord``).
    """
    code = ord(i)
    if code == 0x20:
        return chr(0x3000)
    if 0x21 <= code <= 0x7E:
        return chr(code + 65248)
    return i
# Convert a half-width string to a full-width string
# (based on classmate Lin's approach)
def strtrans(s):
    """Return ``str(s)`` with every character passed through ``chartrans``.

    Args:
        s: Any object; it is converted with ``str`` before conversion.

    Returns:
        The converted string, one output character per input character.
    """
    return ''.join(chartrans(ch) for ch in str(s))
# Fetch the page from the server using urllib.request
def getHtml(url="https://www.shanghairanking.cn/rankings/bcur/2021", timeout=30):
    """Download a page and return its markup as text.

    Args:
        url: Address to fetch.  Defaults to the 2021 ranking page that this
            script was written for.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        The ``unicode_markup`` produced by ``UnicodeDammit`` for the
        response body, with "utf-8" and "gbk" supplied as the encodings to
        try.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by reading
        the response is propagated to the caller.
    """
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
    return dammit.unicode_markup
# Extract the ranking data
def parsePage(html):
    """Parse the ranking table rows out of *html*.

    Every ``<tr>`` except the first is inspected (the first is presumably
    the table header; verify against the page).  For each row the rank is
    read from the ``<div>`` in the first cell, the school name from the
    ``<a>`` in the second cell, and the total score from the fifth cell.

    Rows that lack the expected cells or tags are reported with ``print``
    and skipped, so one malformed row does not abort the parse.

    Args:
        html: Markup of the page, as text.

    Returns:
        A list of ``[rank, name, score]`` lists of stripped strings.
    """
    datas = []  # collected [rank, name, score] records
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find_all("tr")[1:]:
        td = tr.find_all("td")
        try:
            rank = td[0].find('div').text.strip()
            name = td[1].find('a').text.strip()  # school name
            score = td[4].text.strip()  # total score
        except (IndexError, AttributeError) as err:
            # IndexError: too few cells; AttributeError: find() returned None.
            print(err)
            continue
        datas.append([rank, name, score])
    return datas
# Print the collected data in a fixed layout
def printList(list):
    """Print a header line followed by one line per record.

    The ideographic space ``chr(12288)`` is supplied as format argument 3
    and used as the fill character of every column.  The rank and score
    are converted to full-width text with ``strtrans`` before printing, so
    padding with a full-width space keeps the columns in step with the
    full-width and CJK content.

    Args:
        list: Iterable of records; items 0, 1 and 2 of each record are the
            rank, the school name and the total score.
    """
    tplt = "{0:{3}^4}\t{1:{3}^10}\t{2:{3}^6}"
    fill = chr(12288)
    print(tplt.format("排名", "学校", "总分", fill))
    for data in list:
        print(tplt.format(strtrans(data[0]), data[1], strtrans(data[2]), fill))
# ----------- Save the data to the database -----------
def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the ``rank`` table.

    ``init_db`` is called first to prepare the database file.  The rows
    are then inserted with SQL placeholders, so values containing quote
    characters are stored verbatim and the caller's records are left
    unmodified.  All rows are written in a single transaction: if any
    insert fails, nothing from this call is committed.

    Args:
        datalist: Iterable of three-item records ``(id, name, score)``.
        dbpath: Path of the SQLite database file.

    Raises:
        sqlite3.Error: If the database cannot be opened or a row cannot be
            inserted.  The connection is closed before the error
            propagates.
    """
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(
            "insert into rank(id,name,score) values(?,?,?)",
            datalist,
        )
        conn.commit()
    finally:
        # Always release the file handle, even when an insert fails.
        conn.close()
    print("保存到数据库成功!")
# Initialize the database
def init_db(dbpath):
    """Create the ``rank`` table in the database at *dbpath* if needed.

    The table has three text columns: ``id``, ``name`` and ``score``.
    ``if not exists`` makes the call safe to repeat, so running the script
    a second time against the same file does not fail on table creation.

    Args:
        dbpath: Path of the SQLite database file; ``sqlite3.connect``
            creates the file when it does not exist yet.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.  The connection is closed before the error propagates.
    """
    sql = '''
        create table if not exists rank
        (
            id text,
            name text,
            score text
        );
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        conn.close()
def main(dbpath=None):
    """Run the whole pipeline: fetch, parse, print, then store.

    Args:
        dbpath: Path of the SQLite database file to write.  When omitted,
            the original hard-coded location is used, which presumably
            only exists on the author's machine; pass a path explicitly
            when running elsewhere.
    """
    if dbpath is None:
        dbpath = r'D:\wqwDownload\PyCharm Community Edition 2021.2.2\PycharmProjects\wqwcodes\爬虫实践课\第二次作业\rank.db'  # database path
    html = getHtml()  # fetch the ranking page
    rows = parsePage(html)  # parse the page into records
    printList(rows)  # print the records in the fixed layout
    saveData2DB(rows, dbpath)  # store the records in the database


if __name__ == "__main__":
    main()

Comment ( 0 )

Sign in to post a comment

1
https://gitee.com/penguin02/web-scraping_2.git
git@gitee.com:penguin02/web-scraping_2.git
penguin02
web-scraping_2
WebScraping_2
master

Search