4 Star 1 Fork 0

江筝 / 2023数据采集与融合

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
MOOC.py 2.61 KB
一键复制 编辑 原始数据 按行查看 历史
江筝 提交于 2023-11-02 13:47 . 实践4
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.selector import Selector
import time
from sqlalchemy import create_engine
# Launch a Chrome session, log in to icourse163.org, search for "python"
# courses, and capture the rendered result page HTML into `content`.
browser = webdriver.Chrome()
url = 'https://www.icourse163.org/'
browser.get(url)
time.sleep(2)
# Log in: open the login dialog, then switch into the first iframe on the
# page, where the phone/password form lives.
denglu=browser.find_element(By.XPATH,'//*[@id="app"]/div/div/div[1]/div[3]/div[3]/div')
denglu.click()
time.sleep(3)
browser.switch_to.default_content()
browser.switch_to.frame(browser.find_elements(By.TAG_NAME,'iframe')[0])
phone=browser.find_element(By.XPATH,'/html/body/div[2]/div[2]/div[2]/form/div/div[2]/div[2]/input')
phone.clear()
# NOTE(review): phone number and password are hard-coded personal
# credentials — move them to environment variables or a config file
# before sharing this script.
phone.send_keys("18805952874")
time.sleep(3)
password=browser.find_element(By.XPATH,'/html/body/div[2]/div[2]/div[2]/form/div/div[4]/div[2]/input[2]')
password.clear()
password.send_keys("021022jz.")
deng=browser.find_element(By.XPATH,'//*[@id="submitBtn"]')
deng.click()
time.sleep(5)
# Back to the top-level document: type the search keyword into the search
# box and click the search button.
browser.switch_to.default_content()
select_course=browser.find_element(By.XPATH,'/html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[1]/div/div/div/div/div/div/input')
select_course.send_keys("python")
dianji=browser.find_element(By.XPATH,'//html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[2]/span')
dianji.click()
time.sleep(3)
# Snapshot the fully rendered page for offline parsing, then quit the
# browser — the remaining work needs only this HTML string.
content = browser.page_source
print(content)
# Quit the browser session.
browser.quit()
# Parse the captured search-result HTML and persist one row per course
# card into the MySQL table `mooccourse`.
selector = Selector(text=content)
# Each result card sits under the m-course-list container.
rows = selector.xpath("//div[@class='m-course-list']/div/div")
data = []
for row in rows:
    # Course name may be split across several text nodes — join them.
    course_string = "".join(
        row.xpath(".//span[@class=' u-course-name f-thide']//text()").extract())
    school = row.xpath(".//a[@class='t21 f-fc9']/text()").extract_first()
    # First matching anchor is the lead teacher; the full list is the team.
    teacher = row.xpath(".//a[@class='f-fc9']//text()").extract_first()
    team_string = ",".join(row.xpath(".//a[@class='f-fc9']//text()").extract())
    number = row.xpath(".//span[@class='hot']/text()").extract_first()
    # Renamed from `time`: the original name shadowed the imported
    # `time` module.
    schedule = row.xpath(".//span[@class='txt']/text()").extract_first()
    # Brief/description text nodes joined with commas, as in the original.
    jianjie_string = ",".join(
        row.xpath(".//span[@class='p5 brief f-ib f-f0 f-cb']//text()").extract())
    data.append([course_string, school, teacher, team_string,
                 number, schedule, jianjie_string])
df = pd.DataFrame(data=data,
                  columns=['course', 'school', 'teacher', 'team',
                           'number', 'time', 'jianjie'])
print(df)
# NOTE(review): DB credentials are hard-coded — move to config/env.
engine = create_engine("mysql+mysqlconnector://root:021022@127.0.0.1:3306/mybatis")
# Replace the table on every run so repeated scrapes stay idempotent.
df.to_sql("mooccourse", engine, if_exists="replace", index=False)
Python
1
https://gitee.com/jiang-zheng-jiang-zhenng/crawl_project.git
git@gitee.com:jiang-zheng-jiang-zhenng/crawl_project.git
jiang-zheng-jiang-zhenng
crawl_project
2023数据采集与融合
master

搜索帮助