1 Star 0 Fork 1

WeFamily / python-noaa-data-use-selenium

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
noaa_webdriver.py 3.04 KB
一键复制 编辑 原始数据 按行查看 历史
B.K. 提交于 2019-11-18 20:22 . update new method
# coding=utf-8
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Parameter: the datum type to scrape (also used as the output file name)
type = "MSL"
# Page to scrape
main_url = 'https://tidesandcurrents.noaa.gov/datums.html?datum=' + type + '&units=0&epoch=0'
# Path to the local chromedriver binary
path = '/usr/bin/chromedriver'
# Create and initialise the webdriver
browser = webdriver.Chrome(executable_path=path)
# Explicit-wait helper used while pages load (timeout argument is 10)
wait = WebDriverWait(browser, 10)
# Final output data: parallel lists of values and their labels
data_value = []
data_name = []
# Close the child tab
def close_current_page():
    """Close the currently focused tab and return focus to ``handles[0]``.

    ``handles`` is the module-level list of window handles captured right
    after the initial page load, so its first entry is the window that was
    open before any child tab was created.
    """
    browser.close()
    # ``switch_to_window`` is deprecated and no longer exists in Selenium 4;
    # ``switch_to.window`` is the supported spelling in Selenium 3 and 4.
    browser.switch_to.window(handles[0])
# Save the data to a local file
def text_save(content, filename, mode='w+'):
    """Write name/value pairs to ``<filename>.txt`` as ``name;value`` lines.

    Args:
        content: Two-item sequence ``[names, values]``. ``names`` holds
            strings; each value is rendered with ``str()``. Items are paired
            positionally and pairing stops at the shorter of the two lists.
        filename: Output path without the ``.txt`` extension.
        mode: Mode passed to ``open()``. The default ``'w+'`` truncates any
            existing file.
    """
    d_name, d_value = content[0], content[1]
    # The context manager closes the file even if a write fails part-way.
    with open(filename + '.txt', mode) as out:
        out.writelines(
            name + ';' + str(value) + '\n'
            for name, value in zip(d_name, d_value)
        )
# Record the datum value
def get_msl(name):
    """Scrape the datum value from the current tab and record it as *name*.

    Reads ``browser.page_source``, takes the text between ``<type>:`` and the
    next ``</tspan>``, appends the label and value to ``data_name`` and
    ``data_value``, pauses briefly, then closes the tab through
    ``close_current_page()``.

    A blank value is recorded as ``0.0``. A missing label or non-numeric text
    is also recorded as ``0.0`` (with a printed note) instead of raising, so
    one bad station page does not abort the whole run.

    Args:
        name: Label stored alongside the value.
    """
    sub_msl = browser.page_source
    # Escape the datum name so it is always matched literally.
    found = re.findall(re.escape(type) + ':(.*?)</tspan>', sub_msl)
    raw = found[0].strip() if found else ''
    try:
        msl = float(raw) if raw else 0.0
    except ValueError:
        print('Unparseable ' + type + ' value for ' + name + ': ' + repr(raw))
        msl = 0.0
    if not found:
        print('No ' + type + ' value found for ' + name)
    data_name.append(name)
    data_value.append(msl)
    print('Records(' + str(len(data_value)) + '):' + name + ' =====> ' + str(msl))
    # Pause for a moment before closing the tab
    time.sleep(3)
    close_current_page()
# Fetch the datum value for one station
def get_msl_by_url(id, name, state, url):
    """Open *url* in a new tab, wait for it to load, then record its datum.

    Args:
        id: Station id string.
        name: Station name.
        state: Station state code.
        url: Address of the station's datum page.

    The record label passed on to ``get_msl`` is ``"<id>-<name>-<state>"``.
    """
    # Pass the URL as a script argument instead of splicing it into the
    # JavaScript source, so quotes in a station name cannot break the script.
    browser.execute_script('window.open(arguments[0])', url)
    # The tab that was just opened is the last handle in the list.
    browser.switch_to.window(browser.window_handles[-1])
    name = id + '-' + name + '-' + state
    # Wait first, then scrape. Passing ``get_msl(name)`` as an argument to
    # ``wait.until`` ran the scrape *before* the wait even started.
    wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
    get_msl(name)
# Collect the data for every station
def _split_station_option(item):
    """Split one ``<option>`` value into ``(id, name, state)``.

    The value is presumably ``"<id> <name>, <state>"`` (inferred from the
    slicing logic; verify against the live page). The id is the leading
    number, one separator character is skipped, and the text after the last
    comma is the state. Returns ``None`` when the value has no leading number.
    """
    id_match = re.match(r'\d+\.?\d*', item)
    if id_match is None:
        return None
    station_id = id_match.group(0)
    # Skip the single separator character that follows the id.
    rest = item[len(station_id) + 1:]
    name, _, state = rest.rpartition(',')
    return station_id, name, state


def get_data():
    """Visit every station listed on the current page and save the results.

    Parses the ``stationselect`` drop-down out of ``browser.page_source``,
    opens each station's datum page through ``get_msl_by_url`` and finally
    writes ``data_name`` / ``data_value`` to ``<type>.txt`` via ``text_save``.
    The file is written even when a station fails part-way through, so
    records collected up to that point are not lost.

    Raises:
        IndexError: If the station drop-down is not found in the page source.
    """
    html = browser.page_source
    # Raw strings keep ``\s`` / ``\d`` as regex escapes, not string escapes.
    selects = re.findall(
        r'<select id="stationselect"(.+?)class="chzn-done" '
        r'style="display: none;">(.+?)</select>',
        html,
    )
    if not selects:
        raise IndexError(
            'station list (<select id="stationselect">) not found in page source'
        )
    options = re.findall(r'<option\svalue="(.+?)".*?>(.+?)</option>', selects[0][1])
    # options = options[0:5]  # limit to 5 entries for a test run
    try:
        for value, _label in options:
            parsed = _split_station_option(value)
            if parsed is None:
                # No station id in this entry, so there is nothing to fetch.
                continue
            station_id, name, state = parsed
            state = state.strip()
            sub_url = main_url + '&id=' + station_id + '&name=' + name + '&state=' + state
            # Open this address in a new page, read the datum value, record it
            get_msl_by_url(station_id, name, state, sub_url)
    finally:
        text_save([data_name, data_value], type)
# Initialisation: load a starting page
init_url = main_url + '&id=1611347&name=PORT+ALLEN%2C+HANAPEPE+BAY%2C+KAUAI+ISLAND&state=HI'
browser.get(init_url)
# Remember the handles now; close_current_page() returns focus to handles[0]
handles = browser.window_handles
# Wait for the station drop-down before scraping. Passing ``get_data()`` as an
# argument to ``wait.until`` ran the whole crawl before the wait even started.
wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
get_data()
Python
1
https://gitee.com/wefamily/python-noaa-data-use-selenium.git
git@gitee.com:wefamily/python-noaa-data-use-selenium.git
wefamily
python-noaa-data-use-selenium
python-noaa-data-use-selenium
master

搜索帮助