python-noaa-data-use-selenium
/
noaa_webdriver.py

# coding=utf-8
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Parameter: the datum type to scrape (also used as the output file name).
# NOTE(review): this name shadows the built-in ``type``; it is kept because
# the functions in this file read it as a module-level global.
type = "MSL"
# Page to scrape; the datum type is passed in the ``datum`` query parameter.
main_url = 'https://tidesandcurrents.noaa.gov/datums.html?datum=' + type + '&units=0&epoch=0'
# Path to the local chromedriver binary.
path = '/usr/bin/chromedriver'
# Create and initialise the webdriver.
# NOTE(review): ``executable_path`` is deprecated in Selenium 4 -- confirm the
# installed Selenium version before upgrading.
browser = webdriver.Chrome(executable_path=path)
# Explicit-wait helper used while pages load (10 is passed as the timeout).
wait = WebDriverWait(browser, 10)
# Final output data: parallel lists, data_name[i] labels data_value[i].
data_value = []
data_name = []


# Close the child tab
def close_current_page():
    """Close the currently focused tab and return focus to the first window.

    Uses the module-level ``browser`` and ``handles``; ``handles`` is the
    list of window handles captured right after the initial page was loaded,
    so its first entry is the window the script started in.
    """
    browser.close()
    # ``switch_to_window`` is deprecated and no longer exists in Selenium 4;
    # ``switch_to.window`` is the supported spelling and works on both.
    browser.switch_to.window(handles[0])


# Save the data to a local file
def text_save(content, filename, mode='w+'):
    """Write name/value pairs to ``<filename>.txt``, one ``name;value`` per line.

    Args:
        content: Two-item sequence ``[names, values]``. ``names`` holds
            strings; each entry of ``values`` is converted with ``str``.
        filename: Output path without the ``.txt`` suffix.
        mode: File mode handed to ``open`` (defaults to ``'w+'``).

    Raises:
        IndexError: If ``values`` is shorter than ``names``.
    """
    names = content[0]
    values = content[1]
    # ``with`` guarantees the handle is closed even when a write fails.
    with open(filename + '.txt', mode) as out:
        # Index into ``values`` (rather than zip) so a length mismatch still
        # raises instead of silently dropping records.
        for index, name in enumerate(names):
            out.write(name + ';' + str(values[index]) + '\n')


# Record the data
def get_msl(name):
    """Scrape the datum value from the current tab and record it under *name*.

    Searches the page source for ``<type>:<value></tspan>`` (``type`` being
    the module-level datum name), appends *name* to ``data_name`` and the
    parsed float to ``data_value``, prints a progress line, pauses, and then
    closes the tab via ``close_current_page``.

    A blank value is recorded as ``0.0``. A page with no match at all is
    handled the same way, with a warning printed, instead of failing with a
    bare ``IndexError``.

    Args:
        name: Label stored alongside the value.

    Raises:
        ValueError: If the matched text is non-blank but not a valid float.
    """
    page = browser.page_source
    matches = re.findall(type + ':(.*?)</tspan>', page)
    if not matches:
        # Make the missing datum visible rather than silently defaulting.
        print('Warning: no ' + type + ' value found for ' + name + '; recording 0.0')
    raw = matches[0].strip() if matches else ''
    msl = float(raw) if raw else 0.0
    data_name.append(name)
    data_value.append(msl)
    print('Records(' + str(len(data_value)) + '):' + name + ' =====> ' + str(msl))
    # Pause for a moment before closing the tab
    time.sleep(3)
    close_current_page()


# Fetch the datum value for one station
def get_msl_by_url(id, name, state, url):
    """Open *url* in a new tab, wait for it to load, then record its datum.

    Args:
        id: Station id string.
        name: Station name string.
        state: Station state string.
        url: Address of the station page to open.

    The record is labelled ``"<id>-<name>-<state>"`` and stored by
    ``get_msl``, which also closes the tab again.

    Raises:
        selenium.common.exceptions.TimeoutException: If the station
            drop-down does not appear within the ``wait`` timeout.
    """
    # Hand the URL over as a script argument instead of splicing it into the
    # JavaScript source, so a quote in a station name cannot break the script.
    browser.execute_script('window.open(arguments[0])', url)
    # The newly opened tab is the last handle in the list.
    browser.switch_to.window(browser.window_handles[-1])
    label = id + '-' + name + '-' + state
    # Wait first, scrape second. Passing ``get_msl(...)`` as the second
    # argument of ``wait.until`` evaluated it *before* the wait started and
    # handed its ``None`` result over as the timeout message.
    # NOTE(review): the value is read from a ``<tspan>``; presence of the
    # drop-down may not guarantee that element is rendered yet -- confirm.
    wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
    get_msl(label)


# Collect the data
def get_data():
    """Scrape every station in the page's station drop-down and save the results.

    Reads the ``stationselect`` ``<select>`` from the current page source and
    extracts each ``<option>`` value. Each value is split into:

    * id    -- as many leading characters as the first number found is long;
    * name  -- the text after the id (one separator character is skipped)
      up to the last comma;
    * state -- the text after the last comma.

    Presumably the values look like ``"<id> <name>, <state>"`` -- verify
    against the live page. Every station page is visited through
    ``get_msl_by_url`` and the collected names/values are finally written
    with ``text_save`` to a file named after the datum type.

    Raises:
        IndexError: If the drop-down is not found in the page source, or an
            option value contains no number.
    """
    html = browser.page_source
    # Raw strings: ``\s``, ``\d`` and ``\.`` are regex escapes, not Python
    # string escapes (non-raw literals trigger invalid-escape warnings).
    select_blocks = re.findall(
        r'<select id="stationselect"(.+?)class="chzn-done" style="display: none;">(.+?)</select>',
        html)
    options = re.findall(r'<option\svalue="(.+?)".*?>(.+?)</option>', select_blocks[0][1])
    # options = options[0:5]  # limit to 5 entries for testing
    id_pattern = re.compile(r'\d+\.?\d*')  # compiled once, outside the loop
    for option in options:
        item = option[0]
        id_length = len(id_pattern.findall(item)[0])
        station_id = item[0:id_length]
        # Skip the single separator after the id, then split on the last
        # comma; without a comma the name is empty and everything is state.
        name, _, state = item[id_length + 1:].rpartition(',')
        state = state.strip()
        sub_url = main_url + '&id=' + station_id + '&name=' + name + '&state=' + state
        # Open this address in a new tab, wait for it, read the datum value
        # and record it
        get_msl_by_url(station_id, name, state, sub_url)

    text_save([data_name, data_value], type)


# Initialisation: load a starting page that contains the station drop-down
init_url = main_url + '&id=1611347&name=PORT+ALLEN%2C+HANAPEPE+BAY%2C+KAUAI+ISLAND&state=HI'
browser.get(init_url)
# Remember the handles of the initial window(s); close_current_page() switches
# back to handles[0] after each child tab is closed.
handles = browser.window_handles
# Wait for the drop-down first, then scrape. Passing ``get_data()`` as the
# second argument of ``wait.until`` ran the whole scrape before the wait began
# and handed its ``None`` result over as the timeout message.
wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
get_data()