# NOTE(review): this line was hosting-site boilerplate ("code pull complete,
# the page will refresh") accidentally saved into the file; commented out so
# the script parses.
# coding=utf-8
import re
import time
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Per-station pause in seconds; increase when the network is slow.
lower_network = 3
# Datum type to scrape (also used as the output file name).
# NOTE(review): `type` shadows the builtin; kept as-is because get_data()
# and the final text_save() call read this module-level name.
type = "MSL"
# units: 0 = Feet, 1 = Meters
units = 0
# epoch: 0 = Present, 1 = Superseded
epoch = 0
# Station-listing page to scrape.
main_url = 'https://tidesandcurrents.noaa.gov/datums.html?datum=' + type + '&units=' + str(units) + '&epoch=' + str(epoch)
# Local chromedriver path.
path = '/usr/bin/chromedriver'
# Create and initialize the webdriver.
# NOTE(review): `executable_path=` was removed in Selenium 4 — this call
# only works on Selenium 3.x; confirm the pinned selenium version.
browser = webdriver.Chrome(executable_path=path)
# Explicit wait used to block until the page has loaded (10 s timeout).
wait = WebDriverWait(browser, 10)
# Parallel accumulators for the scraped records (one entry per station).
data_id = []
data_value = []
data_name = []
data_state = []
data_urls = []
# Persist the scraped records to a local text file.
def text_save(content, filename, mode='w+'):
    """Write the parallel record lists in *content* to ``filename + '.txt'``.

    Each output line has the form ``id;name;state;value;url``.

    Parameters
    ----------
    content : list
        Five parallel lists: ``[ids, names, values, states, urls]``.
    filename : str
        Output file name without the ``.txt`` extension.
    mode : str
        File open mode (default ``'w+'``: truncate then write).
    """
    ids, names, values, states, urls = content
    # Context manager guarantees the handle is closed even if a write fails
    # (the original leaked it on error); explicit utf-8 keeps station names
    # portable across platforms.
    with open(filename + '.txt', mode, encoding='utf-8') as fh:
        # Field order matches the original output: id;name;state;value;url.
        for record in zip(ids, names, states, values, urls):
            fh.write(';'.join(record) + '\n')
# Scrape the datum value for every station listed on the current page.
def get_data():
    """Scrape one datum value per station and save all records to disk.

    Reads the already-loaded page from the module-level ``browser``, extracts
    every station from the hidden ``<select id="stationselect">`` element,
    fetches each station's sub-page with urllib, pulls out the ``type`` datum
    value, appends the fields to the module-level accumulator lists, and
    finally writes everything via ``text_save``.

    Side effects: network requests, mutation of the ``data_*`` globals,
    console output, and a ``type``.txt file on disk.
    """
    html = browser.page_source
    # Grab the hidden station <select> block, then each <option value="...">.
    selects = re.findall(r'<select id="stationselect"(.+?)class="chzn-done" style="display: none;">(.+?)</select>', html)
    options = re.findall(r'<option\svalue="(.+?)".*?>(.+?)</option>', selects[0][1])
    for option in options:
        starttime = time.time()
        item = option[0]
        # The option value is '<id> <name>, <state>': the leading digit run
        # is the station id, the trailing comma-separated token is the state.
        id_length = len(re.findall(r'\d+\.?\d*', item)[0])
        station_id = item[0:id_length]
        after_id_parts = item[id_length + 1:].split(',')
        state = after_id_parts[len(after_id_parts) - 1]
        total_length = len(item)
        state_length = len(state)
        name = item[id_length + 1:total_length - state_length - 1]
        sub_url = main_url + '&id=' + station_id + '&name=' + name + '&state=' + state.strip()
        sub_url = sub_url.replace(' ', '%20')
        # Fetch the station sub-page and extract the datum value.
        html1 = urllib.request.urlopen(sub_url).read()
        html1 = html1.decode('utf-8')
        msl = re.findall('<td><a href="/datum_options.html#' + type + '">' + type + '</a></td><td>(.*?)</td>', html1)[0]
        # BUG FIX: the original called `msl[0].strip()` and discarded the
        # result; strip in place so the empty-value checks below can fire.
        msl = msl.strip()
        # Normalize missing values to a numeric placeholder.
        if msl == '' or msl == '-':
            msl = '0.00'
        data_id.append(station_id)
        data_name.append(name)
        data_value.append(msl)
        data_state.append(state.strip())
        data_urls.append(sub_url)
        endtime = time.time()
        dtime = endtime - starttime
        print('Records(' + str(len(data_id)) + ')' + station_id + '&name=' + name + '&state=' + state.strip() + '===>' + msl + '【' + sub_url + '】' + 'Running time: %s Seconds'%dtime)
        # Pause between stations to avoid hammering the server.
        time.sleep(lower_network)
    text_save([data_id, data_name, data_value, data_state, data_urls], type)
# Bootstrap: load an initial station page so the station <select> exists,
# wait for it to appear, then run the scraper.
init_url = main_url + '&id=1611347&name=PORT+ALLEN%2C+HANAPEPE+BAY%2C+KAUAI+ISLAND&state=HI'
browser.get(init_url)
handles = browser.window_handles
# BUG FIX: the original passed `get_data()` as wait.until()'s second
# positional argument, which is the *message* parameter — so get_data()
# executed immediately, BEFORE the wait completed. Wait first, then scrape.
wait.until(EC.presence_of_element_located((By.ID, 'stationselect')))
get_data()
# NOTE(review): the two lines that followed were hosting-site moderation
# boilerplate ("this content may not be suitable for display ... you may
# appeal"), not part of the script; commented out so the file parses.