1 Star 0 Fork 0

WangHemeng/TensorFlow-Tutorials

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
weather.py 7.98 KB
一键复制 编辑 原始数据 按行查看 历史
Magnus 提交于 2020-04-12 14:17 . weather.py tiny fix for new Pandas version
########################################################################
#
# Functions for downloading and re-sampling weather-data
# for 5 cities in Denmark between 1980-2018.
#
# The raw data was obtained from:
#
# National Climatic Data Center (NCDC) in USA
# https://www7.ncdc.noaa.gov/CDO/cdoselect.cmd
#
# Note that the NCDC's database functionality may change soon, and
# that the CSV-file needed some manual editing before it could be read.
# See the function _convert_raw_data() below for inspiration if you
# want to convert a new data-file from NCDC's database.
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the desired storage directory in the data_dir variable.
# 2) Call maybe_download_and_extract() to download the data-set
# if it is not already located in the given data_dir.
# 3) Either call load_original_data() or load_resampled_data()
# to load the original or resampled data for use in your program.
#
# Format:
# The raw data-file from NCDC is not included in the downloaded archive,
# which instead contains a cleaned-up version of the raw data-file
# referred to as the "original data". This data has not yet been resampled.
# The original data-file is available as a pickled file for fast reloading
# with Pandas, and as a CSV-file for broad compatibility.
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################
import pandas as pd
import os
import download
########################################################################
# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
data_dir = "data/weather-denmark/"
# Full path for the pickled data-file. (Original data).
def path_original_data_pickle():
return os.path.join(data_dir, "weather-denmark.pkl")
# Full path for the comma-separated text-file. (Original data).
def path_original_data_csv():
return os.path.join(data_dir, "weather-denmark.csv")
# Full path for the resampled data as a pickled file.
def path_resampled_data_pickle():
return os.path.join(data_dir, "weather-denmark-resampled.pkl")
# URL for the data-set on the internet.
data_url = "https://github.com/Hvass-Labs/weather-denmark/raw/master/weather-denmark.tar.gz"
# List of the cities in this data-set. These are cities in Denmark.
cities = ['Aalborg', 'Aarhus', 'Esbjerg', 'Odense', 'Roskilde']
########################################################################
# Private helper-functions.
def _date_string(x):
"""Convert two integers to a string for the date and time."""
date = x[0] # Date. Example: 19801231
time = x[1] # Time. Example: 1230
return "{0}{1:04d}".format(date, time)
def _usaf_to_city(usaf):
"""
The raw data-file uses USAF-codes to identify weather-stations.
If you download another data-set from NCDC then you will have to
change this function to use the USAF-codes in your new data-file.
"""
table = \
{
60300: 'Aalborg',
60700: 'Aarhus',
60800: 'Esbjerg',
61200: 'Odense',
61700: 'Roskilde'
}
return table[usaf]
def _convert_raw_data(path):
"""
This converts a raw data-file obtained from the NCDC database.
This function may be useful as an inspiration if you want to
download another raw data-file from NCDC, but you will have
to modify this function to match the data you have downloaded.
Note that you may also have to manually edit the raw data-file,
e.g. because the header is not in a proper comma-separated format.
"""
# The raw CSV-file uses various markers for "not-available" (NA).
# (This is one of several oddities with NCDC's file-format.)
na_values = ['999', '999.0', '999.9', '9999.9']
# Use Pandas to load the comma-separated file.
# Note that you may have to manually edit the file's header
# to get this to load correctly.
df_raw = pd.read_csv(path, sep=',', header=1,
index_col=False, na_values=na_values)
# Create a new data-frame containing only the data
# we are interested in.
df = pd.DataFrame()
# Get the city-name / weather-station name from the USAF code.
df['City'] = df_raw['USAF '].apply(_usaf_to_city)
# Convert the integer date-time to a proper date-time object.
datestr = df_raw[['Date ', 'HrMn']].apply(_date_string, axis=1)
df['DateTime'] = pd.to_datetime(datestr, format='%Y%m%d%H%M')
# Get the data we are interested in.
df['Temp'] = df_raw['Temp ']
df['Pressure'] = df_raw['Slp ']
df['WindSpeed'] = df_raw['Spd ']
df['WindDir'] = df_raw['Dir']
# Set the city-name and date-time as the index.
df.set_index(['City', 'DateTime'], inplace=True)
# Save the new data-frame as a pickle for fast reloading.
df.to_pickle(path_original_data_pickle())
# Save the new data-frame as a CSV-file for general readability.
df.to_csv(path_original_data_csv())
return df
def _resample(df):
"""
Resample the contents of a Pandas data-frame by first
removing empty rows and columns, then up-sampling and
interpolating the data for 1-minute intervals, and
finally down-sampling to 60-minute intervals.
"""
# Remove all empty rows.
df_res = df.dropna(how='all')
# Upsample so the time-series has data for every minute.
df_res = df_res.resample('1T')
# Fill in missing values.
df_res = df_res.interpolate(method='time')
# Downsample so the time-series has data for every hour.
df_res = df_res.resample('60T')
# Finalize the resampling. (Is this really necessary?)
df_res = df_res.interpolate()
# Remove all empty rows.
df_res = df_res.dropna(how='all')
return df_res
########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.
def maybe_download_and_extract():
"""
Download and extract the weather-data if the data-files don't
already exist in the data_dir.
"""
download.maybe_download_and_extract(url=data_url, download_dir=data_dir)
def load_original_data():
"""
Load and return the original data that has not been resampled.
Note that this is not the raw data obtained from NCDC.
It is a cleaned-up version of that data, as written by the
function _convert_raw_data() above.
"""
return pd.read_pickle(path_original_data_pickle())
def load_resampled_data():
"""
Load and return the resampled weather-data.
This has data-points at regular 60-minute intervals where
missing data has been linearly interpolated.
This uses a cache-file for saving and quickly reloading the data,
so the original data is only resampled once.
"""
# Path for the cache-file with the resampled data.
path = path_resampled_data_pickle()
# If the cache-file exists ...
if os.path.exists(path):
# Reload the cache-file.
df = pd.read_pickle(path)
else:
# Otherwise resample the original data and save it in a cache-file.
# Load the original data.
df_org = load_original_data()
# Split the original data into separate data-frames for each city.
df_cities = [df_org.xs(city) for city in cities]
# Resample the data for each city.
df_resampled = [_resample(df_city) for df_city in df_cities]
# Join the resampled data into a single data-frame.
df = pd.concat(df_resampled, keys=cities,
axis=1, join='inner')
# Save the resampled data in a cache-file for quick reloading.
df.to_pickle(path)
return df
########################################################################
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/LemonIceSprite/TensorFlow-Tutorials.git
git@gitee.com:LemonIceSprite/TensorFlow-Tutorials.git
LemonIceSprite
TensorFlow-Tutorials
TensorFlow-Tutorials
master

搜索帮助