# Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models¶

OEN method reference: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

# 1. Set up¶

## 1.1 Environment¶

In [1]:
%matplotlib inline

# Project-wide star imports: `import_file` pulls in numpy/pandas/matplotlib
# (names such as np, pd, plt, mpl, arange, display used below come from here);
# parallel_helper provides the analysis helpers used in later cells.
from import_file import *
from helpers.parallel_helper import *

# Snap axis limits to round numbers and remove padding around the data.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
# Draw edges on patches (histogram bars) for readability.
mpl.rcParams['patch.force_edgecolor'] = True


In [2]:
# Dataset selection: uncomment exactly one `file_path` line.
# `bandwidth` is the KDE bandwidth and NUMBER_OF_GAUSSIAN the GMM component
# count tuned for that station; trailing comments record data-quality notes.

# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" #
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # improper direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incomplete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incomplete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt'
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt'
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt'
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt"
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt"
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy problem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt'

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" #
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionable data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionable data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwidth = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7

In [3]:
# Normalize the raw dataset to a common schema:
#   date (int yyyymmdd), HrMn (HHMM), type, dir (degrees), speed, wind_type
# Flags set here drive later cells: convert_to_knot, integer_data, knot_unit,
# cartesian.
# NOTE(review): the notebook export stripped all indentation from this cell
# (and apparently the file-reading lines that create `df`); the branch bodies
# below were re-indented from the visible statements — verify against the
# original .ipynb.
if "cn_database" in file_path:
    # NOTE(review): the body of this branch was lost in the export; restore
    # the cn_database loading code before using such a file.
    pass
elif 'NCDC' in file_path:
    # NCDC text export: rename columns to the common schema.
    df.rename(columns={'Date': 'date', 'Dir': 'dir', 'Spd': 'speed',
                       'Type': 'type', 'I.1': 'wind_type'}, inplace=True)
    df = df[['date', 'HrMn', 'type', 'dir', 'speed', 'wind_type']]
    df.dropna(subset=['dir', 'speed'], inplace=True)
    integer_data = True  # NCDC speeds/directions are integer-valued
elif 'NDAWN' in file_path:
    df['type'] = 'default'
    df['wind_type'] = 'default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df['type'] = 'default'
    df['wind_type'] = 'default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True  # ASOS speeds are already in knots
else:
    # ECMWF reanalysis: wind given as Cartesian U/V components.
    df.rename(columns={'U': 'x', 'V': 'y'}, inplace=True)
    # Sign flip — presumably to convert blowing-to into blowing-from
    # (meteorological) convention; TODO confirm against the data source.
    df.x = -df.x
    df.y = -df.y
    df['speed'] = np.sqrt(df.x**2 + df.y**2)
    df['dir'] = np.degrees(np.arctan2(df.y, df.x)) % 360
    # 'time' holds hours since the 1979-01-01 epoch.
    df['time'] = pd.to_datetime('1979-01-01T00:00:00Z') + pd.to_timedelta(df['time'], unit='h')
    df['date'] = df['time'].dt.strftime('%Y%m%d')
    df['date'] = df['date'].astype(int)
    df['HrMn'] = df['time'].dt.strftime('%H00')
    df['type'] = 'default'
    df['wind_type'] = 'default'
    convert_to_knot = True
    integer_data = False
    cartesian = True

In [4]:
df

Out[4]:
date HrMn type dir speed wind_type
0 19850312 1400 FM-18 130 13.0 N
1 19850312 1500 FM-18 130 12.0 N
2 19850312 1600 FM-18 130 12.0 N
3 19850312 1700 FM-18 160 7.0 N
4 19850312 1800 FM-18 170 8.0 N
5 19850312 1900 FM-18 190 7.0 N
6 19850312 2000 FM-18 210 6.0 N
7 19850312 2100 FM-18 190 4.0 N
8 19850312 2200 FM-18 170 6.0 N
9 19850312 2300 FM-18 270 8.0 N
10 19850313 0000 FM-18 270 7.0 N
11 19850313 0100 FM-18 240 6.0 N
12 19850313 0200 FM-18 230 6.0 N
13 19850313 0300 FM-18 220 4.0 N
14 19850313 0400 FM-18 230 6.0 N
15 19850313 0500 FM-18 240 6.0 N
16 19850313 0600 FM-18 250 6.0 N
17 19850313 0700 FM-18 240 8.0 N
18 19850313 0800 FM-18 260 10.0 N
19 19850313 0900 FM-18 240 10.0 N
20 19850313 1000 FM-18 260 10.0 N
21 19850313 1100 FM-18 250 10.0 N
22 19850313 1200 FM-18 260 11.0 N
23 19850313 1300 FM-18 260 10.0 N
24 19850313 1400 FM-18 290 11.0 N
25 19850313 1500 FM-18 290 12.0 N
26 19850313 1600 FM-18 310 13.0 N
27 19850313 1800 FM-18 300 11.0 N
28 19850313 1900 FM-18 290 11.0 N
29 19850313 2000 FM-18 290 12.0 N
... ... ... ... ... ... ...
235836 20170331 1800 FM-13 110 2.0 N
235837 20170331 1900 FM-13 999 999.9 C
235838 20170331 2000 FM-13 80 4.0 N
235839 20170331 2100 FM-13 90 4.0 N
235840 20170331 2200 FM-13 70 6.0 N
235841 20170331 2300 FM-13 80 6.0 N
235842 20170401 0000 FM-13 90 8.0 N
235843 20170401 0100 FM-13 80 8.0 N
235844 20170401 0200 FM-13 80 10.0 N
235845 20170401 0300 FM-13 90 11.0 N
235846 20170401 0400 FM-13 80 11.0 N
235847 20170401 0500 FM-13 80 11.0 N
235848 20170401 0600 FM-13 80 11.0 N
235849 20170401 0700 FM-13 80 12.0 N
235850 20170401 0800 FM-13 80 12.0 N
235851 20170401 0900 FM-13 70 11.0 N
235852 20170401 1000 FM-13 70 12.0 N
235853 20170401 1100 FM-13 70 12.0 N
235854 20170401 1200 FM-13 60 13.0 N
235855 20170401 1300 FM-13 50 12.0 N
235856 20170401 1400 FM-13 50 12.0 N
235857 20170401 1500 FM-13 40 13.0 N
235858 20170401 1600 FM-13 40 14.0 N
235859 20170401 1700 FM-13 30 14.0 N
235860 20170401 1800 FM-13 30 14.0 N
235861 20170401 1900 FM-13 30 14.0 N
235862 20170401 2000 FM-13 30 14.0 N
235863 20170401 2100 FM-13 30 15.0 N
235864 20170401 2200 FM-13 30 14.0 N
235865 20170401 2300 FM-13 10 14.0 N

235866 rows Ã— 6 columns

In [5]:
# For NCDC stations, look up the station coordinates and show them on an
# interactive map (folium). Export had stripped the indentation under the if.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat, long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    display(map_osm)

42.35 -70.69

In [6]:
# Build a DatetimeIndex from the integer date (yyyymmdd) and HrMn (HHMM)
# columns, then keep plausible records up to the end of 2016.
# (`.astype(str)` already yields strings; the original extra `.map(str)`
# was a redundant second conversion and has been removed.)
df['time'] = pd.to_datetime(df['date'].astype(str) + df['HrMn'], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn'] = df['HrMn'].astype(int)
# dir == 999 is the variable/missing sentinel and is kept for later handling;
# speeds >= 100 are treated as recording errors.
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']

In [7]:
# Quick look at the marginal speed and direction distributions (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))

In [8]:
# Dir [10,360] => [0,350]: fold 360 onto 0; the 999 sentinel is untouched.
df['dir'] = df['dir'].apply(lambda x: x % 360 if x < 999 else x)
# Convert windrose coordinates (0 = N, clockwise) to polar coordinates
# (0 = E, counter-clockwise). Cartesian (ECMWF) data arrives already polar,
# so there the same transform recovers the windrose view instead.
# (Indentation under if/else restored — it was lost in the notebook export.)
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x) % 360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x) % 360 if x < 999 else x)
display(df.describe())
df.plot(y='speed', legend=True, figsize=(20, 5))

date HrMn dir speed dir_windrose
count 2.334610e+05 233461.000000 233461.000000 233461.000000 233461.000000
mean 2.000936e+07 1147.748875 196.144380 6.089394 207.081881
std 9.282218e+04 692.306640 136.124931 3.331242 139.864066
min 1.985031e+07 0.000000 0.000000 0.000000 0.000000
25% 1.993041e+07 500.000000 120.000000 4.000000 120.000000
50% 2.000113e+07 1100.000000 190.000000 6.000000 210.000000
75% 2.009073e+07 1700.000000 260.000000 8.000000 280.000000
max 2.016123e+07 2300.000000 999.000000 26.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x4ff23c8>

## 1.3 General Data Info¶

### 1.3.1 Unit Detection¶

In [9]:
# Unit detection: data recorded in knots but stored as m/s shows many
# fractional speeds, so a high share of non-integer values suggests the
# source should be converted back to knots.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    # >30% of samples with fractional part >= 0.2 -> treat source as m/s.
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False

if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845  # m/s -> knot
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False

# Drop the helper column (axis given by keyword; positional axis is deprecated).
df.drop(['decimal'], axis=1, inplace=True)
print(knot_unit)

False

In [10]:
# Axis-label suffixes used by later plots (indentation restored).
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else:
    speed_unit_text = ' (m/s)'


### 1.3.2 Sampling Type Selection¶

In [11]:
# Keep only the report type that dominates recent (post-2000) records so the
# sampling characteristics stay homogeneous.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind='bar', title='Report Types Comprisement', figsize=(4, 3))

# idxmax() returns the LABEL of the most frequent type; Series.argmax used
# for label lookup is deprecated in favour of idxmax.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")


### 1.3.3 Sampling Time Selection¶

In [12]:
# Compare sampling-time coverage overall vs. the second half of the record
# to spot changes in observation schedule. (Continuation-line indentation
# restored from the export.)
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' % MID_YEAR)

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc': 'best'}, figsize=(8, 4),
              title='Sampling Time Distribution, Overall and > %s ' % MID_YEAR)

In [13]:
# Restrict to the most common minute-of-hour so samples are evenly spaced.
df['sample_time'] = df.HrMn % 100
sample_time = df['2000':]['sample_time']
# Minutes with enough (> 2000) recent records are candidate sampling times.
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
# axis given by keyword; positional axis in DataFrame.drop is deprecated.
df.drop(['sample_time'], axis=1, inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))

[0]

Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xd66f5f8>

## 1.4 Error Data handling and Adjustment¶

### 1.4.1 Artefacts¶

Wrong direction records

In [14]:
# Integer datasets should report directions on whole 10-degree steps; anything
# else (except the 999 sentinel) is an artefact and is dropped.
# (Indentation under the if restored from the export.)
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')

date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))

date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-11 18:00:00 19921211 1800 FM-18 10 22.0 N 80 2.0 2.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-13 02:00:00 19921213 200 FM-18 40 22.0 N 50 1.0 3.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd5894a8>
In [16]:
# A record is a spike when speed jumps by more than the threshold both from
# the previous record and to the next; drop those spikes.
incre_threshold = 20 if knot_unit else 10  # knot vs. m/s scale
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
# axis given by keyword; positional axis in DataFrame.drop is deprecated.
df.drop(['incre', 'incre_reverse'], axis=1, inplace=True)

sudden increase number 1

date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1986-12-19 11:00:00 19861219 1100 FM-18 70 22.0 N 20 2.0 1.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-12 16:00:00 19921212 1600 FM-18 50 22.0 N 40 0.0 3.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0

### 1.4.2 Direction re-aligment¶

For some datasets, the 16 sectors are not recorded properly;

e.g. the sectors appear as [0, 20, 50, ...]; the angles need to be redistributed onto even 22.5° sectors, i.e. [0, 22.5, 45, ...]

In [17]:
display(df['dir'].value_counts().sort_index())
# Directions (excluding the 999 sentinel) with a meaningful number of records
# define the effective sectors.
# Fix: the original built the ">30" boolean mask from the UNFILTERED counts,
# whose index includes 999 and therefore does not align with the filtered
# counts; compute the counts once on the filtered series instead.
valid_dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = valid_dir_counts[valid_dir_counts > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360 / len(effective_column)
else:
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)

0      3268
10     2973
20     3265
30     3460
40     3945
50     4485
60     4641
70     4754
80     4396
90     4144
100    3936
110    4522
120    5801
130    6521
140    7334
150    7647
160    7957
170    8295
180    8399
190    6724
200    6296
210    6639
220    6665
230    7142
240    7168
250    7362
260    6459
270    5605
280    4961
290    5359
300    6069
310    5671
320    4648
330    4157
340    3575
350    3125
999    3636
Name: dir, dtype: int64
36 10.0

In [18]:
# Redistribute misaligned sector angles onto even sectors (project helper).
df=realign_direction(df, effective_column)


### 1.4.3 0 Speed¶

In [19]:
# If calm (0-speed) records are over-represented in recent data, drop them so
# the fitted speed distribution is not dominated by the zero spike.
# (Indentation under the if restored from the export.)
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)

False 0.0104086353123

In [20]:
# Inspect speeds of variable-direction (dir == 999) records, then spread
# those records across the effective sectors (project helper).
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)

0.0    3636
Name: speed, dtype: int64


## 1.5 Time Shift Comparison¶

In [21]:
# Direction histogram bins: 'even' centers sectors on multiples of 10
# (bins -5..355); 'round_up' uses edges [0, 10, ..., 360].
# (if/elif and continuation indentation restored from the export.)
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10)
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360 + 10, 10)

# Comparison between mid_year, looking for:
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5, bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR + 1):]['speed'].plot(
    kind='hist', alpha=0.5, bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize=14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))

In [22]:
# Same first-half vs. second-half comparison for direction.
# (Continuation-line indentation restored from the export.)
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5, bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR + 1):]['dir'].plot(
    kind='hist', alpha=0.5, bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize=14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc': 'best'}, figsize=(8, 3), tight='x')

In [23]:
# Directions can become NaN during realignment; show and then drop such rows.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)

date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias,
# in multi-year windows (project helper; unit labels set in cell In[10]).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)

1986 - 1990

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))

1991 - 1995

1996 - 2000

2001 - 2004

2006 - 2010

2011 - 2013

In [25]:
# Yearly ('A') and monthly ('M') mean speed, to eyeball long-term trends;
# y-axis pinned at 0 so amplitude is comparable between plots.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)

Out[25]:
(0, 10.0)
In [26]:
%%time
for column in ['speed', 'dir']:
if column == 'speed':
bins = arange(0, df[column].max()+1, 1)
else:
bins = arange(0, 361, 10)
den, _ = np.histogram(df[column], bins=bins, density=True)
y_top=max(den)*1.2
for year in arange(1980, 2016):
end_year = year
sub_df = df[str(year):str(end_year)]
if len(sub_df) > 1000:
plt.figure()
df[column].hist(bins=bins, alpha=0.3, normed=True)
sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
plt.gca().set_ylim(top=y_top)
plt_configure(title=str(year))
align_figures()

D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (matplotlib.pyplot.figure) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam figure.max_open_warning).
max_open_warning, RuntimeWarning)