Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# NOTE(review): star imports pull in the scientific stack via the project's
# import_file module -- bare names used below (np, mpl, plt, arange, plot,
# array, ceil, exp, log, cos, sin, pi, neighbors, folium, WindroseAxes, and
# the plt_configure/align_figures helpers) presumably come from here.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Axis style: round tick limits to whole numbers and remove margin padding.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# Active dataset: Al Maktoum (UAE).  `bandwidth` forces the KDE bandwidth
# (skipping the grid search below); other stations are kept as a catalogue.
file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
In [3]:
# Load the raw records into `df` and normalise the layout to the columns
# [date, HrMn, type, dir, speed, wind_type].  Source-specific flags
# (integer_data, convert_to_knot, knot_unit, cartesian) steer later cleaning;
# note they are only defined in the branches that set them.
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC export: HrMn kept as a string so leading zeros ("0500") survive.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    # NDAWN hourly averages: already decimal, no report-type information.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    # ECMWF reanalysis: U/V components; sign is flipped so the vector points
    # in the direction the wind blows FROM (meteorological convention).
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    # 'time' column holds hours since the 1979-01-01 reanalysis epoch.
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
# Quick look at the parsed records (rich display of the DataFrame).
df
Out[4]:
date HrMn type dir speed wind_type
0 20100518 0500 FM-15 999 1.0 V
1 20100518 0600 FM-15 999 1.0 V
2 20100518 0700 FM-15 330 2.1 N
3 20100527 0900 FM-15 190 5.1 N
4 20100614 0400 FM-15 170 7.2 N
5 20100614 0500 FM-15 160 6.2 N
6 20100614 0600 FM-15 170 5.1 N
7 20100614 0700 FM-15 180 2.6 N
8 20100614 0800 FM-15 999 1.5 V
9 20100614 1100 FM-15 310 7.2 N
10 20100614 1200 FM-15 340 7.7 N
11 20100614 1300 FM-15 330 8.2 N
12 20100614 1400 FM-15 320 6.2 N
13 20100614 1500 FM-15 320 2.6 N
14 20100614 1700 FM-15 340 2.6 N
15 20100614 1800 FM-15 360 3.1 N
16 20100614 1900 FM-15 90 2.6 N
17 20100614 2000 FM-15 100 3.6 N
18 20100614 2200 FM-15 110 2.1 N
19 20100614 2300 FM-15 120 2.1 N
20 20100615 0000 FM-15 140 1.0 N
21 20100615 0100 FM-15 150 2.1 N
22 20100615 0200 FM-15 130 0.5 N
23 20100615 0400 FM-15 150 2.6 N
24 20100615 0500 FM-15 160 6.2 N
25 20100615 0600 FM-15 160 4.1 N
26 20100615 0700 FM-15 170 3.1 N
27 20100615 0800 FM-15 999 2.1 V
28 20100615 0900 FM-15 999 1.0 V
29 20100615 1000 FM-15 340 8.2 N
... ... ... ... ... ... ...
61722 20170331 1800 FM-15 120 4.1 N
61723 20170331 1900 FM-15 120 3.6 N
61724 20170331 2000 FM-15 90 2.6 N
61725 20170331 2100 FM-15 110 3.1 N
61726 20170331 2200 FM-15 110 3.1 N
61727 20170331 2300 FM-15 130 3.1 N
61728 20170401 0000 FM-15 80 2.1 N
61729 20170401 0100 FM-15 120 2.1 N
61730 20170401 0200 FM-15 80 2.1 N
61731 20170401 0300 FM-15 40 2.1 N
61732 20170401 0400 FM-15 999 0.5 V
61733 20170401 0500 FM-15 310 1.5 N
61734 20170401 0600 FM-15 330 1.5 V
61735 20170401 0700 FM-15 999 1.0 V
61736 20170401 0800 FM-15 290 3.1 V
61737 20170401 0900 FM-15 330 3.1 V
61738 20170401 1000 FM-15 320 2.6 V
61739 20170401 1100 FM-15 330 2.6 V
61740 20170401 1200 FM-15 340 2.1 V
61741 20170401 1300 FM-15 340 6.2 N
61742 20170401 1400 FM-15 330 4.1 N
61743 20170401 1500 FM-15 330 4.1 N
61744 20170401 1600 FM-15 340 2.6 N
61745 20170401 1700 FM-15 20 1.0 N
61746 20170401 1800 FM-15 100 1.5 N
61747 20170401 1900 FM-15 90 3.1 N
61748 20170401 2000 FM-15 80 3.1 N
61749 20170401 2100 FM-15 60 2.6 N
61750 20170401 2200 FM-15 80 2.1 N
61751 20170401 2300 FM-15 90 3.1 N

61752 rows × 6 columns

In [5]:
# For NCDC stations, read the station coordinates from the data file header
# and show the location on an interactive map.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
24.886 55.172
In [6]:
# Build a DatetimeIndex from date (yyyymmdd) and HrMn (hhmm), then keep only
# plausible records: dir <= 999 (999 is the variable-direction marker) and
# speed < 100, restricted to the years 1970-2016.
# Fix: the original chained `.astype(str).map(str)` -- the `.map(str)` was a
# redundant second string conversion and is dropped.
df['time'] = pd.to_datetime(df["date"].astype(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn'] = df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
# Overview histograms of speed and direction (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Map directions 10..360 onto 0..350 (the 999 variable marker is preserved).
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates (clockwise from north) to polar coordinates
# (counter-clockwise from east), keeping the original in dir_windrose.
# ECMWF-style inputs ('cartesian' set) are already polar.
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 5.938300e+04 59383.000000 59383.000000 59383.000000 59383.000000
mean 2.013312e+07 1135.866376 226.156712 4.131822 237.997642
std 1.889311e+04 692.584200 210.592490 2.311919 208.489040
min 2.010052e+07 0.000000 0.000000 0.000000 0.000000
25% 2.012022e+07 500.000000 120.000000 2.600000 120.000000
50% 2.013101e+07 1100.000000 180.000000 3.600000 200.000000
75% 2.015052e+07 1700.000000 280.000000 5.700000 300.000000
max 2.016123e+07 2359.000000 999.000000 20.100000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xc27f940>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# Unit detection: if a large share of speeds carries a fractional part the
# data is taken to be m/s and converted to knots; integer-valued sources are
# rounded back to integers afterwards.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    # Heuristic: >30% of records with a fractional part >= 0.2 implies m/s.
    fraction_with_decimals = len(df.query('decimal >= 0.2')) / len(df)
    convert_to_knot = fraction_with_decimals > 0.3

if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
elif 'knot_unit' not in globals():
    knot_unit = False

df.drop(['decimal'], 1, inplace=True)
print(knot_unit)
True
In [10]:
# Axis-label suffixes reflecting the detected units.
dir_unit_text = ' (degree)'
speed_unit_text = ' (knot)' if knot_unit else ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Keep only the most frequently used report type (e.g. FM-15/METAR) so the
# sampling scheme is homogeneous; only post-2000 records vote on the winner.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

# Fix: use idxmax() to get the *label* of the largest count.  Series.argmax
# was deprecated and later changed to return the positional index in modern
# pandas, which would silently yield an integer instead of the type string.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Compare the sampling-hour distribution of the full record against its more
# recent half to spot changes in the observation schedule.
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only records taken at the dominant minute-within-the-hour (usually
# :00) so mixed sampling schemes do not bias the distributions.
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
# minutes with more than 2000 post-2000 records count as "regular"
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3edd30>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

Wrong direction records

In [14]:
# Integer datasets should only report directions in 10-degree steps (plus the
# 999 variable marker); anything else is an artefact and is dropped.
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# Sudden increase detection: `incre` is the speed change from the previous
# record, `incre_reverse` the change towards the next one; NaNs at the edges
# become 0 (no jump).
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2016-03-09 10:00:00 20160309 1000 FM-15 60 36 N 30 13.0 13.0
2012-04-14 00:00:00 20120414 0 FM-15 180 34 N 270 8.0 26.0
2015-02-21 06:00:00 20150221 600 FM-15 280 32 N 170 3.0 1.0
2012-02-26 05:00:00 20120226 500 FM-15 270 32 N 180 7.0 1.0
2012-02-26 00:00:00 20120226 0 FM-15 290 32 N 160 2.0 0.0
2012-02-26 01:00:00 20120226 100 FM-15 280 32 N 170 0.0 8.0
2012-02-26 06:00:00 20120226 600 FM-15 270 31 N 180 -1.0 7.0
2011-01-28 10:00:00 20110128 1000 FM-15 110 31 N 340 23.0 5.0
2015-02-21 07:00:00 20150221 700 FM-15 270 31 N 180 -1.0 1.0
2015-02-21 08:00:00 20150221 800 FM-15 270 30 N 180 -1.0 4.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xc65f9b0>
In [16]:
# A record is an isolated spike when the speed jumps by more than the
# threshold on BOTH sides; such records are removed.
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 1
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2016-03-09 10:00:00 20160309 1000 FM-15 60 36 N 30 13.0 13.0
2012-04-14 00:00:00 20120414 0 FM-15 180 34 N 270 8.0 26.0
2012-02-26 05:00:00 20120226 500 FM-15 270 32 N 180 7.0 1.0
2015-02-21 06:00:00 20150221 600 FM-15 280 32 N 170 3.0 1.0
2012-02-26 01:00:00 20120226 100 FM-15 280 32 N 170 0.0 8.0
2012-02-26 00:00:00 20120226 0 FM-15 290 32 N 160 2.0 0.0
2015-02-21 07:00:00 20150221 700 FM-15 270 31 N 180 -1.0 1.0
2011-01-28 10:00:00 20110128 1000 FM-15 110 31 N 340 23.0 5.0
2012-02-26 06:00:00 20120226 600 FM-15 270 31 N 180 -1.0 7.0
2012-02-25 23:00:00 20120225 2300 FM-15 290 30 N 160 2.0 -2.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly,

e.g. the sectors are [0, 20, 50, ...]; the angles need to be redistributed onto a regular 22.5° grid, e.g. [0, 22.5, 45, ...]

In [17]:
display(df['dir'].value_counts().sort_index())
# Directions with a meaningful number of records (>30) define the sector
# layout; for integer data the sector width is inferred from their count
# (note: 360/len(...) yields a float, e.g. 10.0 for 36 sectors).
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      1535
10     1354
20     1263
30     1208
40     1043
50      984
60      864
70      795
80      939
90     1073
100    1208
110    1596
120    2068
130    1973
140    2554
150    2690
160    2782
170    1791
180    1222
190     859
200     765
210     819
220    1001
230     995
240    1412
250    1376
260    1482
270    2078
280    2232
290    2133
300    1825
310    1310
320    1100
330    1115
340    1282
350    1394
999    3156
Name: dir, dtype: int64
36 10.0
In [18]:
# Redistribute mis-aligned sector angles onto a regular grid (project helper).
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# If calm (zero-speed) records are over-represented in the recent data, drop
# them entirely; otherwise they are kept and redistributed later.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00341920544178
In [20]:
# 999 marks "variable direction"; show which speeds carry it (mostly light
# winds), then spread those records over the sectors (project helper).
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
2     1403
1      533
3      519
4      257
0      189
5      152
6       65
7       26
8        7
10       3
11       1
9        1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction histogram bin edges; 'even' centres the bins on reported angles.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# First-half vs second-half comparison of the direction distribution.
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Show, then drop, records whose direction could not be assigned.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
2010-05-18 05:00:00 20100518 500 FM-15 NaN 2 V 999
2010-05-18 06:00:00 20100518 600 FM-15 NaN 2 V 999
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
# (project helper; presumably compares sub-period distributions -- TODO confirm).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
2011 - 2015
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [25]:
# Annual and monthly mean speed, with the y-axis pinned at zero to make
# long-term trends comparable.
annual_ax = df.resample('A').mean().plot(y='speed')
annual_ax.set_ylim(bottom=0)
monthly_ax = df.resample('M').mean().plot(y='speed', figsize=(20,4))
monthly_ax.set_ylim(bottom=0)
Out[25]:
(0, 10.0)
In [26]:
%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
Wall time: 4.24 s
In [27]:
# Rolling 3-year-window R-square against the overall distribution: a low
# value flags periods whose speed/direction distribution deviates from the
# long-run one (used above to reject inhomogeneous stations).
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # window covers [year-1, year+1]
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # keep the y-range comparable across figures, capped to [0, 1]
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread integer directions uniformly inside their sector so the KDE is not
# dominated by the discrete reporting grid (project helper).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Spread integer speeds inside their unit bin; the direction of the spread
# depends on whether calm (zero-speed) records were removed earlier.
if integer_data:
    redistribute_method = 'down' if delete_zero else 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# A small dot may appear at the centre when many zero-speed records remain.
# (A full scatter plot in matplotlib is slow on large samples.)
angles_rad = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(angles_rad)
df['y'] = df['speed'] * sin(angles_rad)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
# Restrict the model-fitting window to 2011-2015.
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-15
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 4.206600e+04 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000
mean 2.013044e+07 1153.544430 179.874592 8.447488 237.262920 -1.553747 0.376113
std 1.405171e+04 691.989734 98.737423 4.375064 208.025899 6.507981 6.752294
min 2.011010e+07 0.000000 -4.978698 0.005481 0.000000 -34.524102 -32.472160
25% 2.012033e+07 600.000000 110.516651 5.156952 110.000000 -6.005882 -4.253203
50% 2.013062e+07 1200.000000 167.650918 7.482294 200.000000 -0.962347 0.459120
75% 2.014092e+07 1800.000000 269.171783 11.220914 300.000000 3.482725 5.152950
max 2.015123e+07 2300.000000 354.999753 34.535472 999.000000 27.009530 29.545306
In [33]:
# Speed time series over the selected fitting window.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3aed30>
In [34]:
# Accumulation by month
# (record counts per month -- gaps reveal missing data in the window)
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0xc6c35f8>
In [35]:
# 90 degree is east
# Windrose of the selected data: 36 sectors, normalised frequencies.
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# Very large datasets are downsampled to keep KDE fitting tractable; the two
# overlaid histograms verify the resampling preserves the direction
# distribution.  Fix: corrected the label typo 'resmapled' -> 'resampled'.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Fit a Weibull distribution to the speed magnitudes and compare it with the
# empirical distribution (project helper returns PDF/CDF on grid `x` plus
# the empirical CDF).
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
# On ln(V) vs ln(-ln(1-P)) axes a Weibull CDF is a straight line, so
# deviations from linearity expose a poor fit.
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Scatter of the Cartesian samples; equal aspect keeps directions undistorted.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Re-binning width for the per-direction histograms: 22.5 degrees for
# 16-sector data, otherwise 10 degrees.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
%%time
# Per-sector speed histograms: split the circle into `incre`-degree sectors
# and plot each sector's distribution on a common axis range for comparison.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 8.19 s

2.3 Overview by Month

In [41]:
%%time
# Monthly windroses (speed >= 1 only) to inspect seasonal variation.
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.7 s
In [42]:
# Summary statistics after all cleaning and selection steps.
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 4.206600e+04 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000
mean 2.013044e+07 1153.544430 179.874592 8.447488 237.262920 -1.553747 0.376113
std 1.405171e+04 691.989734 98.737423 4.375064 208.025899 6.507981 6.752294
min 2.011010e+07 0.000000 -4.978698 0.005481 0.000000 -34.524102 -32.472160
25% 2.012033e+07 600.000000 110.516651 5.156952 110.000000 -6.005882 -4.253203
50% 2.013062e+07 1200.000000 167.650918 7.482294 200.000000 -0.962347 0.459120
75% 2.014092e+07 1800.000000 269.171783 11.220914 300.000000 3.482725 5.152950
max 2.015123e+07 2300.000000 354.999753 34.535472 999.000000 27.009530 29.545306

3. Create input data and configuration

In [43]:
# Input sample for KDE/GMM fitting: one (x, y) row per record.
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): name misspelled ("BANDWDITH") but referenced later in the
# notebook -- kept as-is to avoid breaking those cells.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
# Build a square (x, y) evaluation grid covering the central 95% of speeds.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-17 -16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0
   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17]
In [45]:
# Axis range for plotting (same 95%-quantile limit as the fitting grid).
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
# KDE configuration; a Gaussian kernel is used throughout.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
# Bandwidth selection: if no bandwidth was forced in the data-selection cell,
# pick one by 4-fold cross-validated grid search over a unit-dependent range.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search is the pre-0.18 module; modern
    # scikit-learn only has sklearn.model_selection.GridSearchCV.
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
1.1
Wall time: 0 ns
In [48]:
# Fall back to the default if the bandwidth-selection cell above was skipped.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

# Fit the bivariate KDE on the (x, y) sample with the chosen kernel/bandwidth.
kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.1 1225
[  7.58432771e-08   3.20539294e-07   6.95667192e-07   1.13492786e-06
   1.99381126e-06]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared color scale so later model plots are directly comparable to the KDE.
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

# Raw 2-D histogram of the data for visual comparison with the smoothed KDE.
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
# CDF of the KDE jPDF, used as the reference for the bootstrap GOF statistics.
kde_cdf = cdf_from_pdf(kde_result)
# Shared configuration passed to the parallel resampling helpers below.
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [51]:
%%time
# Bootstrap: refit the KDE on 20 resampled datasets in parallel and collect
# each refit's goodness-of-fit statistics against the reference kde_result.
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 20 s
In [52]:
# Histogram the bootstrap GOF statistics, one small figure per metric.
gof_kde_df = pd.DataFrame(gof_kde)
for gof_metric in ['R_square', 'K_S', 'Chi_square']:
    plt.figure(figsize=(4, 3))
    gof_kde_df[gof_metric].hist()
    plt_configure(title=gof_metric)
align_figures()

4.2 Bivariate Empirical Limit

In [55]:
# NOTE(review): `gofs_bivar` is only assigned in the NEXT cell, so this display
# shows a value left over from an earlier run (hidden state); under
# Restart & Run All this cell raises NameError. Consider moving it below.
gofs_bivar
Out[55]:
In [56]:
%%time
# Bivariate empirical limit: score the KDE of each rolling historical window
# (5- and 10-year) against the KDE of the most recent window of the same length.
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    # Reference period: the latest `year_length` years ending in 2015.
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    # NOTE(review): `start_year` is reused as the generator variable below; the
    # arange() bounds are evaluated with the outer value first, so this works,
    # but a distinct loop-variable name would be clearer.
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
    if len(gofs_bivar)>0:
        gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
        gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
        # Average the statistics over an 11-year span of window start years.
        year_lim = end_year-year_length-10, end_year-year_length
        gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) NaN NaN NaN NaN NaN NaN

4.3 Univariate GOF Limit

In [57]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    """Goodness-of-fit of the [start_year, end_year] window against the reference period.

    Args:
        df_all_years: full time-indexed DataFrame of observations.
        start_year, end_year: inclusive year bounds of the historical window.
        density: reference speed histogram (density-normalized) on bins `x`.
        y_ecdf: reference speed ECDF evaluated at `x`.
        x: common speed bin edges / evaluation points.
        density_dir: reference direction histogram over 10-degree sectors.

    Returns:
        dict with the window's start year, speed R^2, speed K-S statistic,
        and direction R^2 relative to the reference period.
    """
    window = df_all_years[str(start_year):str(end_year)]

    # 1. Speed: R^2 between the reference density and this window's histogram
    density_window, _ = np.histogram(window['speed'], bins=x, density=True)
    r_square = sector_r_square(density, density_window)

    # Speed: Kolmogorov-Smirnov distance between the two ECDFs
    y_ecdf_window = sm.distributions.ECDF(window['speed'])(x)
    k_s = np.abs(y_ecdf - y_ecdf_window).max()

    # 2. Direction: R^2 on 10-degree sector histograms
    density_dir_window, _ = dir_hist(window['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, density_dir_window)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [62]:
%%time
# Univariate limit: speed and direction GOF of rolling historical windows
# against the most recent window of the same length (5, 7, 10 years).
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    # Reference: the latest `year_length` years ending in 2015.
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    # NOTE(review): `start_year` is reused as the comprehension variable; the
    # arange() bounds use the outer value, so this works but reads poorly.
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]
    if len(gofs)>0:
        gofs = pd.DataFrame(gofs)
        gofs.set_index(['year'], inplace=True)  
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
        # Average the statistics over an 11-year span of window start years.
        year_lim = end_year-year_length-10, end_year-year_length
        gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(1996, 2006) NaN NaN NaN
Wall time: 365 ms

5. GMM by Expectation-maximization

In [63]:
# Fit a GMM by expectation-maximization as an initial estimate.
# NOTE(review): no random_state is set, so the EM initialization is not
# reproducible across runs -- consider fixing a seed.
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [64]:
# Convert the fitted sklearn model into the project's flat parameter array
# (weight, mean_x, mean_y, sig_x, sig_y, corr per component) and display it.
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[64]:
weight mean_x mean_y sig_x sig_y corr
1 0.392 -1.081 -4.889 4.453 5.444 -0.409
2 0.316 3.965 0.903 3.735 4.144 0.042
3 0.291 -8.180 6.888 5.121 4.232 0.271
In [65]:
# Visualize each EM component as an ellipse in the (x, y) speed plane.
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.39214374523 [[-1.08145626 -4.8888137 ]] [ 3.69728893  5.98295965] -148.15149755
0.316409427461 [[ 3.96454531  0.90307035]] [ 3.71777203  4.15912948] 169.063085492
0.291446827309 [[-8.18015384  6.88801886]] [ 3.85656019  5.40984061] -62.6391653746
In [66]:
# Evaluate the EM-fitted GMM on the plotting grid and compare it with the KDE.
X = Y = PLOT_AXIS_RANGE

def em_gmm_density(coords):
    """Probability density of the EM-fitted GMM at the given (x, y) coordinates."""
    return exp(clf.score_samples(coords))

def kde_minus_em_gmm(coords):
    """Pointwise residual: KDE density minus EM-GMM density."""
    return exp(kde.score_samples(coords)) - em_gmm_density(coords)

pdf_Z = generate_Z_from_X_Y(X, Y, em_gmm_density)
residual_Z = generate_Z_from_X_Y(X, Y, kde_minus_em_gmm)

# 3-D surfaces of the model density and of the residual
plot_3d_prob_density(X, Y, pdf_Z)
plot_3d_prob_density(X, Y, residual_Z)
align_figures()

# 2-D maps: KDE reference, EM-GMM (shared color scale), and their residual
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [67]:
# Score the EM-fitted GMM against the KDE reference on the fitting grid.
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[67]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.932 0.019 0.062 7.015380e-08 0.053 0.332
In [68]:
# Regroup the EM parameters and build a pdf callable; kept for comparison
# with the optimization-based fit below.
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [69]:
# Inputs for the optimization-based fit.
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [70]:
# Refine the EM solution by constrained minimization of the fit score.
# Parameter vector layout per component: weight, mean_x, mean_y, sig_x, sig_y, rho.
x0 = gmm_em_result

cons = [
    # the component weights (every 6th element) must sum to one
    {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
    # # limit the width/height ratio of elliplse, optional
    # {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
    # {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

# One (lower, upper) pair per parameter, repeated for every Gaussian component.
single_component_bounds = [(0., 0.99),
                           (-fit_limit, fit_limit), (-fit_limit, fit_limit),
                           (0., fit_limit), (0., fit_limit),
                           (-0.99, 0.99)]
param_bounds = single_component_bounds * (len(x0) // 6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds=param_bounds,
    constraints=cons,
    tol=1e-12,
    options={"maxiter": 500})
result
Out[70]:
     fun: -17.487047747921288
     jac: array([  1.31396914e+00,   0.00000000e+00,  -2.38418579e-07,
         2.38418579e-07,  -2.38418579e-07,  -7.15255737e-07,
         1.31397510e+00,  -2.38418579e-07,   0.00000000e+00,
        -4.76837158e-07,   4.76837158e-07,  -4.76837158e-07,
         1.31397033e+00,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,   0.00000000e+00,  -1.43051147e-06,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 691
     nit: 34
    njev: 34
  status: 0
 success: True
       x: array([ 0.21749595,  4.81770768,  0.25496374,  2.34390443,  3.4996429 ,
       -0.13601912,  0.34900005, -0.96243785, -4.96121765,  4.03854841,
        3.91285527, -0.41001555,  0.43350399, -6.75039928,  6.07358941,
        6.96218254,  4.56096909,  0.01542211])

6.1 GMM Result

In [71]:
# Regroup the optimized parameter vector, build its pdf, and evaluate on the grid.
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[71]:
weight mean_x mean_y sig_x sig_y corr
1 0.434 -6.750 6.074 6.962 4.561 0.015
2 0.349 -0.962 -4.961 4.039 3.913 -0.410
3 0.217 4.818 0.255 2.344 3.500 -0.136
In [72]:
# Ellipse plot of the optimized GMM components.
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.43350399117 [[-6.75039928  6.07358941]] [ 4.56001911  6.96280479] -88.9863559092
0.349000054847 [[-0.96243785 -4.96121765]] [ 3.05151472  4.72319464] -132.794842144
0.217495953983 [[ 4.81770768  0.25496374]] [ 2.30528388  3.52520278] -170.857880113

6.2 Bivariate Goodness-of-fit statistics

In [73]:
# Bivariate GOF of the optimization-based GMM against the KDE reference.
gof_df(gmm_pdf_result, kde_result)
Out[73]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.017 0.171 2.543734e-08 0.032 0.200
In [74]:
# KDE-resampling reference levels for the bivariate GOF, for comparison.
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[74]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) NaN NaN NaN NaN NaN NaN
In [75]:
# Evaluate the optimized GMM on the plotting grid and compare it with the KDE.
X = Y = PLOT_AXIS_RANGE

def kde_minus_opt_gmm(coords):
    """Pointwise residual: KDE density minus optimized-GMM density."""
    return exp(kde.score_samples(coords)) - mixed_model_pdf(coords)

# passing a function as an argument
pdf_Z = generate_Z_from_X_Y(X, Y, mixed_model_pdf)
residual_Z = generate_Z_from_X_Y(X, Y, kde_minus_opt_gmm)

# 3-D surfaces of the model density and of the residual
plot_3d_prob_density(X, Y, pdf_Z)
plot_3d_prob_density(X, Y, residual_Z)
align_figures()

# 2-D maps: KDE reference, optimized GMM, and their residual
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [76]:
def f(V, theta):
    """Joint pdf of the optimized GMM in polar coordinates: p(x, y) * V (Jacobian)."""
    x, y = V * cos(theta), V * sin(theta)
    return mixed_model_pdf([[x, y]]) * V

def f_em(V, theta):
    """Polar-coordinate joint pdf of the EM-fitted GMM, with the same Jacobian factor V."""
    x, y = V * cos(theta), V * sin(theta)
    return mixed_model_pdf_em([[x, y]]) * V
In [77]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model: marginal speed pdf p(V) by integrating the polar pdf over all
# directions within a thin slab [V-0.01, V+0.01], then dividing by its width.
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison (densities scaled to counts by the sample size)
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull (and the EM-fitted GMM via f_em)
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.97338573924997929, 0.93373447449521618, 0.93829643702763399)
Wall time: 32.8 s
In [78]:
%%time
# Marginal speed CDF of the GMM: integrate the polar pdf over all directions
# up to each speed, then compare with the Weibull CDF and the empirical CDF.
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

# Weibull plot: a true Weibull CDF is a straight line in these coordinates.
# log(0) at x=0 triggers the harmless divide-by-zero RuntimeWarnings below.
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

# Report the K-S statistic and the speed at which it occurs for both models.
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0240836004578 0.0460066091602
7.0 7.0
Wall time: 27.5 s
In [79]:
# Calculate Angle Distribution
# Integrate the polar GMM pdf over all speeds within each 10-degree sector to
# get the marginal direction distribution, and compare it with the data.
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
# R^2 between data sectors and the model (0 and 360 degrees coincide, drop last)
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[79]:
0.8899034217098688
In [80]:
# Univariate GOF reference levels computed earlier, for comparison with the
# GMM's speed/direction R^2 above.
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[80]:
k_s r_square r_square_dir
year_lim
(1996, 2006) NaN NaN NaN

6.4 Sectoral Comparison

In [81]:
%%time
# Sector-wise comparison at the data's native sector width (at least 10 deg),
# evaluated in parallel.
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution construction approaches for analysing wind speed and direction data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.904634357711
Wall time: 10.9 s
In [82]:
# Kept for reference: a parallel variant of the sectoral comparison above.
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [83]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Sector-by-sector comparison of the fitted GMM and per-sector Weibull fits
    against the observed wind data.

    For each direction sector of width `incre` (offset so rebinned sectors stay
    aligned with the original `original_incre`-degree binning), plot the observed
    speed histogram with both model pdfs, the ECDF with both model CDFs, and
    collect per-sector GOF statistics (R^2 and max CDF difference).

    Args:
        df: observation DataFrame with `speed` and direction-derived columns.
        original_incre: sector width (degrees) of the raw data binning.
        incre: rebinned sector width (degrees) used for the comparison.

    Returns:
        list of dicts, one per sector, holding the curves and GOF statistics.
    """
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram so all panels share one y-scale
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF (integrated over this sector's angular range)
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        # density=True replaces the deprecated normed=True (identical result for
        # equal-width bins; `normed` was removed in NumPy 1.24)
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        # nquad returns (value, abserr); keep the values and renormalize by the
        # sector probability to obtain a conditional speed density
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [84]:
%%time
# Rebin to ~20-degree sectors (22.5 when the station reports 16 compass points)
# so each sector holds enough samples for a stable histogram.
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1623 weight 0.038582227927542435
GMM Weibull
R square 0.968932084792 0.852369381409
max diff: 0.0595515449171 0.198427167935 speed value: 10.1629733605 4.35556001166 y gmm 0.964049388417
 
25.0 (15.0 - 35.0) degree
data size: 1963 weight 0.04666476489326297
GMM Weibull
R square 0.963135497897 0.809260720149
max diff: 0.0778809867194 0.0925059045251 speed value: 9.78690387245 6.99064562318 y gmm 0.921487711121
 
45.0 (35.0 - 55.0) degree
data size: 1674 weight 0.03979460847240052
GMM Weibull
R square 0.971479358342 0.949920446149
max diff: 0.0357451004303 0.0911978279373 speed value: 3.22865961112 4.30487948149 y gmm 0.116390261721
 
65.0 (55.0 - 75.0) degree
data size: 1377 weight 0.03273427471116817
GMM Weibull
R square 0.928734801949 0.963173442922
max diff: 0.102856708103 0.033696730751 speed value: 7.57405778767 7.57405778767 y gmm 0.565262391389
 
85.0 (75.0 - 95.0) degree
data size: 1612 weight 0.038320734084533827
GMM Weibull
R square 0.941861346263 0.980038602165
max diff: 0.0561137103165 0.017183048329 speed value: 7.09976891104 2.83990756442 y gmm 0.403563709038
 
105.0 (95.0 - 115.0) degree
data size: 2189 weight 0.0520372747587125
GMM Weibull
R square 0.869476030183 0.913945227493
max diff: 0.0660270942357 0.0529631927168 speed value: 6.66456866377 11.6629951616 y gmm 0.263803878811
 
125.0 (115.0 - 135.0) degree
data size: 3127 weight 0.07433556791708268
GMM Weibull
R square 0.834583788738 0.903990196563
max diff: 0.0732214858202 0.478539566835 speed value: 10.4548361125 14.3753996547 y gmm 0.479361556175
 
145.0 (135.0 - 155.0) degree
data size: 4151 weight 0.09867826748442923
GMM Weibull
R square 0.847060404673 0.961673040545
max diff: 0.128433143769 0.198803039157 speed value: 16.0179147698 13.1055666298 y gmm 0.764122866831
 
165.0 (155.0 - 175.0) degree
data size: 3826 weight 0.09095231303190225
GMM Weibull
R square 0.900166168063 0.937930485331
max diff: 0.0895674609233 0.0795291775011 speed value: 15.6691860875 7.12235731252 y gmm 0.711268921722
 
185.0 (175.0 - 195.0) degree
data size: 1844 weight 0.04383587695526078
GMM Weibull
R square 0.916175482251 0.974081434963
max diff: 0.123821576264 0.0207780879259 speed value: 9.08828198541 9.08828198541 y gmm 0.545375820699
 
205.0 (195.0 - 215.0) degree
data size: 1483 weight 0.03525412447106927
GMM Weibull
R square 0.95224364198 0.953919790751
max diff: 0.0672209963555 0.034515829403 speed value: 6.09334785528 7.31201742633 y gmm 0.481666394069
 
225.0 (215.0 - 235.0) degree
data size: 1833 weight 0.043574383112252174
GMM Weibull
R square 0.967559200329 0.937613578976
max diff: 0.043445370955 0.0478406841563 speed value: 10.1697752138 7.62733141036 y gmm 0.928333532439
 
245.0 (235.0 - 255.0) degree
data size: 2345 weight 0.05574573289592545
GMM Weibull
R square 0.964061410151 0.923305473511
max diff: 0.0571672653154 0.0461293645755 speed value: 10.3367827698 7.75258707735 y gmm 0.913030804761
 
265.0 (255.0 - 275.0) degree
data size: 2847 weight 0.06767936100413635
GMM Weibull
R square 0.963666595641 0.933346681815
max diff: 0.0876441702477 0.0507640620525 speed value: 11.9739883789 10.2634186105 y gmm 0.936959238741
 
285.0 (275.0 - 295.0) degree
data size: 3372 weight 0.08015974896591072
GMM Weibull
R square 0.86274027778 0.886861129627
max diff: 0.138648197852 0.0500365278813 speed value: 13.8404499386 8.65028121162 y gmm 0.953891376974
 
305.0 (295.0 - 315.0) degree
data size: 2376 weight 0.05648267008985879
GMM Weibull
R square 0.934622758199 0.913638487609
max diff: 0.0373581493881 0.0811388284716 speed value: 12.6946775136 3.1736693784 y gmm 0.921196533226
 
325.0 (315.0 - 335.0) degree
data size: 1761 weight 0.04186278704892312
GMM Weibull
R square 0.959304843254 0.945002518944
max diff: 0.0340685488821 0.0401675407046 speed value: 9.98837952382 2.85382272109 y gmm 0.904596657911
 
345.0 (335.0 - 355.0) degree
data size: 2057 weight 0.04889934864260923
GMM Weibull
R square 0.93014066759 0.902467836745
max diff: 0.0423417756096 0.108764686371 speed value: 2.57799728927 3.8669959339 y gmm 0.0724827576223
 
Wall time: 58.7 s
In [85]:
# Summarize per-sector R^2 for both models, weighted by sector sample size.
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
# Keep a consistent y-range across stations (never zoom in past 0.75)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9158637084432527 0.9240186747750685
In [86]:
# Same sectoral summary for the K-S statistic (max CDF difference).
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
# Keep a consistent y-range across stations (never zoom in past 0.25)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07924202878479368 0.10993984897662709
In [87]:
# Compare direction weight with previous figure
# (re-display the direction-distribution figure saved in dir_fig above)
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [88]:
# Pick the sector where the GMM's CDF deviates most from the data.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [89]:
# FRACTION = 1 -> each bootstrap resample has the same size as the sector sample
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [90]:
# Bootstrap the worst sector: resample its observations with replacement and
# overlay the resampled histograms/ECDFs on the fitted curves, to show how much
# of the model-data mismatch is explained by sampling variability.
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
# NOTE(review): `normed=True` is deprecated in matplotlib (density=True on
# versions >= 2.1); kept as-is to avoid changing behavior on the pinned version.
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled (99 bootstrap replicates)
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    # density=True replaces the deprecated normed=True (removed in NumPy 1.24)
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True) 
    count_collection.append(resampled_count)
    
    # NOTE(review): this overwrites the reference `y_ecdf` computed above, so
    # the `diff` printed at the end compares the GMM against the LAST resample,
    # not against the original data -- confirm this is intended.
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
        # attach the legends only once
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope of the bootstrap histograms: per-bin min and max
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
285.0 (275.0 - 295.0) Degree Speed Distribution
0.145021180383 13.5 0.944546684535

(2) Time Variability

In [91]:
# Time variability: compare the fitted curves for the worst sector against
# consecutive 5-year sub-periods of the record.
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
# axes.prop_cycle replaces the deprecated axes.color_cycle rcParam (the
# deprecation warning was visible in this cell's output); by_key()['color']
# yields the same list of color strings.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        # density=True replaces the deprecated normed=True (removed in NumPy 1.24)
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        # label the model curves only once (last period) to avoid duplicate legend entries
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
285.0 (275.0 - 295.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [92]:
# The worst-fitting sector plus its two immediate neighbors.
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [93]:
# Adjacent-sector variability: plot the worst sector together with its two
# neighbors to see whether the misfit is localized or shared across sectors.
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# axes.prop_cycle replaces the deprecated axes.color_cycle rcParam (the
# deprecation warning was visible in this cell's output)
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Look up the pre-computed curves for this sector (angles wrap at 360)
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Highlight the worst sector; draw its neighbors dotted and faded
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # attach the 3-D legend only once
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
285.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [94]:
# Fall back to defaults for any tuning knob the site-specific setup
# cells did not define; each guard only evaluates its default lazily.
try:
    bandwidth
except NameError:
    bandwidth = DEFAULT_BANDWDITH
try:
    FIT_METHOD
except NameError:
    FIT_METHOD = 'square_error'
try:
    KDE_KERNEL
except NameError:
    KDE_KERNEL = 'gaussian'

# Bundle the fitting configuration passed to the resampling/CV helpers.
config = dict(bandwidth=bandwidth,
              fitting_range=FITTING_RANGE,
              fit_limit=fit_limit,
              kde_kernel=KDE_KERNEL)

print(bandwidth, FIT_METHOD)
1.1 square_error

7.1 Variability of the Result

In [95]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.448 -6.512 5.925 7.197 4.601 -0.006
2 0.346 -0.837 -5.014 4.013 3.857 -0.407
3 0.206 4.884 0.220 2.287 3.447 -0.158
GMM Plot Result
0.448036532228 [[-6.51214692  5.92524672]] [ 4.60116993  7.1974108 ] -90.3452493942
0.345704594149 [[-0.83747624 -5.01406905]] [ 3.02645788  4.67124611] -132.207610193
0.206258873622 [[ 4.88443919  0.21986022]] [ 2.23720332  3.47941485] -169.698844626
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.019 0.192 2.510481e-08 0.032 0.199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.171 2.572148e-08 0.032 0.201

weight mean_x mean_y sig_x sig_y corr
1 0.448 -6.492 5.919 7.143 4.554 -0.020
2 0.352 -0.754 -5.051 4.048 3.865 -0.388
3 0.200 4.923 0.215 2.246 3.360 -0.124
GMM Plot Result
0.447890406655 [[-6.49183548  5.9188433 ]] [ 4.55234787  7.14390305] -91.2211731677
0.352069480925 [[-0.75390438 -5.05075567]] [ 3.08951442  4.66684713] -131.609655527
0.20004011242 [[ 4.92297989  0.21498254]] [ 2.2151241   3.38011531] -171.64224864
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.973 0.018 0.167 2.774818e-08 0.033 0.209
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.156 2.589704e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.440 -6.513 5.966 7.221 4.608 -0.022
2 0.350 -0.968 -4.950 4.083 3.955 -0.429
3 0.211 4.833 0.224 2.274 3.388 -0.140
GMM Plot Result
0.439568851141 [[-6.51268134  5.96626057]] [ 4.60626072  7.22265112] -91.36741611
0.349782627497 [[-0.96751755 -4.95017288]] [ 3.03504302  4.80607589] -132.867643351
0.210648521362 [[ 4.83261012  0.22427152]] [ 2.23442957  3.41472455] -170.572543035
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.018 0.163 2.426041e-08 0.031 0.196
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.159 2.609387e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.426 -6.678 6.071 7.061 4.490 0.008
2 0.357 -0.931 -4.981 4.077 3.957 -0.434
3 0.217 4.800 0.318 2.380 3.487 -0.176
GMM Plot Result
0.425999129967 [[-6.6782429   6.07123447]] [ 4.48973393  7.0610181 ] -89.5331177498
0.357240522802 [[-0.93136138 -4.9808192 ]] [ 3.01942591  4.81241654] -133.036309333
0.216760347231 [[ 4.79959528  0.31839297]] [ 2.31341069  3.53173437] -167.873134439
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.016 0.185 2.499153e-08 0.032 0.198
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.181 2.595358e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.449 -6.635 5.927 6.898 4.686 -0.014
2 0.336 -0.880 -5.130 3.993 3.841 -0.385
3 0.216 4.805 0.222 2.342 3.561 -0.085
GMM Plot Result
0.448838959803 [[-6.63542324  5.92697198]] [ 4.68554162  6.89817286] -91.040742888
0.335591813593 [[-0.88014181 -5.12953988]] [ 3.06743402  4.61386351] -132.122309199
0.215569226604 [[ 4.80456246  0.22157181]] [ 2.32702431  3.57066483] -174.453997155
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.143 2.684618e-08 0.033 0.206
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.152 2.594566e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.426 -6.919 6.116 6.730 4.528 0.023
2 0.349 -1.024 -5.026 4.076 3.944 -0.421
3 0.225 4.800 0.282 2.381 3.518 -0.120
GMM Plot Result
0.425730093472 [[-6.91865248  6.11566491]] [ 4.52543823  6.73134375] -88.3560423262
0.348795125808 [[-1.02412473 -5.02648444]] [ 3.04871382  4.78280646] -132.773643064
0.225474780719 [[ 4.79978796  0.28225717]] [ 2.34951612  3.5387729 ] -171.666347327
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.200 2.689420e-08 0.033 0.206
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.179 2.581176e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.444 -6.683 6.134 7.076 4.670 0.019
2 0.344 -0.925 -4.998 4.014 3.913 -0.411
3 0.212 4.866 0.115 2.276 3.417 -0.102
GMM Plot Result
0.444217427964 [[-6.6833258  6.1340159]] [ 4.6680496  7.0769235] -88.7264106078
0.343673686357 [[-0.92495256 -4.99833944]] [ 3.03961175  4.70951313] -133.221907837
0.21210888568 [[ 4.86580319  0.11535039]] [ 2.25544661  3.43130237] -173.162553058
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.020 0.217 2.716025e-08 0.032 0.207
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.023 0.163 2.610465e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.453 -6.441 5.836 7.250 4.630 -0.019
2 0.338 -0.951 -5.036 3.955 3.859 -0.397
3 0.209 4.860 0.122 2.255 3.374 -0.116
GMM Plot Result
0.452749943573 [[-6.44111625  5.83589984]] [ 4.62912287  7.25054816] -91.1489216716
0.338285970052 [[-0.95116244 -5.03557176]] [ 3.03147728  4.62005779] -133.245757596
0.208964086375 [[ 4.86014435  0.1215363 ]] [ 2.22839668  3.39214984] -172.194159101
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.019 0.178 2.423958e-08 0.030 0.196
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.019 0.158 2.606868e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.428 -6.993 6.021 6.758 4.576 0.019
2 0.348 -0.922 -5.024 4.015 3.887 -0.393
3 0.225 4.815 0.304 2.379 3.582 -0.135
GMM Plot Result
0.427558977139 [[-6.99303923  6.02075921]] [ 4.5741216   6.75904299] -88.6117728069
0.34779051675 [[-0.92153899 -5.02358542]] [ 3.07577141  4.66583554] -132.641448334
0.224650506111 [[ 4.8147642   0.30364574]] [ 2.34132134  3.60665577] -171.118643795
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.180 2.735067e-08 0.034 0.208
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.173 2.570319e-08 0.032 0.201

weight mean_x mean_y sig_x sig_y corr
1 0.427 -6.939 6.146 6.844 4.562 0.035
2 0.357 -0.916 -4.921 4.009 3.901 -0.382
3 0.216 4.790 0.416 2.394 3.524 -0.190
GMM Plot Result
0.427161056703 [[-6.93880856  6.14573916]] [ 4.55752255  6.84708938] -87.6213585576
0.356914005507 [[-0.91609106 -4.92052236]] [ 3.10810931  4.65064641] -132.952087976
0.21592493779 [[ 4.79012719  0.41571923]] [ 2.31704503  3.57534652] -167.190069832
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.972 0.018 0.188 2.847131e-08 0.035 0.212
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.169 2.587638e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.442 -6.594 5.902 7.109 4.618 0.004
2 0.344 -0.919 -5.009 4.019 3.861 -0.395
3 0.215 4.869 0.248 2.317 3.459 -0.123
GMM Plot Result
0.441551664992 [[-6.59428416  5.90186732]] [ 4.61833482  7.10917075] -89.7606705115
0.343672167131 [[-0.91878616 -5.00945377]] [ 3.05964739  4.65788142] -132.101046126
0.214776167877 [[ 4.86944205  0.24783553]] [ 2.2854402   3.47964066] -171.691796669
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.152 2.633190e-08 0.032 0.203
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.158 2.577298e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.414 -7.068 6.216 6.661 4.530 0.074
2 0.357 -0.958 -4.965 4.080 3.936 -0.409
3 0.229 4.780 0.448 2.406 3.533 -0.167
GMM Plot Result
0.414075996987 [[-7.06780742  6.21555577]] [ 4.50748138  6.67664413] -84.7305269459
0.356676549647 [[-0.95846895 -4.96477575]] [ 3.07927106  4.75963525] -132.495718834
0.229247453367 [[ 4.77977562  0.4479891 ]] [ 2.34564323  3.57325841] -168.505982285
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.015 0.184 2.664357e-08 0.033 0.205
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.193 2.584240e-08 0.032 0.202
Wall time: 21.3 s

7.2 Cross-validation, to select the number of Gaussian

In [96]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 31549.5 10516.5
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.182558 0.058436 2.435895e-07 0.099267 0.619421 0.765859
1 0.182516 0.058233 2.431249e-07 0.098218 0.618769 0.765971
2 0.180312 0.056443 2.368530e-07 0.099152 0.611256 0.770435
3 0.181892 0.058028 2.411418e-07 0.098179 0.616276 0.768998
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.185844 0.052871 2.382331e-07 0.097883 0.612949 0.770088
1 0.184048 0.054111 2.349332e-07 0.100112 0.608867 0.774488
2 0.191001 0.062170 2.528227e-07 0.097124 0.630015 0.761859
3 0.189034 0.064485 2.518646e-07 0.102481 0.630319 0.754925
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.116961 0.023942 9.006525e-08 0.059317 0.376629 0.913588
1 0.114057 0.025048 8.992753e-08 0.060022 0.376439 0.913816
2 0.115565 0.024748 9.052175e-08 0.061224 0.377714 0.912450
3 0.109330 0.024198 9.052395e-08 0.060999 0.377662 0.912576
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.126982 0.023941 9.574609e-08 0.065502 0.388639 0.907252
1 0.107255 0.026843 9.475828e-08 0.062647 0.386327 0.907778
2 0.145494 0.032475 9.406191e-08 0.059442 0.384803 0.911013
3 0.128568 0.025059 9.432500e-08 0.060161 0.385516 0.910391
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.166170 0.018455 2.588128e-08 0.031903 0.202026 0.974987
1 0.161144 0.016163 2.525301e-08 0.032535 0.199547 0.975585
2 0.208430 0.017181 2.552398e-08 0.032584 0.200465 0.975312
3 0.168793 0.017009 2.611411e-08 0.031862 0.202751 0.975181
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.177436 0.017164 2.983356e-08 0.036180 0.216523 0.971699
1 0.215154 0.023783 3.243144e-08 0.034309 0.225793 0.969288
2 0.150731 0.021950 3.073525e-08 0.033762 0.220303 0.970906
3 0.288367 0.020775 3.044033e-08 0.037229 0.219301 0.969651
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 2.092061 0.020415 1.695521e-08 0.026388 0.163474 0.983628
1 3.886085 0.020336 1.608949e-08 0.025591 0.159185 0.984510
2 3.139501 0.018859 1.568497e-08 0.024835 0.157191 0.984911
3 2.174824 0.019262 1.564160e-08 0.025174 0.157007 0.984975
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 2.416939 0.020222 1.845747e-08 0.026647 0.170446 0.982446
1 2.573307 0.023474 1.944738e-08 0.027711 0.175156 0.981318
2 2.215120 0.020397 2.298371e-08 0.031759 0.190348 0.977907
3 4.603554 0.024197 2.113483e-08 0.029089 0.182413 0.979561
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.159491 0.009697 7.038815e-09 0.016727 0.105321 0.993244
1 0.168774 0.008625 7.455789e-09 0.017688 0.108384 0.992798
2 0.173598 0.008576 7.269189e-09 0.016573 0.106973 0.993004
3 0.016151 0.005008 8.529774e-09 0.019003 0.115970 0.991789
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.199232 0.013811 1.236204e-08 0.022582 0.139521 0.988020
1 0.399635 0.013498 1.069134e-08 0.019669 0.129794 0.989834
2 0.216942 0.013080 1.593612e-08 0.028252 0.158670 0.984731
3 0.021954 0.007244 1.265326e-08 0.021137 0.141045 0.987848
Wall time: 1min
In [97]:
# Aggregate the per-fold goodness-of-fit tables into mean/std summaries,
# then show the means for train and test.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)

for title, scores in (('Train gof mean, std', train_scores_mean),
                      ('Test gof mean, std', test_scores_mean)):
    print(title)
    display(scores)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.181819 0.057785 2.411773e-07 0.098704 0.616430 0.767816
2 0.113978 0.024484 9.025962e-08 0.060391 0.377111 0.913107
3 0.176134 0.017202 2.569310e-08 0.032221 0.201197 0.975266
4 2.823118 0.019718 1.609282e-08 0.025497 0.159214 0.984506
5 0.129503 0.007976 7.573392e-09 0.017498 0.109162 0.992709
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.187482 0.058409 2.444634e-07 0.099400 0.620538 0.765340
2 0.127075 0.027079 9.472282e-08 0.061938 0.386321 0.909109
3 0.207922 0.020918 3.086014e-08 0.035370 0.220480 0.970386
4 2.952230 0.022073 2.050584e-08 0.028801 0.179591 0.980308
5 0.209441 0.011908 1.291069e-08 0.022910 0.142258 0.987609
In [98]:
# Plot train/test goodness-of-fit (mean +/- std band) against the number of
# Gaussian components, one figure per metric.
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0;
# extract the colour list from 'axes.prop_cycle' instead (same colours).
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
# Raw strings avoid invalid-escape warnings in the LaTeX labels
# (the rendered text is unchanged).
for column, column_name in zip(['R_square','K_S','Chi_square'],[r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [99]:
# Side-by-side 2-D density panels: KDE estimate (left) vs fitted GMM (right).
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
ax2 = fig.add_subplot(1,2,2) 

for panel_ax, density_Z in ((ax1, kde_Z), (ax2, pdf_Z)):
    plot_2d_prob_density(X, Y, density_Z, ax=panel_ax,
                         xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
    panel_ax.grid(False)

# The panels share a y-scale, so hide the right panel's duplicate axis.
ax2.get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures from earlier sections in one place.
fitting_figures = [fig_hist, fig_kde, fig_em, fig_gmm]
variability_figures = [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
                       fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull]
for fig in fitting_figures + variability_figures:
    display(fig)
In [ ]:
import time

# Save the notebook first so the exported HTML reflects the latest run;
# the short sleep gives the asynchronous save time to complete.
save_notebook()
time.sleep(3)

location_name = get_location_name(file_path)
print(location_name)

# Export this notebook to an HTML report named after the station location.
current_file = 'GMM.ipynb'
output_file = './output_HTML/' + location_name + '.html'
output_HTML(current_file, output_file)