Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?" http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
# Notebook environment setup: inline plotting plus auto-reload of local modules.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# NOTE(review): star imports pull the whole analysis namespace (plt, mpl, np, pd,
# arange, folium, sns, neighbors, plot helpers, ...) into the notebook —
# presumably defined in import_file.py / parallel_helper.py; verify there.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Tight axes rounded to whole numbers, and visible edges on histogram bars.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4 
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# Active dataset: offshore NCDC buoy near Boston, with hand-picked KDE bandwidth.
# The commented lines in this cell are alternative stations kept for reference.
file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
In [3]:
# Load the raw data into a DataFrame with the canonical columns
# [date, HrMn, type, dir, speed, wind_type]. Each branch also sets the
# source-specific flags consumed by later cells (integer_data,
# convert_to_knot, knot_unit, cartesian).
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC: keep HrMn as a zero-padded string for later datetime parsing.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    # NCDC values are integer-coded; the unit is auto-detected in a later cell.
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    # Reanalysis data (e.g. ECMWF): wind given as (U, V) components.
    # Components are negated — presumably to switch between the "blowing to"
    # and "coming from" conventions; confirm against the data source docs.
    # Timestamps are hours since 1979-01-01.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    # Flag that 'dir' is already in polar convention, not windrose.
    cartesian = True
In [4]:
# Quick look at the parsed frame (rich display; last expression in the cell).
df
Out[4]:
date HrMn type dir speed wind_type
0 19850312 1400 FM-18 130 13.0 N
1 19850312 1500 FM-18 130 12.0 N
2 19850312 1600 FM-18 130 12.0 N
3 19850312 1700 FM-18 160 7.0 N
4 19850312 1800 FM-18 170 8.0 N
5 19850312 1900 FM-18 190 7.0 N
6 19850312 2000 FM-18 210 6.0 N
7 19850312 2100 FM-18 190 4.0 N
8 19850312 2200 FM-18 170 6.0 N
9 19850312 2300 FM-18 270 8.0 N
10 19850313 0000 FM-18 270 7.0 N
11 19850313 0100 FM-18 240 6.0 N
12 19850313 0200 FM-18 230 6.0 N
13 19850313 0300 FM-18 220 4.0 N
14 19850313 0400 FM-18 230 6.0 N
15 19850313 0500 FM-18 240 6.0 N
16 19850313 0600 FM-18 250 6.0 N
17 19850313 0700 FM-18 240 8.0 N
18 19850313 0800 FM-18 260 10.0 N
19 19850313 0900 FM-18 240 10.0 N
20 19850313 1000 FM-18 260 10.0 N
21 19850313 1100 FM-18 250 10.0 N
22 19850313 1200 FM-18 260 11.0 N
23 19850313 1300 FM-18 260 10.0 N
24 19850313 1400 FM-18 290 11.0 N
25 19850313 1500 FM-18 290 12.0 N
26 19850313 1600 FM-18 310 13.0 N
27 19850313 1800 FM-18 300 11.0 N
28 19850313 1900 FM-18 290 11.0 N
29 19850313 2000 FM-18 290 12.0 N
... ... ... ... ... ... ...
235836 20170331 1800 FM-13 110 2.0 N
235837 20170331 1900 FM-13 999 999.9 C
235838 20170331 2000 FM-13 80 4.0 N
235839 20170331 2100 FM-13 90 4.0 N
235840 20170331 2200 FM-13 70 6.0 N
235841 20170331 2300 FM-13 80 6.0 N
235842 20170401 0000 FM-13 90 8.0 N
235843 20170401 0100 FM-13 80 8.0 N
235844 20170401 0200 FM-13 80 10.0 N
235845 20170401 0300 FM-13 90 11.0 N
235846 20170401 0400 FM-13 80 11.0 N
235847 20170401 0500 FM-13 80 11.0 N
235848 20170401 0600 FM-13 80 11.0 N
235849 20170401 0700 FM-13 80 12.0 N
235850 20170401 0800 FM-13 80 12.0 N
235851 20170401 0900 FM-13 70 11.0 N
235852 20170401 1000 FM-13 70 12.0 N
235853 20170401 1100 FM-13 70 12.0 N
235854 20170401 1200 FM-13 60 13.0 N
235855 20170401 1300 FM-13 50 12.0 N
235856 20170401 1400 FM-13 50 12.0 N
235857 20170401 1500 FM-13 40 13.0 N
235858 20170401 1600 FM-13 40 14.0 N
235859 20170401 1700 FM-13 30 14.0 N
235860 20170401 1800 FM-13 30 14.0 N
235861 20170401 1900 FM-13 30 14.0 N
235862 20170401 2000 FM-13 30 14.0 N
235863 20170401 2100 FM-13 30 15.0 N
235864 20170401 2200 FM-13 30 14.0 N
235865 20170401 2300 FM-13 10 14.0 N

235866 rows × 6 columns

In [5]:
# For NCDC stations, look up the coordinates and show the station on a map.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
42.35 -70.69
In [6]:
# Build a DatetimeIndex from date + HrMn, then drop invalid rows and restrict
# to 1970-2016. Note: 999 is the missing-direction sentinel, so "dir <= 999"
# deliberately keeps it; speed 999.9 (missing) is removed by "speed < 100".
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
# Marginal distributions of speed and direction (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360] => [0,350]: fold 360 onto 0, leaving the 999 sentinel untouched.
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates to polar coordinates
# (windrose: 0 = north, clockwise; polar: 0 = east, counter-clockwise).
if 'cartesian' in globals():
    # Reanalysis data: 'dir' is already polar; derive the windrose column.
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 2.334610e+05 233461.000000 233461.000000 233461.000000 233461.000000
mean 2.000936e+07 1147.748875 196.144380 6.089394 207.081881
std 9.282218e+04 692.306640 136.124931 3.331242 139.864066
min 1.985031e+07 0.000000 0.000000 0.000000 0.000000
25% 1.993041e+07 500.000000 120.000000 4.000000 120.000000
50% 2.000113e+07 1100.000000 190.000000 6.000000 210.000000
75% 2.009073e+07 1700.000000 260.000000 8.000000 280.000000
max 2.016123e+07 2300.000000 999.000000 26.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x4ff23c8>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# Unit detection: knots converted to m/s leave many non-integer values, so a
# large share of fractional parts >= 0.2 suggests the file is in m/s and
# should be converted to knots (1 m/s = 1.943845 knot).
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
False
In [10]:
# Axis-label unit suffixes used by the later plots.
dir_unit_text = ' (degree)'
speed_unit_text = ' (knot)' if knot_unit else ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Pick the dominant report type from the recent data (after year 2000) and
# keep only records of that type, so mixed report formats do not skew the fit.
sample_type = df.query('date > 20000000')['type']
type_counts = sample_type.value_counts()
type_counts.plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

# idxmax() returns the label of the most frequent type; Series.argmax's
# label-returning behavior is deprecated, so use the explicit spelling.
report_type_most_used = type_counts.idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Compare sampling-time coverage before/after the record's midpoint year to
# spot changes in the observation schedule.
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only the dominant minute-of-hour sampling slot (e.g. :00) so records
# are evenly spaced; a slot qualifies with > 2000 records since 2000, and the
# first (most frequent) qualifying slot is used.
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xd66f5f8>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
# Artefact check: integer datasets should report directions in multiples of 10
# degrees; display and drop any that are not (the 999 sentinel is kept).
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# Sudden-increase artefacts: speed change relative to both neighbours.
# 'incre' is the jump from the previous record, 'incre_reverse' from the next;
# boundary NaNs are treated as no change.
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-11 18:00:00 19921211 1800 FM-18 10 22.0 N 80 2.0 2.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-13 02:00:00 19921213 200 FM-18 40 22.0 N 50 1.0 3.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd5894a8>
In [16]:
# A record is a spike if it exceeds the threshold jump relative to BOTH
# neighbours; the kept set is the De Morgan complement of that condition.
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 1
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1986-12-19 11:00:00 19861219 1100 FM-18 70 22.0 N 20 2.0 1.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-12 16:00:00 19921212 1600 FM-18 50 22.0 N 40 0.0 3.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly,

e.g. the sectors are [0, 20, 50, ...]; the angles need to be redistributed onto the regular 22.5-degree grid, e.g. [0, 22.5, 45, ...]

In [17]:
# Distribution of recorded directions; 999 is the missing-direction sentinel.
display(df['dir'].value_counts().sort_index())

# Directions actually in use: > 30 records, excluding the 999 sentinel.
# The original built the >30 mask from the UNfiltered value_counts (which
# contains the 999 label) and applied it to the 999-excluded series — an
# unalignable boolean mask; build the mask from the same series instead.
valid_dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = valid_dir_counts[valid_dir_counts > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      3268
10     2973
20     3265
30     3460
40     3945
50     4485
60     4641
70     4754
80     4396
90     4144
100    3936
110    4522
120    5801
130    6521
140    7334
150    7647
160    7957
170    8295
180    8399
190    6724
200    6296
210    6639
220    6665
230    7142
240    7168
250    7362
260    6459
270    5605
280    4961
290    5359
300    6069
310    5671
320    4648
330    4157
340    3575
350    3125
999    3636
Name: dir, dtype: int64
36 10.0
In [18]:
# Redistribute irregularly recorded sectors onto a regular grid (project helper).
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# Decide whether calm (0-speed) records are over-represented (project helper,
# judged on data since 2005); if so, drop them before fitting.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0104086353123
In [20]:
# Speeds of the dir == 999 records (per the printed output, all are calms),
# then assign them directions via the project helper.
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0.0    3636
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction histogram bin edges: 'even' centres bins on the recorded sectors
# (-5..355), 'round_up' aligns edges to 0..360.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# Same first-half / second-half comparison for direction.
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Show, then drop, any rows whose direction became NaN in earlier processing.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
# (project helper; renders one panel per multi-year period).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1986 - 1990
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1991 - 1995
1996 - 2000
2001 - 2004
2006 - 2010
2011 - 2013
In [25]:
# Annual and monthly mean speed, anchored at 0 so trends are comparable.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 10.0)
In [26]:
%%time
# Per-year speed/dir histograms overlaid on the all-years histogram, to spot
# years whose distribution deviates from the pooled one.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2  # shared y-limit so the panels are comparable
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:  # skip years with too few records
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 16.8 s
In [27]:
# Stationarity check: R-squared between each 3-year window's histogram and the
# all-years histogram, per column; values near 1 indicate a stable distribution.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # 3-year centred window [year-1, year+1].
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            # R^2 of the window's density against the pooled density.
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # Clamp the lower y-limit into [0, 0.85] so small fluctuations stay visible.
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread integer-coded directions uniformly within their sector so the KDE
# does not see artificial spikes (project helper).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Likewise spread integer-coded speeds within their 1-unit bin; redistribute
# downward when calms were deleted, upward otherwise (e.g. 0 -> [0,1]).
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# Project (speed, dir) onto Cartesian wind components.
# Notes: many zero speeds produce a dense dot at the origin, and matplotlib
# scatter plots of this many points render slowly.
angle_rad = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(angle_rad)
df['y'] = df['speed'] * sin(angle_rad)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

# Keep the full record for later across-year comparison, then restrict the
# working frame to 2011-2015.
df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-18
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 1.433600e+04 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000
mean 2.011598e+07 1145.619420 186.837329 6.386756 203.242397 -1.347841 -0.019738
std 6.716762e+03 692.151673 92.232399 3.292966 123.793493 4.939837 5.041621
min 2.011010e+07 0.000000 -4.928937 0.012049 0.000000 -18.387649 -18.251777
25% 2.011053e+07 500.000000 121.160226 3.963283 130.000000 -4.628845 -3.799711
50% 2.011103e+07 1100.000000 191.464250 5.739531 210.000000 -1.276468 -0.645959
75% 2.012100e+07 1700.000000 263.105775 8.445066 280.000000 2.047118 3.484122
max 2.013043e+07 2300.000000 354.984408 21.850382 999.000000 21.712824 20.818823
In [33]:
# Speed time series of the selected 2011-2015 subset.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1af366d8>
In [34]:
# Accumulation by month: record counts per month, to spot coverage gaps.
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0xd62e208>
In [35]:
# Overall wind rose (windrose convention: 90 degree is east).
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# For very large selections, downsample to 500k rows to keep later steps fast;
# the overlaid histograms check that resampling preserves the direction shape.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Weibull fit of the speed marginal (project helper), compared with the data:
# (1) histogram vs fitted PDF, (2) Weibull plot ln(-ln(1-P)) vs ln(V), which
# is linear for a true Weibull sample. The divide-by-zero warnings in the
# output come from log(0) at the first grid point.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Scatter of the Cartesian components; equal aspect keeps directions undistorted.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Bin width for the per-direction views: 22.5 degrees for 16-sector data,
# 10 degrees otherwise.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
%%time
# One speed histogram per direction sector, all sharing the same axis ranges
# so the sector shapes are directly comparable.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    # Sector is centred on `angle`, spanning +/- incre/2.
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 7.2 s

2.3 Overview by Month

In [41]:
%%time
# Monthly wind roses (speeds >= 1 only) to inspect seasonality.
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 19.3 s
In [42]:
# Final sanity check of the selected frame before fitting.
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 1.433600e+04 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000
mean 2.011598e+07 1145.619420 186.837329 6.386756 203.242397 -1.347841 -0.019738
std 6.716762e+03 692.151673 92.232399 3.292966 123.793493 4.939837 5.041621
min 2.011010e+07 0.000000 -4.928937 0.012049 0.000000 -18.387649 -18.251777
25% 2.011053e+07 500.000000 121.160226 3.963283 130.000000 -4.628845 -3.799711
50% 2.011103e+07 1100.000000 191.464250 5.739531 210.000000 -1.276468 -0.645959
75% 2.012100e+07 1700.000000 263.105775 8.445066 280.000000 2.047118 3.484122
max 2.013043e+07 2300.000000 354.984408 21.850382 999.000000 21.712824 20.818823

3. Create input data and configuration

In [43]:
# Model inputs and configuration for the GMM/KDE fits.
SPEED_SET = array(list(zip(df.x, df.y)))  # (N, 2) array of (x, y) samples
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): name is misspelled ("BANDWDITH") but referenced by later
# cells, so it is kept unchanged for compatibility.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
# Square grid of integer (x, y) evaluation points covering the central 95%
# of observed speeds.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

# Cartesian product of the axis with itself, row-major like the original loop.
FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4
   5   6   7   8   9  10  11  12  13]
In [45]:
# Plot axis covers the central 95% of observed speeds.
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
# KDE configuration: Gaussian kernel on the (x, y) samples.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
# Choose the KDE bandwidth: keep a preset per-station value when the
# data-selection cell defined one, otherwise cross-validate over a grid.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20;
    # modern environments must use sklearn.model_selection.GridSearchCV.
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    # (candidate range depends on the speed unit)
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    # so cap the search sample at 50,000 rows drawn with replacement
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    # 4-fold CV over the candidate bandwidths, parallelised across all cores
    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.9
Wall time: 0 ns
In [48]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.9 729
[  1.95719167e-10   4.81077136e-09   4.10618731e-08   1.60513691e-07
   1.06274942e-06]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [51]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 8.75 s
In [52]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [53]:
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))

    gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
    gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 0.068082 0.040320 1.353014e-07 0.041649 0.272139 0.954268
(1986, 1996) 0.039296 0.034205 8.345238e-08 0.039417 0.208740 0.966743
Wall time: 39.7 s

4.3 Univariate GOF Limit

In [54]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    """Goodness-of-fit of one historical window against the reference distributions.

    Parameters
    ----------
    df_all_years : frame with a datetime index and 'speed' / 'dir' columns
    start_year, end_year : inclusive year bounds of the window to score
    density : reference speed histogram (density=True) over bins ``x``
    y_ecdf : reference speed ECDF evaluated at ``x``
    x : speed bin edges / evaluation grid
    density_dir : reference direction histogram over 10-degree sectors

    Returns
    -------
    dict with the window's start year, speed R^2, speed K-S statistic and
    direction R^2.
    """
    window = df_all_years[str(start_year):str(end_year)]
    speeds = window['speed']

    # 1. Speed: PDF agreement (R^2) against the reference histogram
    hist_window, _ = np.histogram(speeds, bins=x, density=True)
    r_square = sector_r_square(density, hist_window)

    # Speed: CDF agreement (Kolmogorov-Smirnov statistic)
    ecdf_window = sm.distributions.ECDF(speeds)(x)
    k_s = np.abs(y_ecdf - ecdf_window).max()

    # 2. Direction: PDF agreement over 10-degree sectors
    hist_dir_window, _ = dir_hist(window['dir'], bins=arange(-5, 370, 10), density=True)
    r_square_dir = sector_r_square(density_dir, hist_dir_window)

    return {'year': start_year, 'r_square': r_square,
            'k_s': k_s, 'r_square_dir': r_square_dir}
In [55]:
%%time
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]

    gofs = pd.DataFrame(gofs)
    gofs.set_index(['year'], inplace=True)    
    if len(gofs)>0:
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.029320 0.991129 0.646934
(1992, 2002) 0.016814 0.995331 0.755758
(1986, 1996) 0.019757 0.987923 0.795987
Wall time: 1.09 s

5. GMM by Expectation-maximization

In [56]:
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [57]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.488 -0.941 -3.423 3.466 2.914 0.079
2 0.275 -5.922 3.699 3.807 3.813 -0.004
3 0.237 3.139 2.671 4.080 5.075 -0.077
In [58]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.487999934433 [[-0.94113898 -3.4233064 ]] [ 2.88405607  3.49094881] -77.7991458548
0.275423082355 [[-5.92249578  3.69926155]] [ 3.80286434  3.8174506 ] -145.984338421
0.236576983212 [[ 3.13905024  2.67132162]] [ 4.04627448  5.10148088] -170.358726403
In [59]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the EM-fitted GMM density.

    Both ``kde`` and ``clf`` (notebook globals) return log-density from
    ``score_samples``, so each is exponentiated before subtracting.
    """
    return exp(kde.score_samples(points)) - exp(clf.score_samples(points))

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [60]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[60]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.960 0.018 0.037 1.184629e-07 0.039 0.255
In [61]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [62]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [63]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[63]:
     fun: -16.577688727268132
     jac: array([  2.22098351e-01,   2.38418579e-07,   4.76837158e-07,
        -7.15255737e-07,   4.76837158e-07,  -1.19209290e-06,
         2.22084045e-01,  -4.76837158e-07,  -2.38418579e-07,
         4.76837158e-07,   2.38418579e-07,   4.76837158e-07,
         2.22091913e-01,   2.38418579e-07,   0.00000000e+00,
        -2.38418579e-07,  -4.76837158e-07,  -7.15255737e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 910
     nit: 45
    njev: 45
  status: 0
 success: True
       x: array([ 0.25261235, -0.03895393, -3.79043303,  2.95167123,  2.17658974,
       -0.19074757,  0.19330258,  3.50739092,  2.01772813,  2.98081238,
        4.01809797, -0.05323661,  0.55408508, -4.0697654 ,  0.84910556,
        4.29550799,  5.84886855, -0.23105052])

6.1 GMM Result

In [64]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[64]:
weight mean_x mean_y sig_x sig_y corr
1 0.554 -4.070 0.849 4.296 5.849 -0.231
2 0.253 -0.039 -3.790 2.952 2.177 -0.191
3 0.193 3.507 2.018 2.981 4.018 -0.053
In [65]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.554085076293 [[-4.0697654   0.84910556]] [ 4.06741832  6.00972216] -161.809359511
0.252612345281 [[-0.03895393 -3.79043303]] [ 2.09525434  3.00995269] -105.82936214
0.193302578427 [[ 3.50739092  2.01772813]] [ 2.9714754   4.02500779] -175.018602291

6.2 Bivariate Goodness-of-fit statistics

In [66]:
gof_df(gmm_pdf_result, kde_result)
Out[66]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.100 6.315406e-08 0.028 0.186
In [67]:
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[67]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 0.068082 0.040320 1.353014e-07 0.041649 0.272139 0.954268
(1986, 1996) 0.039296 0.034205 8.345238e-08 0.039417 0.208740 0.966743
In [68]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the optimized GMM density.

    Re-defines the earlier helper of the same name: here the GMM side comes
    from ``mixed_model_pdf`` (already a plain density), while ``kde`` still
    returns log-density and must be exponentiated.
    """
    return exp(kde.score_samples(points)) - mixed_model_pdf(points)

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [69]:
def f(V, theta):
    """Joint (speed, direction) density of the optimized GMM in polar form.

    Evaluates ``mixed_model_pdf`` at the Cartesian point (V cos(theta),
    V sin(theta)) and multiplies by the Jacobian V of the polar transform.
    """
    cartesian_point = [[V * cos(theta), V * sin(theta)]]
    return V * mixed_model_pdf(cartesian_point)

def f_em(V, theta):
    """Same polar density as ``f`` but for the EM-fitted mixture."""
    cartesian_point = [[V * cos(theta), V * sin(theta)]]
    return V * mixed_model_pdf_em(cartesian_point)
In [70]:
%%time
x = arange(0, max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)
Wall time: 2.7 s
In [71]:
%%time
# Calculate Speed Distribution
# 1. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.97575614820879186, 0.9563360944073418, 0.96058276832349465)
Wall time: 10.3 s
In [72]:
%%time
# CDF of the fitted GMM: integrate the polar density f over speed [0, x_val]
# and all directions, for each grid speed
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

# Weibull plot (double-log axes): a true Weibull CDF is a straight line here.
# The x=0 grid point makes log() emit benign divide-by-zero RuntimeWarnings.
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

# Largest CDF discrepancy (K-S style) and the speed at which it occurs
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0274923425721 0.0362219449501
6.0 6.0
Wall time: 9.32 s
In [73]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[73]:
0.93265285851466229
In [74]:
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[74]:
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.029320 0.991129 0.646934
(1992, 2002) 0.016814 0.995331 0.755758
(1986, 1996) 0.019757 0.987923 0.795987

6.4 Sectoral Comparison

In [75]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.887304747595
Wall time: 9.68 s
In [76]:
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [77]:
# Calculate Speed Distribution
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Sector-by-sector comparison of observed speeds vs GMM and Weibull fits.

    For every angular sector of width ``incre`` degrees: plot the observed
    speed histogram, the GMM density integrated over the sector and a
    per-sector Weibull fit, then record PDF (R^2) and CDF (max |diff|, K-S
    style) goodness-of-fit statistics.

    Parameters
    ----------
    df : frame with 'speed' and direction columns used by the sector helpers
    original_incre : sector width (degrees) of the raw directional data
    incre : rebinned sector width (degrees) used for the comparison

    Returns
    -------
    list of dict, one per sector, holding the fitted curves and GOF metrics.
    """
    # Offset the first sector centre so rebinned sectors stay aligned with
    # the raw recording sectors
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram, so every sector shares one y-range
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF by integrating f over this angular sector
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        # density=True replaces the deprecated (and later removed) normed=True;
        # identical here because the bins are equal-width
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [78]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 360 weight 0.025111607142857144
GMM Weibull
R square 0.902579793011 0.968356866064
max diff: 0.105773546966 0.157829814936 speed value: 8.05014063662 3.45006027284 y gmm 0.858551324743
 
25.0 (15.0 - 35.0) degree
data size: 503 weight 0.03508649553571429
GMM Weibull
R square 0.942724153412 0.925006461162
max diff: 0.0535962776464 0.0558759380225 speed value: 5.31255678432 6.37506814119 y gmm 0.47904359375
 
45.0 (35.0 - 55.0) degree
data size: 544 weight 0.03794642857142857
GMM Weibull
R square 0.901442899585 0.906703024753
max diff: 0.0667149385202 0.0425808445794 speed value: 9.98370068374 6.98859047862 y gmm 0.912303173814
 
65.0 (55.0 - 75.0) degree
data size: 632 weight 0.04408482142857143
GMM Weibull
R square 0.789346460115 0.867882807402
max diff: 0.124642140026 0.0506535875886 speed value: 8.82110242745 5.51318901716 y gmm 0.805021886862
 
85.0 (75.0 - 95.0) degree
data size: 519 weight 0.03620256696428571
GMM Weibull
R square 0.897151335594 0.875340154892
max diff: 0.0608205168614 0.040770753395 speed value: 9.93517483757 7.727358207 y gmm 0.848874466765
 
105.0 (95.0 - 115.0) degree
data size: 604 weight 0.04213169642857143
GMM Weibull
R square 0.86334450936 0.974743635334
max diff: 0.118124369522 0.0355693531617 speed value: 8.22002748165 8.22002748165 y gmm 0.583862385445
 
125.0 (115.0 - 135.0) degree
data size: 995 weight 0.06940569196428571
GMM Weibull
R square 0.851113804797 0.912438187433
max diff: 0.105401641324 0.0904046603706 speed value: 10.4421821338 5.22109106689 y gmm 0.669472730536
 
145.0 (135.0 - 155.0) degree
data size: 1055 weight 0.07359095982142858
GMM Weibull
R square 0.866460832279 0.882646012079
max diff: 0.0430832314305 0.193317739741 speed value: 9.28590979785 8.25414204253 y gmm 0.60990787598
 
165.0 (155.0 - 175.0) degree
data size: 1083 weight 0.07554408482142858
GMM Weibull
R square 0.798474715408 0.815873304945
max diff: 0.0814506436858 0.12614599958 speed value: 9.13460327627 5.07477959793 y gmm 0.687175482098
 
185.0 (175.0 - 195.0) degree
data size: 908 weight 0.06333705357142858
GMM Weibull
R square 0.766940407381 0.867054191264
max diff: 0.121361467917 0.0667080031426 speed value: 5.62862115124 5.62862115124 y gmm 0.403968928559
 
205.0 (195.0 - 215.0) degree
data size: 925 weight 0.06452287946428571
GMM Weibull
R square 0.926849014682 0.938131118339
max diff: 0.070810307174 0.038958968976 speed value: 6.72434885241 5.88380524586 y gmm 0.601622125258
 
225.0 (215.0 - 235.0) degree
data size: 991 weight 0.06912667410714286
GMM Weibull
R square 0.946571883639 0.912377325191
max diff: 0.0285677437937 0.0656107240297 speed value: 11.0293047993 5.88229589298 y gmm 0.931068986781
 
245.0 (235.0 - 255.0) degree
data size: 1114 weight 0.07770647321428571
GMM Weibull
R square 0.97113132378 0.963828471043
max diff: 0.0370613446688 0.0496583861191 speed value: 6.76963393816 5.80254337557 y gmm 0.677097251312
 
265.0 (255.0 - 275.0) degree
data size: 1023 weight 0.07135881696428571
GMM Weibull
R square 0.951559353282 0.960042910043
max diff: 0.100780719048 0.0445463257222 speed value: 5.85442274506 5.85442274506 y gmm 0.581526221323
 
285.0 (275.0 - 295.0) degree
data size: 1083 weight 0.07554408482142858
GMM Weibull
R square 0.94166409694 0.936097450297
max diff: 0.0543214880892 0.0367859923407 speed value: 7.31891660078 5.48918745059 y gmm 0.782243608864
 
305.0 (295.0 - 315.0) degree
data size: 821 weight 0.05726841517857143
GMM Weibull
R square 0.911582859067 0.92247603126
max diff: 0.0798274627916 0.0486281569292 speed value: 4.9815256977 2.98891541862 y gmm 0.454886300911
 
325.0 (315.0 - 335.0) degree
data size: 563 weight 0.039271763392857144
GMM Weibull
R square 0.893260689285 0.922879635547
max diff: 0.0607137503304 0.0786369966152 speed value: 5.15843277151 2.0633731086 y gmm 0.520103301179
 
345.0 (335.0 - 355.0) degree
data size: 490 weight 0.0341796875
GMM Weibull
R square 0.830847051303 0.903364657627
max diff: 0.099423295004 0.141255130704 speed value: 7.77492670939 2.59164223646 y gmm 0.858606968473
 
Wall time: 42.7 s
In [79]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.8883756475065896 0.9125285046516485
In [80]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07579081050011181 0.07521517772982446
In [81]:
# Compare direction weight with previous figure
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [82]:
# Sector with the worst GMM CDF fit, for the follow-up investigation below.
# .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and removed
# in pandas 1.0 (same behavior here: row by label from idxmax, column by name).
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax(), 'direction']
incre = rebinned_angle
In [83]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [84]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled: bootstrap the sector's speeds (sampling with
# replacement) to visualise sampling variability around the empirical
# distribution
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)
    # density=True replaces the deprecated (and later removed) normed=True;
    # identical here because the bins are equal-width
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True)
    count_collection.append(resampled_count)

    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x)
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1:
        # Configure the axes only once to avoid 99 duplicate legend entries
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
65.0 (55.0 - 75.0) Degree Speed Distribution
0.121064810361 8.5 0.777710379981

(2) Time Variability

In [85]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        # density=True replaces the deprecated (and later removed) normed=True;
        # identical here because the bins are equal-width
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
65.0 (55.0 - 75.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [86]:
# Sector width of the re-binned direction histogram; bracket the worst-fitting
# sector (max_diff_angle) with its two immediate neighbours.
incre = rebinned_angle
angle_group = [max_diff_angle + offset for offset in (-incre, 0, incre)]
In [87]:
# Compare the GMM and Weibull fits of the worst sector against its two
# neighbouring sectors: 3D histogram (ax1), scaled PDFs (ax2/ax3).
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# Fix: 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0
# (see the UserWarning in the captured output); build the same iterator of
# colors from 'axes.prop_cycle' instead.
prop_cycle = iter([c['color'] for c in mpl.rcParams['axes.prop_cycle']])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Fitted curves for this sector; stored directions are modulo 360.
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Solid line for the sector of interest, dotted for its neighbours.
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    # Scale the densities by the sector sample size so the panels show counts.
    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # Attach the 3D legend only once; the loop replots the same labels.
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
# Shared y-limit so the GMM (ax2) and Weibull (ax3) panels are comparable.
# NOTE(review): this mixes ax1 (3D axes) with ax3 — confirm ax2 was not intended.
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
65.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [88]:
# Fall back to module-level defaults for any tuning knob the station-specific
# setup cell (section 1.2) did not define.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
# Bundle the fitting configuration passed to the (re)fitting helpers below.
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

# Echo the effective settings so the rendered notebook records them.
print(bandwidth, FIT_METHOD)
0.9 square_error

7.1 Variability of the Result

In [89]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.585 -3.789 0.731 4.285 5.788 -0.256
2 0.224 -0.038 -3.756 2.881 2.131 -0.168
3 0.191 3.595 2.000 2.942 4.245 -0.052
GMM Plot Result
0.585275056185 [[-3.78949846  0.73107592]] [ 4.00711994  5.98336208] -160.032475744
0.223577885506 [[-0.03806631 -3.75617764]] [ 2.06750037  2.9268044 ] -104.393756209
0.191147058308 [[ 3.59499622  1.99978871]] [ 2.93453648  4.24989194] -176.014570636
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.012 0.075 6.515037e-08 0.030 0.189
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.010 0.089 6.669950e-08 0.029 0.191

weight mean_x mean_y sig_x sig_y corr
1 0.537 -4.156 0.753 4.267 5.769 -0.247
2 0.268 0.143 -3.806 3.060 2.174 -0.051
3 0.195 3.445 2.513 3.352 3.819 -0.125
GMM Plot Result
0.537041072458 [[-4.15564859  0.75275112]] [ 4.00734022  5.95189801] -160.555151707
0.267701276031 [[ 0.14321608 -3.80570361]] [ 2.16812771  3.06377668] -94.1761757919
0.195257651511 [[ 3.44475921  2.51284591]] [ 3.25463723  3.90236337] -158.133239245
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.010 0.055 6.105985e-08 0.029 0.183
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.011 0.077 6.706755e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.564 -4.139 0.880 4.296 5.819 -0.210
2 0.236 0.034 -3.915 2.750 2.174 -0.218
3 0.200 3.622 1.687 2.900 4.194 0.003
GMM Plot Result
0.564128500156 [[-4.13903841  0.88032532]] [ 4.10288116  5.95691834] -162.868244903
0.235983591339 [[ 0.03414763 -3.91488862]] [ 2.05411414  2.84029568] -111.282581331
0.199887908505 [[ 3.6224492   1.68660327]] [ 2.89984854  4.1938875 ] 179.778996038
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.012 0.116 7.381730e-08 0.030 0.201
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.089 6.745737e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.594 -3.736 0.764 4.462 5.869 -0.213
2 0.231 -0.010 -3.682 2.859 2.132 -0.216
3 0.175 3.670 1.848 2.728 4.156 0.007
GMM Plot Result
0.593690234736 [[-3.73636951  0.76437452]] [ 4.24515525  6.02801585] -161.252999148
0.231194610939 [[-0.01007175 -3.68155491]] [ 2.0290204   2.93234142] -107.985229282
0.175115154324 [[ 3.67041823  1.84767326]] [ 2.72746032  4.15624971] 179.541700979
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.147 6.225226e-08 0.028 0.185
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.015 0.116 6.642564e-08 0.029 0.191

weight mean_x mean_y sig_x sig_y corr
1 0.510 -4.429 0.865 4.169 5.885 -0.292
2 0.285 0.037 -3.726 3.136 2.325 -0.207
3 0.206 3.390 2.281 3.271 3.833 -0.193
GMM Plot Result
0.509575982233 [[-4.42927022  0.86546725]] [ 3.84678959  6.10050811] -160.143801801
0.284545725378 [[ 0.03683799 -3.72553083]] [ 2.22214902  3.20916734] -107.136498432
0.205878292389 [[ 3.39045887  2.28097379]] [ 3.09135204  3.97924038] -154.772097204
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.225 6.737818e-08 0.031 0.192
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.013 0.140 6.700925e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.549 -4.046 0.865 4.313 5.833 -0.247
2 0.252 -0.144 -3.817 2.982 2.162 -0.183
3 0.200 3.411 1.682 3.101 4.025 0.004
GMM Plot Result
0.54853149273 [[-4.04646199  0.86507174]] [ 4.05039036  6.01803666] -160.561596261
0.251739494753 [[-0.14445275 -3.81673916]] [ 2.08947138  3.03347593] -104.587030638
0.199729012517 [[ 3.41146644  1.68183065]] [ 3.10119173  4.02491008] 179.593085398
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.011 0.074 6.938954e-08 0.030 0.195
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.016 0.081 6.491990e-08 0.029 0.189

weight mean_x mean_y sig_x sig_y corr
1 0.693 -2.155 1.319 5.734 5.382 -0.123
2 0.236 -0.934 -3.736 2.961 2.278 -0.291
3 0.071 3.005 -1.284 1.869 3.717 0.606
GMM Plot Result
0.69273027989 [[-2.15545172  1.31873141]] [ 5.16193858  5.933189  ] -121.391656718
0.236194363057 [[-0.93380271 -3.73578383]] [ 2.07953072  3.10389244] -113.795960664
0.0710753570531 [[ 3.00461387 -1.28411397]] [ 1.41240582  3.91292851] 160.400701903
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.021 0.041 5.755669e-08 0.028 0.178
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.023 0.040 6.009261e-08 0.028 0.182

weight mean_x mean_y sig_x sig_y corr
1 0.529 -4.146 1.023 4.360 5.919 -0.232
2 0.261 -0.105 -3.827 2.979 2.185 -0.162
3 0.210 3.201 1.909 3.191 4.184 -0.001
GMM Plot Result
0.528980907687 [[-4.14608472  1.0228419 ]] [ 4.12533484  6.0854021 ] -161.608901653
0.261006871925 [[-0.1053296 -3.8267578]] [ 2.12580024  3.02174843] -103.566113578
0.210012220388 [[ 3.20081989  1.90876707]] [ 3.19146515  4.18374125] -179.891879379
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.075 6.918625e-08 0.029 0.195
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.018 0.067 6.566461e-08 0.029 0.190

weight mean_x mean_y sig_x sig_y corr
1 0.717 -2.330 0.973 5.476 5.518 -0.114
2 0.205 -0.621 -3.741 2.996 2.104 -0.323
3 0.078 2.973 -0.961 2.107 3.837 0.686
GMM Plot Result
0.717188638885 [[-2.33035051  0.97297992]] [ 5.17475204  5.80108618] -136.900462083
0.205099455423 [[-0.62125816 -3.74117042]] [ 1.9099792   3.12315781] -110.905136854
0.0777119056918 [[ 2.97342377 -0.96083056]] [ 1.4206983  4.1400584] 156.412513007
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.017 0.048 5.933648e-08 0.027 0.180
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.018 0.045 6.206471e-08 0.028 0.185

weight mean_x mean_y sig_x sig_y corr
1 0.537 -4.147 0.748 4.160 5.822 -0.252
2 0.258 0.013 -3.783 2.971 2.188 -0.123
3 0.206 3.281 2.175 3.145 3.892 -0.064
GMM Plot Result
0.536671216091 [[-4.14658654  0.74825117]] [ 3.91199853  5.99173776] -161.82468658
0.257784305924 [[ 0.01323741 -3.78334486]] [ 2.15325063  2.99693152] -100.785203838
0.205544477985 [[ 3.2812947   2.17518594]] [ 3.12705425  3.90638673] -171.743118506
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.094 6.360415e-08 0.029 0.187
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.014 0.096 6.521023e-08 0.029 0.189

weight mean_x mean_y sig_x sig_y corr
1 0.665 -2.379 1.748 5.902 5.238 -0.151
2 0.267 -0.958 -3.779 3.040 2.340 -0.222
3 0.068 2.811 -1.506 2.071 3.507 0.679
GMM Plot Result
0.665064564052 [[-2.37858918  1.74813315]] [ 5.01836667  6.09032229] -115.807612913
0.266739923444 [[-0.958354   -3.77948628]] [ 2.21393645  3.13331092] -110.009623011
0.0681955125041 [[ 2.81064244 -1.50642918]] [ 1.39388601  3.82729061] 154.54323977
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.021 0.044 5.378699e-08 0.026 0.172
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.019 0.043 6.097353e-08 0.028 0.183

weight mean_x mean_y sig_x sig_y corr
1 0.479 -4.756 0.671 4.034 6.097 -0.318
2 0.266 0.015 -3.822 2.959 2.150 -0.158
3 0.255 2.756 2.278 3.634 4.102 -0.124
GMM Plot Result
0.478718876701 [[-4.75609935  0.67132736]] [ 3.69792425  6.3061423 ] -161.605567866
0.266227423263 [[ 0.01548589 -3.82233502]] [ 2.09528367  2.99739501] -102.962273704
0.255053700036 [[ 2.75637605  2.27830848]] [ 3.52501671  4.19573633] -157.179318622
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.013 0.064 6.615672e-08 0.029 0.191
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.013 0.062 6.652270e-08 0.029 0.191
Wall time: 15.7 s

7.2 Cross-validation, to select the number of Gaussian components

In [90]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
 10752.0 3584.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.143144 0.071273 5.304168e-07 0.081678 0.539624 0.821352
1 0.150828 0.070472 5.163950e-07 0.083938 0.532304 0.824238
2 0.148059 0.072403 5.465168e-07 0.080609 0.547461 0.816369
3 0.153187 0.073965 5.341887e-07 0.082831 0.541250 0.819718
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.181460 0.070359 5.429837e-07 0.081208 0.545256 0.816953
1 0.161861 0.069157 5.907109e-07 0.078389 0.569159 0.806979
2 0.172339 0.076068 5.268045e-07 0.089801 0.537927 0.820641
3 0.148941 0.072530 5.656833e-07 0.082002 0.557428 0.810150
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.045866 0.023568 9.805185e-08 0.034522 0.232051 0.967295
1 0.061912 0.024010 7.988715e-08 0.032433 0.209314 0.972968
2 0.057215 0.023471 9.113579e-08 0.034256 0.223701 0.969081
3 0.058146 0.026487 9.475013e-08 0.034993 0.227826 0.967830
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067306 0.023844 1.131488e-07 0.040024 0.248778 0.960772
1 0.099382 0.027758 1.565775e-07 0.043016 0.293249 0.947888
2 0.059787 0.033193 1.169749e-07 0.038151 0.253005 0.961332
3 0.060008 0.026278 1.040571e-07 0.035652 0.239469 0.965638
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.101845 0.010159 7.127204e-08 0.029594 0.197806 0.975849
1 0.114976 0.010174 6.482695e-08 0.029226 0.188567 0.978182
2 0.044533 0.018961 6.465406e-08 0.028927 0.188353 0.977967
3 0.083290 0.010472 5.473927e-08 0.026698 0.173245 0.981698
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.071203 0.022975 8.288001e-08 0.034201 0.213031 0.972562
1 0.147877 0.024974 1.014193e-07 0.034802 0.235966 0.965716
2 0.059131 0.020182 8.542011e-08 0.032780 0.216425 0.972171
3 0.147308 0.017976 1.387808e-07 0.040854 0.276175 0.952259
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067546 0.010622 3.440547e-08 0.021292 0.137493 0.988306
1 0.024946 0.007180 3.041639e-08 0.019662 0.129098 0.989705
2 0.038613 0.012183 3.370247e-08 0.020672 0.136008 0.988559
3 0.044208 0.010706 3.493869e-08 0.021134 0.138401 0.988376
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.062152 0.025649 7.873375e-08 0.030659 0.207364 0.974175
1 0.051316 0.018237 8.554044e-08 0.031909 0.217041 0.971717
2 0.051083 0.024698 7.238031e-08 0.030904 0.199142 0.976120
3 0.125748 0.023417 7.415992e-08 0.031114 0.201920 0.974034
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.047036 0.008117 2.321246e-08 0.017308 0.112908 0.992199
1 0.047961 0.009534 2.713720e-08 0.018514 0.121960 0.990846
2 0.030713 0.008620 1.966211e-08 0.016036 0.103898 0.993270
3 0.030899 0.019391 2.779159e-08 0.018756 0.123430 0.990683
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.081780 0.011522 5.255007e-08 0.025585 0.169531 0.982088
1 0.050058 0.012345 4.811613e-08 0.024538 0.162701 0.983814
2 0.050326 0.013342 6.208785e-08 0.027126 0.184367 0.979972
3 0.057651 0.019657 6.191830e-08 0.028838 0.184533 0.978820
Wall time: 47.1 s
In [91]:
# Aggregate the per-fold goodness-of-fit tables into mean/std per number of
# Gaussians. The four *_scores_* frames are reused by the plotting cell below.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.148804 0.072028 5.318793e-07 0.082264 0.540160 0.820419
2 0.055785 0.024384 9.095623e-08 0.034051 0.223223 0.969294
3 0.086161 0.012442 6.387308e-08 0.028611 0.186993 0.978424
4 0.043828 0.010173 3.336575e-08 0.020690 0.135250 0.988737
5 0.039152 0.011416 2.445084e-08 0.017653 0.115549 0.991750
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.166150 0.072028 5.565456e-07 0.082850 0.552443 0.813681
2 0.071621 0.027768 1.226896e-07 0.039211 0.258625 0.958907
3 0.106380 0.021527 1.021250e-07 0.035659 0.235399 0.965677
4 0.072575 0.023000 7.770360e-08 0.031146 0.206367 0.974011
5 0.059954 0.014216 5.616809e-08 0.026522 0.175283 0.981173
In [92]:
# Fix: 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0
# (see the UserWarning in the captured output); take the color list from
# 'axes.prop_cycle' instead.
prop_cycle = [c['color'] for c in mpl.rcParams['axes.prop_cycle']]
gaussian_number_range = train_scores_mean.index
# Learning-curve style plots: train vs. test goodness-of-fit against the
# number of mixture components, with a +/- one-std band around each curve.
# (Raw strings avoid the invalid '\w' escape warning; the values are unchanged.)
for column, column_name in zip(['R_square','K_S','Chi_square'],
                               [r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
         '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
         '-', label = 'test', color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                     test_scores_mean[column] - test_scores_std[column],
                     test_scores_mean[column] + test_scores_std[column], 
                     alpha=0.2, color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [93]:
# Side-by-side comparison on the same grid: KDE density (left) vs. fitted
# GMM density (right).
fig = plt.figure(figsize=(4.2,2.4))
axes = [fig.add_subplot(1, 2, pos) for pos in (1, 2)]
for panel_ax, density_Z in zip(axes, (kde_Z, pdf_Z)):
    plot_2d_prob_density(X, Y, density_Z, ax=panel_ax,
                         xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text,
                         colorbar=False)
    panel_ax.grid(False)
# The right panel shares the y scale with the left one; hide its y axis.
axes[1].get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures together (fit pipeline first, then the two
# variability studies) for easier visual comparison in the rendered report.
summary_figures = [fig_hist, fig_kde, fig_em, fig_gmm,
                   fig_time_variability_3d, fig_time_variability_cdf,
                   fig_time_variability_weibull,
                   fig_adjecent_variability_3d, fig_adjecent_variability_cdf,
                   fig_adjecent_variability_weibull]
for fig in summary_figures:
    display(fig)
In [ ]:
import time

# Save the notebook, give the write a moment to flush to disk, then export
# the rendered notebook to a per-location HTML report.
save_notebook()
time.sleep(3)

location_name = get_location_name(file_path)
print(location_name)

notebook_path = 'GMM.ipynb'
html_output_path = './output_HTML/' + location_name + '.html'
output_HTML(notebook_path, html_output_path)