Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# NOTE(review): star imports pull in the scientific stack via the project's
# import_file module -- bare names used below (np, mpl, plt, arange, plot,
# array, ceil, exp, log, cos, sin, pi, neighbors, folium, WindroseAxes, and
# the plt_configure/align_figures helpers) presumably come from here.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Axis style: round tick limits to whole numbers and remove margin padding.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# Active dataset: Al Maktoum (UAE).  `bandwidth` forces the KDE bandwidth
# (skipping the grid search below); other stations are kept as a catalogue.
file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
In [3]:
# Load the raw records into `df` and normalise the layout to the columns
# [date, HrMn, type, dir, speed, wind_type].  Source-specific flags
# (integer_data, convert_to_knot, knot_unit, cartesian) steer later cleaning;
# note they are only defined in the branches that set them.
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC export: HrMn kept as a string so leading zeros ("0500") survive.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    # NDAWN hourly averages: already decimal, no report-type information.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    # ECMWF reanalysis: U/V components; sign is flipped so the vector points
    # in the direction the wind blows FROM (meteorological convention).
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    # 'time' column holds hours since the 1979-01-01 reanalysis epoch.
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
# Quick look at the parsed records (rich display of the DataFrame).
df
Out[4]:
date HrMn type dir speed wind_type
0 20100518 0500 FM-15 999 1.0 V
1 20100518 0600 FM-15 999 1.0 V
2 20100518 0700 FM-15 330 2.1 N
3 20100527 0900 FM-15 190 5.1 N
4 20100614 0400 FM-15 170 7.2 N
5 20100614 0500 FM-15 160 6.2 N
6 20100614 0600 FM-15 170 5.1 N
7 20100614 0700 FM-15 180 2.6 N
8 20100614 0800 FM-15 999 1.5 V
9 20100614 1100 FM-15 310 7.2 N
10 20100614 1200 FM-15 340 7.7 N
11 20100614 1300 FM-15 330 8.2 N
12 20100614 1400 FM-15 320 6.2 N
13 20100614 1500 FM-15 320 2.6 N
14 20100614 1700 FM-15 340 2.6 N
15 20100614 1800 FM-15 360 3.1 N
16 20100614 1900 FM-15 90 2.6 N
17 20100614 2000 FM-15 100 3.6 N
18 20100614 2200 FM-15 110 2.1 N
19 20100614 2300 FM-15 120 2.1 N
20 20100615 0000 FM-15 140 1.0 N
21 20100615 0100 FM-15 150 2.1 N
22 20100615 0200 FM-15 130 0.5 N
23 20100615 0400 FM-15 150 2.6 N
24 20100615 0500 FM-15 160 6.2 N
25 20100615 0600 FM-15 160 4.1 N
26 20100615 0700 FM-15 170 3.1 N
27 20100615 0800 FM-15 999 2.1 V
28 20100615 0900 FM-15 999 1.0 V
29 20100615 1000 FM-15 340 8.2 N
... ... ... ... ... ... ...
61722 20170331 1800 FM-15 120 4.1 N
61723 20170331 1900 FM-15 120 3.6 N
61724 20170331 2000 FM-15 90 2.6 N
61725 20170331 2100 FM-15 110 3.1 N
61726 20170331 2200 FM-15 110 3.1 N
61727 20170331 2300 FM-15 130 3.1 N
61728 20170401 0000 FM-15 80 2.1 N
61729 20170401 0100 FM-15 120 2.1 N
61730 20170401 0200 FM-15 80 2.1 N
61731 20170401 0300 FM-15 40 2.1 N
61732 20170401 0400 FM-15 999 0.5 V
61733 20170401 0500 FM-15 310 1.5 N
61734 20170401 0600 FM-15 330 1.5 V
61735 20170401 0700 FM-15 999 1.0 V
61736 20170401 0800 FM-15 290 3.1 V
61737 20170401 0900 FM-15 330 3.1 V
61738 20170401 1000 FM-15 320 2.6 V
61739 20170401 1100 FM-15 330 2.6 V
61740 20170401 1200 FM-15 340 2.1 V
61741 20170401 1300 FM-15 340 6.2 N
61742 20170401 1400 FM-15 330 4.1 N
61743 20170401 1500 FM-15 330 4.1 N
61744 20170401 1600 FM-15 340 2.6 N
61745 20170401 1700 FM-15 20 1.0 N
61746 20170401 1800 FM-15 100 1.5 N
61747 20170401 1900 FM-15 90 3.1 N
61748 20170401 2000 FM-15 80 3.1 N
61749 20170401 2100 FM-15 60 2.6 N
61750 20170401 2200 FM-15 80 2.1 N
61751 20170401 2300 FM-15 90 3.1 N

61752 rows × 6 columns

In [5]:
# For NCDC stations, read the station coordinates from the data file header
# and show the location on an interactive map.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
24.886 55.172
In [6]:
# Build a DatetimeIndex from date (yyyymmdd) and HrMn (hhmm), then keep only
# plausible records: dir <= 999 (999 is the variable-direction marker) and
# speed < 100, restricted to the years 1970-2016.
# Fix: the original chained `.astype(str).map(str)` -- the `.map(str)` was a
# redundant second string conversion and is dropped.
df['time'] = pd.to_datetime(df["date"].astype(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn'] = df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
# Overview histograms of speed and direction (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Map directions 10..360 onto 0..350 (the 999 variable marker is preserved).
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates (clockwise from north) to polar coordinates
# (counter-clockwise from east), keeping the original in dir_windrose.
# ECMWF-style inputs ('cartesian' set) are already polar.
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 5.938300e+04 59383.000000 59383.000000 59383.000000 59383.000000
mean 2.013312e+07 1135.866376 226.156712 4.131822 237.997642
std 1.889311e+04 692.584200 210.592490 2.311919 208.489040
min 2.010052e+07 0.000000 0.000000 0.000000 0.000000
25% 2.012022e+07 500.000000 120.000000 2.600000 120.000000
50% 2.013101e+07 1100.000000 180.000000 3.600000 200.000000
75% 2.015052e+07 1700.000000 280.000000 5.700000 300.000000
max 2.016123e+07 2359.000000 999.000000 20.100000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xc27f940>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# Unit detection: if a large share of speeds carries a fractional part the
# data is taken to be m/s and converted to knots; integer-valued sources are
# rounded back to integers afterwards.
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    # Heuristic: >30% of records with a fractional part >= 0.2 implies m/s.
    fraction_with_decimals = len(df.query('decimal >= 0.2')) / len(df)
    convert_to_knot = fraction_with_decimals > 0.3

if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
elif 'knot_unit' not in globals():
    knot_unit = False

df.drop(['decimal'], 1, inplace=True)
print(knot_unit)
True
In [10]:
# Axis-label suffixes reflecting the detected units.
dir_unit_text = ' (degree)'
speed_unit_text = ' (knot)' if knot_unit else ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Keep only the most frequently used report type (e.g. FM-15/METAR) so the
# sampling scheme is homogeneous; only post-2000 records vote on the winner.
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

# Fix: use idxmax() to get the *label* of the largest count.  Series.argmax
# was deprecated and later changed to return the positional index in modern
# pandas, which would silently yield an integer instead of the type string.
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Compare the sampling-hour distribution of the full record against its more
# recent half to spot changes in the observation schedule.
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only records taken at the dominant minute-within-the-hour (usually
# :00) so mixed sampling schemes do not bias the distributions.
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
# minutes with more than 2000 post-2000 records count as "regular"
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3edd30>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

Wrong direction records

In [14]:
# Integer datasets should only report directions in 10-degree steps (plus the
# 999 variable marker); anything else is an artefact and is dropped.
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# Sudden increase detection: `incre` is the speed change from the previous
# record, `incre_reverse` the change towards the next one; NaNs at the edges
# become 0 (no jump).
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2016-03-09 10:00:00 20160309 1000 FM-15 60 36 N 30 13.0 13.0
2012-04-14 00:00:00 20120414 0 FM-15 180 34 N 270 8.0 26.0
2015-02-21 06:00:00 20150221 600 FM-15 280 32 N 170 3.0 1.0
2012-02-26 05:00:00 20120226 500 FM-15 270 32 N 180 7.0 1.0
2012-02-26 00:00:00 20120226 0 FM-15 290 32 N 160 2.0 0.0
2012-02-26 01:00:00 20120226 100 FM-15 280 32 N 170 0.0 8.0
2012-02-26 06:00:00 20120226 600 FM-15 270 31 N 180 -1.0 7.0
2011-01-28 10:00:00 20110128 1000 FM-15 110 31 N 340 23.0 5.0
2015-02-21 07:00:00 20150221 700 FM-15 270 31 N 180 -1.0 1.0
2015-02-21 08:00:00 20150221 800 FM-15 270 30 N 180 -1.0 4.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xc65f9b0>
In [16]:
# A record is an isolated spike when the speed jumps by more than the
# threshold on BOTH sides; such records are removed.
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 1
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2016-03-09 10:00:00 20160309 1000 FM-15 60 36 N 30 13.0 13.0
2012-04-14 00:00:00 20120414 0 FM-15 180 34 N 270 8.0 26.0
2012-02-26 05:00:00 20120226 500 FM-15 270 32 N 180 7.0 1.0
2015-02-21 06:00:00 20150221 600 FM-15 280 32 N 170 3.0 1.0
2012-02-26 01:00:00 20120226 100 FM-15 280 32 N 170 0.0 8.0
2012-02-26 00:00:00 20120226 0 FM-15 290 32 N 160 2.0 0.0
2015-02-21 07:00:00 20150221 700 FM-15 270 31 N 180 -1.0 1.0
2011-01-28 10:00:00 20110128 1000 FM-15 110 31 N 340 23.0 5.0
2012-02-26 06:00:00 20120226 600 FM-15 270 31 N 180 -1.0 7.0
2012-02-25 23:00:00 20120225 2300 FM-15 290 30 N 160 2.0 -2.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly,

e.g. the sectors are [0, 20, 50, ...]; the angles need to be redistributed onto a regular 22.5° grid, e.g. [0, 22.5, 45, ...]

In [17]:
display(df['dir'].value_counts().sort_index())
# Directions with a meaningful number of records (>30) define the sector
# layout; for integer data the sector width is inferred from their count
# (note: 360/len(...) yields a float, e.g. 10.0 for 36 sectors).
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      1535
10     1354
20     1263
30     1208
40     1043
50      984
60      864
70      795
80      939
90     1073
100    1208
110    1596
120    2068
130    1973
140    2554
150    2690
160    2782
170    1791
180    1222
190     859
200     765
210     819
220    1001
230     995
240    1412
250    1376
260    1482
270    2078
280    2232
290    2133
300    1825
310    1310
320    1100
330    1115
340    1282
350    1394
999    3156
Name: dir, dtype: int64
36 10.0
In [18]:
# Redistribute mis-aligned sector angles onto a regular grid (project helper).
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# If calm (zero-speed) records are over-represented in the recent data, drop
# them entirely; otherwise they are kept and redistributed later.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.00341920544178
In [20]:
# 999 marks "variable direction"; show which speeds carry it (mostly light
# winds), then spread those records over the sectors (project helper).
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
2     1403
1      533
3      519
4      257
0      189
5      152
6       65
7       26
8        7
10       3
11       1
9        1
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction histogram bin edges; 'even' centres the bins on reported angles.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# First-half vs second-half comparison of the direction distribution.
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Show, then drop, records whose direction could not be assigned.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
2010-05-18 05:00:00 20100518 500 FM-15 NaN 2 V 999
2010-05-18 06:00:00 20100518 600 FM-15 NaN 2 V 999
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
# (project helper; presumably compares sub-period distributions -- TODO confirm).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
2011 - 2015
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [25]:
# Annual and monthly mean speed, with the y-axis pinned at zero to make
# long-term trends comparable.
annual_ax = df.resample('A').mean().plot(y='speed')
annual_ax.set_ylim(bottom=0)
monthly_ax = df.resample('M').mean().plot(y='speed', figsize=(20,4))
monthly_ax.set_ylim(bottom=0)
Out[25]:
(0, 10.0)
In [26]:
%%time
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
Wall time: 4.24 s
In [27]:
# Rolling 3-year-window R-square against the overall distribution: a low
# value flags periods whose speed/direction distribution deviates from the
# long-run one (used above to reject inhomogeneous stations).
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # window covers [year-1, year+1]
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # keep the y-range comparable across figures, capped to [0, 1]
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread integer directions uniformly inside their sector so the KDE is not
# dominated by the discrete reporting grid (project helper).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Spread integer speeds inside their unit bin; the direction of the spread
# depends on whether calm (zero-speed) records were removed earlier.
if integer_data:
    redistribute_method = 'down' if delete_zero else 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# A small dot may appear at the centre when many zero-speed records remain.
# (A full scatter plot in matplotlib is slow on large samples.)
angles_rad = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(angles_rad)
df['y'] = df['speed'] * sin(angles_rad)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
# Restrict the model-fitting window to 2011-2015.
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? True
Report type used: FM-15
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 4.206600e+04 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000
mean 2.013044e+07 1153.544430 179.874592 8.447488 237.262920 -1.553747 0.376113
std 1.405171e+04 691.989734 98.737423 4.375064 208.025899 6.507981 6.752294
min 2.011010e+07 0.000000 -4.978698 0.005481 0.000000 -34.524102 -32.472160
25% 2.012033e+07 600.000000 110.516651 5.156952 110.000000 -6.005882 -4.253203
50% 2.013062e+07 1200.000000 167.650918 7.482294 200.000000 -0.962347 0.459120
75% 2.014092e+07 1800.000000 269.171783 11.220914 300.000000 3.482725 5.152950
max 2.015123e+07 2300.000000 354.999753 34.535472 999.000000 27.009530 29.545306
In [33]:
# Speed time series over the selected fitting window.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3aed30>
In [34]:
# Accumulation by month
# (record counts per month -- gaps reveal missing data in the window)
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0xc6c35f8>
In [35]:
# 90 degree is east
# Windrose of the selected data: 36 sectors, normalised frequencies.
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# Very large datasets are downsampled to keep KDE fitting tractable; the two
# overlaid histograms verify the resampling preserves the direction
# distribution.  Fix: corrected the label typo 'resmapled' -> 'resampled'.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Fit a Weibull distribution to the speed magnitudes and compare it with the
# empirical distribution (project helper returns PDF/CDF on grid `x` plus
# the empirical CDF).
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
# On ln(V) vs ln(-ln(1-P)) axes a Weibull CDF is a straight line, so
# deviations from linearity expose a poor fit.
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Scatter of the Cartesian samples; equal aspect keeps directions undistorted.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Re-binning width for the per-direction histograms: 22.5 degrees for
# 16-sector data, otherwise 10 degrees.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
%%time
# Per-sector speed histograms: split the circle into `incre`-degree sectors
# and plot each sector's distribution on a common axis range for comparison.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 8.19 s

2.3 Overview by Month

In [41]:
%%time
# Monthly windroses (speed >= 1 only) to inspect seasonal variation.
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 18.7 s
In [42]:
# Summary statistics after all cleaning and selection steps.
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 4.206600e+04 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000 42066.000000
mean 2.013044e+07 1153.544430 179.874592 8.447488 237.262920 -1.553747 0.376113
std 1.405171e+04 691.989734 98.737423 4.375064 208.025899 6.507981 6.752294
min 2.011010e+07 0.000000 -4.978698 0.005481 0.000000 -34.524102 -32.472160
25% 2.012033e+07 600.000000 110.516651 5.156952 110.000000 -6.005882 -4.253203
50% 2.013062e+07 1200.000000 167.650918 7.482294 200.000000 -0.962347 0.459120
75% 2.014092e+07 1800.000000 269.171783 11.220914 300.000000 3.482725 5.152950
max 2.015123e+07 2300.000000 354.999753 34.535472 999.000000 27.009530 29.545306

3. Create input data and configuration

In [43]:
# Input sample for KDE/GMM fitting: one (x, y) row per record.
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): name misspelled ("BANDWDITH") but referenced later in the
# notebook -- kept as-is to avoid breaking those cells.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
# Build a square (x, y) evaluation grid covering the central 95% of speeds.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-17 -16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0
   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17]
In [45]:
# Axis range for plotting (same 95%-quantile limit as the fitting grid).
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
# KDE configuration; a Gaussian kernel is used throughout.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
# Bandwidth selection: if no bandwidth was forced in the data-selection cell,
# pick one by 4-fold cross-validated grid search over a unit-dependent range.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search is the pre-0.18 module; modern
    # scikit-learn only has sklearn.model_selection.GridSearchCV.
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
1.1
Wall time: 0 ns
In [48]:
# Fall back to the default if the bandwidth-selection cell above was skipped.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

# Fit the bivariate KDE on the (x, y) sample with the chosen kernel/bandwidth.
kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 1.1 1225
[  7.58432771e-08   3.20539294e-07   6.95667192e-07   1.13492786e-06
   1.99381126e-06]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
# Shared color scale so later model plots are directly comparable to the KDE.
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

# Raw 2-D histogram of the data for visual comparison with the smoothed KDE.
with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
# CDF of the KDE jPDF, used as the reference for the bootstrap GOF statistics.
kde_cdf = cdf_from_pdf(kde_result)
# Shared configuration passed to the parallel resampling helpers below.
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [51]:
%%time
# Bootstrap: refit the KDE on 20 resampled datasets in parallel and collect
# each refit's goodness-of-fit statistics against the reference kde_result.
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 20 s
In [52]:
# Histogram the bootstrap GOF statistics, one small figure per metric.
gof_kde_df = pd.DataFrame(gof_kde)
for gof_metric in ['R_square', 'K_S', 'Chi_square']:
    plt.figure(figsize=(4, 3))
    gof_kde_df[gof_metric].hist()
    plt_configure(title=gof_metric)
align_figures()

4.2 Bivariate Empirical Limit

In [55]:
# NOTE(review): `gofs_bivar` is only assigned in the NEXT cell, so this display
# shows a value left over from an earlier run (hidden state); under
# Restart & Run All this cell raises NameError. Consider moving it below.
gofs_bivar
Out[55]:
In [56]:
%%time
# Bivariate empirical limit: score the KDE of each rolling historical window
# (5- and 10-year) against the KDE of the most recent window of the same length.
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    # Reference period: the latest `year_length` years ending in 2015.
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    # NOTE(review): `start_year` is reused as the generator variable below; the
    # arange() bounds are evaluated with the outer value first, so this works,
    # but a distinct loop-variable name would be clearer.
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
    if len(gofs_bivar)>0:
        gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
        gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
        # Average the statistics over an 11-year span of window start years.
        year_lim = end_year-year_length-10, end_year-year_length
        gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) NaN NaN NaN NaN NaN NaN

4.3 Univariate GOF Limit

In [57]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    """Goodness-of-fit of the [start_year, end_year] window against the reference period.

    Args:
        df_all_years: full time-indexed DataFrame of observations.
        start_year, end_year: inclusive year bounds of the historical window.
        density: reference speed histogram (density-normalized) on bins `x`.
        y_ecdf: reference speed ECDF evaluated at `x`.
        x: common speed bin edges / evaluation points.
        density_dir: reference direction histogram over 10-degree sectors.

    Returns:
        dict with the window's start year, speed R^2, speed K-S statistic,
        and direction R^2 relative to the reference period.
    """
    window = df_all_years[str(start_year):str(end_year)]

    # 1. Speed: R^2 between the reference density and this window's histogram
    density_window, _ = np.histogram(window['speed'], bins=x, density=True)
    r_square = sector_r_square(density, density_window)

    # Speed: Kolmogorov-Smirnov distance between the two ECDFs
    y_ecdf_window = sm.distributions.ECDF(window['speed'])(x)
    k_s = np.abs(y_ecdf - y_ecdf_window).max()

    # 2. Direction: R^2 on 10-degree sector histograms
    density_dir_window, _ = dir_hist(window['dir'], bins=arange(-5,370,10), density=True)
    r_square_dir = sector_r_square(density_dir, density_dir_window)
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
In [62]:
%%time
# Univariate limit: speed and direction GOF of rolling historical windows
# against the most recent window of the same length (5, 7, 10 years).
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    # Reference: the latest `year_length` years ending in 2015.
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    # NOTE(review): `start_year` is reused as the comprehension variable; the
    # arange() bounds use the outer value, so this works but reads poorly.
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]
    if len(gofs)>0:
        gofs = pd.DataFrame(gofs)
        gofs.set_index(['year'], inplace=True)  
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
        # Average the statistics over an 11-year span of window start years.
        year_lim = end_year-year_length-10, end_year-year_length
        gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
        gofs_mean['year_lim']=year_lim
        gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(1996, 2006) NaN NaN NaN
Wall time: 365 ms

5. GMM by Expectation-maximization

In [63]:
# Fit a GMM by expectation-maximization as an initial estimate.
# NOTE(review): no random_state is set, so the EM initialization is not
# reproducible across runs -- consider fixing a seed.
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [64]:
# Convert the fitted sklearn model into the project's flat parameter array
# (weight, mean_x, mean_y, sig_x, sig_y, corr per component) and display it.
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[64]:
weight mean_x mean_y sig_x sig_y corr
1 0.392 -1.081 -4.889 4.453 5.444 -0.409
2 0.316 3.965 0.903 3.735 4.144 0.042
3 0.291 -8.180 6.888 5.121 4.232 0.271
In [65]:
# Visualize each EM component as an ellipse in the (x, y) speed plane.
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.39214374523 [[-1.08145626 -4.8888137 ]] [ 3.69728893  5.98295965] -148.15149755
0.316409427461 [[ 3.96454531  0.90307035]] [ 3.71777203  4.15912948] 169.063085492
0.291446827309 [[-8.18015384  6.88801886]] [ 3.85656019  5.40984061] -62.6391653746
In [66]:
# Evaluate the EM-fitted GMM on the plotting grid and compare it with the KDE.
X = Y = PLOT_AXIS_RANGE

def em_gmm_density(coords):
    """Probability density of the EM-fitted GMM at the given (x, y) coordinates."""
    return exp(clf.score_samples(coords))

def kde_minus_em_gmm(coords):
    """Pointwise residual: KDE density minus EM-GMM density."""
    return exp(kde.score_samples(coords)) - em_gmm_density(coords)

pdf_Z = generate_Z_from_X_Y(X, Y, em_gmm_density)
residual_Z = generate_Z_from_X_Y(X, Y, kde_minus_em_gmm)

# 3-D surfaces of the model density and of the residual
plot_3d_prob_density(X, Y, pdf_Z)
plot_3d_prob_density(X, Y, residual_Z)
align_figures()

# 2-D maps: KDE reference, EM-GMM (shared color scale), and their residual
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [67]:
# Score the EM-fitted GMM against the KDE reference on the fitting grid.
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[67]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.932 0.019 0.062 7.015380e-08 0.053 0.332
In [68]:
# Regroup the EM parameters and build a pdf callable; kept for comparison
# with the optimization-based fit below.
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [69]:
# Inputs for the optimization-based fit.
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [70]:
# Refine the EM solution by constrained minimization of the fit score.
# Parameter vector layout per component: weight, mean_x, mean_y, sig_x, sig_y, rho.
x0 = gmm_em_result

cons = [
    # the component weights (every 6th element) must sum to one
    {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
    # # limit the width/height ratio of elliplse, optional
    # {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
    # {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

# One (lower, upper) pair per parameter, repeated for every Gaussian component.
single_component_bounds = [(0., 0.99),
                           (-fit_limit, fit_limit), (-fit_limit, fit_limit),
                           (0., fit_limit), (0., fit_limit),
                           (-0.99, 0.99)]
param_bounds = single_component_bounds * (len(x0) // 6)

result = sp.optimize.minimize(
    lambda params: GMM_fit_score(params, kde_result, points, FIT_METHOD),
    x0,
    bounds=param_bounds,
    constraints=cons,
    tol=1e-12,
    options={"maxiter": 500})
result
Out[70]:
     fun: -17.487047747921288
     jac: array([  1.31396914e+00,   0.00000000e+00,  -2.38418579e-07,
         2.38418579e-07,  -2.38418579e-07,  -7.15255737e-07,
         1.31397510e+00,  -2.38418579e-07,   0.00000000e+00,
        -4.76837158e-07,   4.76837158e-07,  -4.76837158e-07,
         1.31397033e+00,   0.00000000e+00,   0.00000000e+00,
        -2.38418579e-07,   0.00000000e+00,  -1.43051147e-06,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 691
     nit: 34
    njev: 34
  status: 0
 success: True
       x: array([ 0.21749595,  4.81770768,  0.25496374,  2.34390443,  3.4996429 ,
       -0.13601912,  0.34900005, -0.96243785, -4.96121765,  4.03854841,
        3.91285527, -0.41001555,  0.43350399, -6.75039928,  6.07358941,
        6.96218254,  4.56096909,  0.01542211])

6.1 GMM Result

In [71]:
# Regroup the optimized parameter vector, build its pdf, and evaluate on the grid.
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[71]:
weight mean_x mean_y sig_x sig_y corr
1 0.434 -6.750 6.074 6.962 4.561 0.015
2 0.349 -0.962 -4.961 4.039 3.913 -0.410
3 0.217 4.818 0.255 2.344 3.500 -0.136
In [72]:
# Ellipse plot of the optimized GMM components.
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.43350399117 [[-6.75039928  6.07358941]] [ 4.56001911  6.96280479] -88.9863559092
0.349000054847 [[-0.96243785 -4.96121765]] [ 3.05151472  4.72319464] -132.794842144
0.217495953983 [[ 4.81770768  0.25496374]] [ 2.30528388  3.52520278] -170.857880113

6.2 Bivariate Goodness-of-fit statistics

In [73]:
# Bivariate GOF of the optimization-based GMM against the KDE reference.
gof_df(gmm_pdf_result, kde_result)
Out[73]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.017 0.171 2.543734e-08 0.032 0.200
In [74]:
# KDE-resampling reference levels for the bivariate GOF, for comparison.
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[74]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) NaN NaN NaN NaN NaN NaN
In [75]:
# Evaluate the optimized GMM on the plotting grid and compare it with the KDE.
X = Y = PLOT_AXIS_RANGE

def kde_minus_opt_gmm(coords):
    """Pointwise residual: KDE density minus optimized-GMM density."""
    return exp(kde.score_samples(coords)) - mixed_model_pdf(coords)

# passing a function as an argument
pdf_Z = generate_Z_from_X_Y(X, Y, mixed_model_pdf)
residual_Z = generate_Z_from_X_Y(X, Y, kde_minus_opt_gmm)

# 3-D surfaces of the model density and of the residual
plot_3d_prob_density(X, Y, pdf_Z)
plot_3d_prob_density(X, Y, residual_Z)
align_figures()

# 2-D maps: KDE reference, optimized GMM, and their residual
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5, 2.5))
plot_2d_prob_density(X, Y, residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [76]:
def f(V, theta):
    """Joint pdf of the optimized GMM in polar coordinates: p(x, y) * V (Jacobian)."""
    x, y = V * cos(theta), V * sin(theta)
    return mixed_model_pdf([[x, y]]) * V

def f_em(V, theta):
    """Polar-coordinate joint pdf of the EM-fitted GMM, with the same Jacobian factor V."""
    x, y = V * cos(theta), V * sin(theta)
    return mixed_model_pdf_em([[x, y]]) * V
In [77]:
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)

# 2. GMM Model: marginal speed pdf p(V) by integrating the polar pdf over all
# directions within a thin slab [V-0.01, V+0.01], then dividing by its width.
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 3. Plot Comparison (densities scaled to counts by the sample size)
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull (and the EM-fitted GMM via f_em)
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.97338573924997929, 0.93373447449521618, 0.93829643702763399)
Wall time: 32.8 s
In [78]:
%%time
# Marginal speed CDF of the GMM: integrate the polar pdf over all directions
# up to each speed, then compare with the Weibull CDF and the empirical CDF.
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

# Weibull plot: a true Weibull CDF is a straight line in these coordinates.
# log(0) at x=0 triggers the harmless divide-by-zero RuntimeWarnings below.
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

# Report the K-S statistic and the speed at which it occurs for both models.
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0240836004578 0.0460066091602
7.0 7.0
Wall time: 27.5 s
In [79]:
# Calculate Angle Distribution
# Integrate the polar GMM pdf over all speeds within each 10-degree sector to
# get the marginal direction distribution, and compare it with the data.
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
# R^2 between data sectors and the model (0 and 360 degrees coincide, drop last)
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[79]:
0.8899034217098688
In [80]:
# Univariate GOF reference levels computed earlier, for comparison with the
# GMM's speed/direction R^2 above.
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[80]:
k_s r_square r_square_dir
year_lim
(1996, 2006) NaN NaN NaN

6.4 Sectoral Comparison

In [81]:
%%time
# Sector-wise comparison at the data's native sector width (at least 10 deg),
# evaluated in parallel.
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution construction approaches for analysing wind speed and direction data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.904634357711
Wall time: 10.9 s
In [82]:
# Kept for reference: a parallel variant of the sectoral comparison above.
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [83]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Sector-by-sector comparison of the fitted GMM and per-sector Weibull fits
    against the observed wind data.

    For each direction sector of width `incre` (offset so rebinned sectors stay
    aligned with the original `original_incre`-degree binning), plot the observed
    speed histogram with both model pdfs, the ECDF with both model CDFs, and
    collect per-sector GOF statistics (R^2 and max CDF difference).

    Args:
        df: observation DataFrame with `speed` and direction-derived columns.
        original_incre: sector width (degrees) of the raw data binning.
        incre: rebinned sector width (degrees) used for the comparison.

    Returns:
        list of dicts, one per sector, holding the curves and GOF statistics.
    """
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram so all panels share one y-scale
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF (integrated over this sector's angular range)
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        # density=True replaces the deprecated normed=True (identical result for
        # equal-width bins; `normed` was removed in NumPy 1.24)
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        # nquad returns (value, abserr); keep the values and renormalize by the
        # sector probability to obtain a conditional speed density
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [84]:
%%time
# Rebin to ~20-degree sectors (22.5 when the station reports 16 compass points)
# so each sector holds enough samples for a stable histogram.
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1623 weight 0.038582227927542435
GMM Weibull
R square 0.968932084792 0.852369381409
max diff: 0.0595515449171 0.198427167935 speed value: 10.1629733605 4.35556001166 y gmm 0.964049388417
 
25.0 (15.0 - 35.0) degree
data size: 1963 weight 0.04666476489326297
GMM Weibull
R square 0.963135497897 0.809260720149
max diff: 0.0778809867194 0.0925059045251 speed value: 9.78690387245 6.99064562318 y gmm 0.921487711121
 
45.0 (35.0 - 55.0) degree
data size: 1674 weight 0.03979460847240052
GMM Weibull
R square 0.971479358342 0.949920446149
max diff: 0.0357451004303 0.0911978279373 speed value: 3.22865961112 4.30487948149 y gmm 0.116390261721
 
65.0 (55.0 - 75.0) degree
data size: 1377 weight 0.03273427471116817
GMM Weibull
R square 0.928734801949 0.963173442922
max diff: 0.102856708103 0.033696730751 speed value: 7.57405778767 7.57405778767 y gmm 0.565262391389
 
85.0 (75.0 - 95.0) degree
data size: 1612 weight 0.038320734084533827
GMM Weibull
R square 0.941861346263 0.980038602165
max diff: 0.0561137103165 0.017183048329 speed value: 7.09976891104 2.83990756442 y gmm 0.403563709038
 
105.0 (95.0 - 115.0) degree
data size: 2189 weight 0.0520372747587125
GMM Weibull
R square 0.869476030183 0.913945227493
max diff: 0.0660270942357 0.0529631927168 speed value: 6.66456866377 11.6629951616 y gmm 0.263803878811
 
125.0 (115.0 - 135.0) degree
data size: 3127 weight 0.07433556791708268
GMM Weibull
R square 0.834583788738 0.903990196563
max diff: 0.0732214858202 0.478539566835 speed value: 10.4548361125 14.3753996547 y gmm 0.479361556175
 
145.0 (135.0 - 155.0) degree
data size: 4151 weight 0.09867826748442923
GMM Weibull
R square 0.847060404673 0.961673040545
max diff: 0.128433143769 0.198803039157 speed value: 16.0179147698 13.1055666298 y gmm 0.764122866831
 
165.0 (155.0 - 175.0) degree
data size: 3826 weight 0.09095231303190225
GMM Weibull
R square 0.900166168063 0.937930485331
max diff: 0.0895674609233 0.0795291775011 speed value: 15.6691860875 7.12235731252 y gmm 0.711268921722
 
185.0 (175.0 - 195.0) degree
data size: 1844 weight 0.04383587695526078
GMM Weibull
R square 0.916175482251 0.974081434963
max diff: 0.123821576264 0.0207780879259 speed value: 9.08828198541 9.08828198541 y gmm 0.545375820699
 
205.0 (195.0 - 215.0) degree
data size: 1483 weight 0.03525412447106927
GMM Weibull
R square 0.95224364198 0.953919790751
max diff: 0.0672209963555 0.034515829403 speed value: 6.09334785528 7.31201742633 y gmm 0.481666394069
 
225.0 (215.0 - 235.0) degree
data size: 1833 weight 0.043574383112252174
GMM Weibull
R square 0.967559200329 0.937613578976
max diff: 0.043445370955 0.0478406841563 speed value: 10.1697752138 7.62733141036 y gmm 0.928333532439
 
245.0 (235.0 - 255.0) degree
data size: 2345 weight 0.05574573289592545
GMM Weibull
R square 0.964061410151 0.923305473511
max diff: 0.0571672653154 0.0461293645755 speed value: 10.3367827698 7.75258707735 y gmm 0.913030804761
 
265.0 (255.0 - 275.0) degree
data size: 2847 weight 0.06767936100413635
GMM Weibull
R square 0.963666595641 0.933346681815
max diff: 0.0876441702477 0.0507640620525 speed value: 11.9739883789 10.2634186105 y gmm 0.936959238741
 
285.0 (275.0 - 295.0) degree
data size: 3372 weight 0.08015974896591072
GMM Weibull
R square 0.86274027778 0.886861129627
max diff: 0.138648197852 0.0500365278813 speed value: 13.8404499386 8.65028121162 y gmm 0.953891376974
 
305.0 (295.0 - 315.0) degree
data size: 2376 weight 0.05648267008985879
GMM Weibull
R square 0.934622758199 0.913638487609
max diff: 0.0373581493881 0.0811388284716 speed value: 12.6946775136 3.1736693784 y gmm 0.921196533226
 
325.0 (315.0 - 335.0) degree
data size: 1761 weight 0.04186278704892312
GMM Weibull
R square 0.959304843254 0.945002518944
max diff: 0.0340685488821 0.0401675407046 speed value: 9.98837952382 2.85382272109 y gmm 0.904596657911
 
345.0 (335.0 - 355.0) degree
data size: 2057 weight 0.04889934864260923
GMM Weibull
R square 0.93014066759 0.902467836745
max diff: 0.0423417756096 0.108764686371 speed value: 2.57799728927 3.8669959339 y gmm 0.0724827576223
 
Wall time: 58.7 s
In [85]:
# Summarize per-sector R^2 for both models, weighted by sector sample size.
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
# Keep a consistent y-range across stations (never zoom in past 0.75)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9158637084432527 0.9240186747750685
In [86]:
# Same sectoral summary for the K-S statistic (max CDF difference).
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
# Keep a consistent y-range across stations (never zoom in past 0.25)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07924202878479368 0.10993984897662709
In [87]:
# Compare direction weight with previous figure
# (re-display the direction-distribution figure saved in dir_fig above)
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [88]:
# Pick the sector where the GMM's CDF deviates most from the data.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [89]:
# FRACTION = 1 -> each bootstrap resample has the same size as the sector sample
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [90]:
# Bootstrap the worst sector: resample its observations with replacement and
# overlay the resampled histograms/ECDFs on the fitted curves, to show how much
# of the model-data mismatch is explained by sampling variability.
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
# NOTE(review): `normed=True` is deprecated in matplotlib (density=True on
# versions >= 2.1); kept as-is to avoid changing behavior on the pinned version.
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled (99 bootstrap replicates)
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    # density=True replaces the deprecated normed=True (removed in NumPy 1.24)
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True) 
    count_collection.append(resampled_count)
    
    # NOTE(review): this overwrites the reference `y_ecdf` computed above, so
    # the `diff` printed at the end compares the GMM against the LAST resample,
    # not against the original data -- confirm this is intended.
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
        # attach the legends only once
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# Envelope of the bootstrap histograms: per-bin min and max
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
285.0 (275.0 - 295.0) Degree Speed Distribution
0.145021180383 13.5 0.944546684535

(2) Time Variability

In [91]:
# Time variability: compare the fitted curves for the worst sector against
# consecutive 5-year sub-periods of the record.
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
# axes.prop_cycle replaces the deprecated axes.color_cycle rcParam (the
# deprecation warning was visible in this cell's output); by_key()['color']
# yields the same list of color strings.
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        # density=True replaces the deprecated normed=True (removed in NumPy 1.24)
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        # label the model curves only once (last period) to avoid duplicate legend entries
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
285.0 (275.0 - 295.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [92]:
# The worst-fitting sector plus its two immediate neighbors.
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [93]:
# Adjacent-sector variability: plot the worst sector together with its two
# neighbors to see whether the misfit is localized or shared across sectors.
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# axes.prop_cycle replaces the deprecated axes.color_cycle rcParam (the
# deprecation warning was visible in this cell's output)
prop_cycle = iter(mpl.rcParams['axes.prop_cycle'].by_key()['color'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Look up the pre-computed curves for this sector (angles wrap at 360)
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Highlight the worst sector; draw its neighbors dotted and faded
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # attach the 3-D legend only once
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
285.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [94]:
# Fall back to defaults for any tuning knob the site-specific setup
# cells did not define; each guard only evaluates its default lazily.
try:
    bandwidth
except NameError:
    bandwidth = DEFAULT_BANDWDITH
try:
    FIT_METHOD
except NameError:
    FIT_METHOD = 'square_error'
try:
    KDE_KERNEL
except NameError:
    KDE_KERNEL = 'gaussian'

# Bundle the fitting configuration passed to the resampling/CV helpers.
config = dict(bandwidth=bandwidth,
              fitting_range=FITTING_RANGE,
              fit_limit=fit_limit,
              kde_kernel=KDE_KERNEL)

print(bandwidth, FIT_METHOD)
1.1 square_error

7.1 Variability of the Result

In [95]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.448 -6.512 5.925 7.197 4.601 -0.006
2 0.346 -0.837 -5.014 4.013 3.857 -0.407
3 0.206 4.884 0.220 2.287 3.447 -0.158
GMM Plot Result
0.448036532228 [[-6.51214692  5.92524672]] [ 4.60116993  7.1974108 ] -90.3452493942
0.345704594149 [[-0.83747624 -5.01406905]] [ 3.02645788  4.67124611] -132.207610193
0.206258873622 [[ 4.88443919  0.21986022]] [ 2.23720332  3.47941485] -169.698844626
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.019 0.192 2.510481e-08 0.032 0.199
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.171 2.572148e-08 0.032 0.201

weight mean_x mean_y sig_x sig_y corr
1 0.448 -6.492 5.919 7.143 4.554 -0.020
2 0.352 -0.754 -5.051 4.048 3.865 -0.388
3 0.200 4.923 0.215 2.246 3.360 -0.124
GMM Plot Result
0.447890406655 [[-6.49183548  5.9188433 ]] [ 4.55234787  7.14390305] -91.2211731677
0.352069480925 [[-0.75390438 -5.05075567]] [ 3.08951442  4.66684713] -131.609655527
0.20004011242 [[ 4.92297989  0.21498254]] [ 2.2151241   3.38011531] -171.64224864
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.973 0.018 0.167 2.774818e-08 0.033 0.209
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.156 2.589704e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.440 -6.513 5.966 7.221 4.608 -0.022
2 0.350 -0.968 -4.950 4.083 3.955 -0.429
3 0.211 4.833 0.224 2.274 3.388 -0.140
GMM Plot Result
0.439568851141 [[-6.51268134  5.96626057]] [ 4.60626072  7.22265112] -91.36741611
0.349782627497 [[-0.96751755 -4.95017288]] [ 3.03504302  4.80607589] -132.867643351
0.210648521362 [[ 4.83261012  0.22427152]] [ 2.23442957  3.41472455] -170.572543035
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.018 0.163 2.426041e-08 0.031 0.196
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.159 2.609387e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.426 -6.678 6.071 7.061 4.490 0.008
2 0.357 -0.931 -4.981 4.077 3.957 -0.434
3 0.217 4.800 0.318 2.380 3.487 -0.176
GMM Plot Result
0.425999129967 [[-6.6782429   6.07123447]] [ 4.48973393  7.0610181 ] -89.5331177498
0.357240522802 [[-0.93136138 -4.9808192 ]] [ 3.01942591  4.81241654] -133.036309333
0.216760347231 [[ 4.79959528  0.31839297]] [ 2.31341069  3.53173437] -167.873134439
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.976 0.016 0.185 2.499153e-08 0.032 0.198
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.181 2.595358e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.449 -6.635 5.927 6.898 4.686 -0.014
2 0.336 -0.880 -5.130 3.993 3.841 -0.385
3 0.216 4.805 0.222 2.342 3.561 -0.085
GMM Plot Result
0.448838959803 [[-6.63542324  5.92697198]] [ 4.68554162  6.89817286] -91.040742888
0.335591813593 [[-0.88014181 -5.12953988]] [ 3.06743402  4.61386351] -132.122309199
0.215569226604 [[ 4.80456246  0.22157181]] [ 2.32702431  3.57066483] -174.453997155
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.143 2.684618e-08 0.033 0.206
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.152 2.594566e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.426 -6.919 6.116 6.730 4.528 0.023
2 0.349 -1.024 -5.026 4.076 3.944 -0.421
3 0.225 4.800 0.282 2.381 3.518 -0.120
GMM Plot Result
0.425730093472 [[-6.91865248  6.11566491]] [ 4.52543823  6.73134375] -88.3560423262
0.348795125808 [[-1.02412473 -5.02648444]] [ 3.04871382  4.78280646] -132.773643064
0.225474780719 [[ 4.79978796  0.28225717]] [ 2.34951612  3.5387729 ] -171.666347327
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.200 2.689420e-08 0.033 0.206
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.179 2.581176e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.444 -6.683 6.134 7.076 4.670 0.019
2 0.344 -0.925 -4.998 4.014 3.913 -0.411
3 0.212 4.866 0.115 2.276 3.417 -0.102
GMM Plot Result
0.444217427964 [[-6.6833258  6.1340159]] [ 4.6680496  7.0769235] -88.7264106078
0.343673686357 [[-0.92495256 -4.99833944]] [ 3.03961175  4.70951313] -133.221907837
0.21210888568 [[ 4.86580319  0.11535039]] [ 2.25544661  3.43130237] -173.162553058
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.020 0.217 2.716025e-08 0.032 0.207
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.023 0.163 2.610465e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.453 -6.441 5.836 7.250 4.630 -0.019
2 0.338 -0.951 -5.036 3.955 3.859 -0.397
3 0.209 4.860 0.122 2.255 3.374 -0.116
GMM Plot Result
0.452749943573 [[-6.44111625  5.83589984]] [ 4.62912287  7.25054816] -91.1489216716
0.338285970052 [[-0.95116244 -5.03557176]] [ 3.03147728  4.62005779] -133.245757596
0.208964086375 [[ 4.86014435  0.1215363 ]] [ 2.22839668  3.39214984] -172.194159101
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.019 0.178 2.423958e-08 0.030 0.196
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.019 0.158 2.606868e-08 0.032 0.203

weight mean_x mean_y sig_x sig_y corr
1 0.428 -6.993 6.021 6.758 4.576 0.019
2 0.348 -0.922 -5.024 4.015 3.887 -0.393
3 0.225 4.815 0.304 2.379 3.582 -0.135
GMM Plot Result
0.427558977139 [[-6.99303923  6.02075921]] [ 4.5741216   6.75904299] -88.6117728069
0.34779051675 [[-0.92153899 -5.02358542]] [ 3.07577141  4.66583554] -132.641448334
0.224650506111 [[ 4.8147642   0.30364574]] [ 2.34132134  3.60665577] -171.118643795
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.017 0.180 2.735067e-08 0.034 0.208
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.173 2.570319e-08 0.032 0.201

weight mean_x mean_y sig_x sig_y corr
1 0.427 -6.939 6.146 6.844 4.562 0.035
2 0.357 -0.916 -4.921 4.009 3.901 -0.382
3 0.216 4.790 0.416 2.394 3.524 -0.190
GMM Plot Result
0.427161056703 [[-6.93880856  6.14573916]] [ 4.55752255  6.84708938] -87.6213585576
0.356914005507 [[-0.91609106 -4.92052236]] [ 3.10810931  4.65064641] -132.952087976
0.21592493779 [[ 4.79012719  0.41571923]] [ 2.31704503  3.57534652] -167.190069832
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.972 0.018 0.188 2.847131e-08 0.035 0.212
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.017 0.169 2.587638e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.442 -6.594 5.902 7.109 4.618 0.004
2 0.344 -0.919 -5.009 4.019 3.861 -0.395
3 0.215 4.869 0.248 2.317 3.459 -0.123
GMM Plot Result
0.441551664992 [[-6.59428416  5.90186732]] [ 4.61833482  7.10917075] -89.7606705115
0.343672167131 [[-0.91878616 -5.00945377]] [ 3.05964739  4.65788142] -132.101046126
0.214776167877 [[ 4.86944205  0.24783553]] [ 2.2854402   3.47964066] -171.691796669
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.152 2.633190e-08 0.032 0.203
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.018 0.158 2.577298e-08 0.032 0.202

weight mean_x mean_y sig_x sig_y corr
1 0.414 -7.068 6.216 6.661 4.530 0.074
2 0.357 -0.958 -4.965 4.080 3.936 -0.409
3 0.229 4.780 0.448 2.406 3.533 -0.167
GMM Plot Result
0.414075996987 [[-7.06780742  6.21555577]] [ 4.50748138  6.67664413] -84.7305269459
0.356676549647 [[-0.95846895 -4.96477575]] [ 3.07927106  4.75963525] -132.495718834
0.229247453367 [[ 4.77977562  0.4479891 ]] [ 2.34564323  3.57325841] -168.505982285
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.974 0.015 0.184 2.664357e-08 0.033 0.205
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.016 0.193 2.584240e-08 0.032 0.202
Wall time: 21.3 s

7.2 Cross-validation, to select the number of Gaussian

In [96]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 31549.5 10516.5
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.182558 0.058436 2.435895e-07 0.099267 0.619421 0.765859
1 0.182516 0.058233 2.431249e-07 0.098218 0.618769 0.765971
2 0.180312 0.056443 2.368530e-07 0.099152 0.611256 0.770435
3 0.181892 0.058028 2.411418e-07 0.098179 0.616276 0.768998
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.185844 0.052871 2.382331e-07 0.097883 0.612949 0.770088
1 0.184048 0.054111 2.349332e-07 0.100112 0.608867 0.774488
2 0.191001 0.062170 2.528227e-07 0.097124 0.630015 0.761859
3 0.189034 0.064485 2.518646e-07 0.102481 0.630319 0.754925
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.116961 0.023942 9.006525e-08 0.059317 0.376629 0.913588
1 0.114057 0.025048 8.992753e-08 0.060022 0.376439 0.913816
2 0.115565 0.024748 9.052175e-08 0.061224 0.377714 0.912450
3 0.109330 0.024198 9.052395e-08 0.060999 0.377662 0.912576
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.126982 0.023941 9.574609e-08 0.065502 0.388639 0.907252
1 0.107255 0.026843 9.475828e-08 0.062647 0.386327 0.907778
2 0.145494 0.032475 9.406191e-08 0.059442 0.384803 0.911013
3 0.128568 0.025059 9.432500e-08 0.060161 0.385516 0.910391
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.166170 0.018455 2.588128e-08 0.031903 0.202026 0.974987
1 0.161144 0.016163 2.525301e-08 0.032535 0.199547 0.975585
2 0.208430 0.017181 2.552398e-08 0.032584 0.200465 0.975312
3 0.168793 0.017009 2.611411e-08 0.031862 0.202751 0.975181
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.177436 0.017164 2.983356e-08 0.036180 0.216523 0.971699
1 0.215154 0.023783 3.243144e-08 0.034309 0.225793 0.969288
2 0.150731 0.021950 3.073525e-08 0.033762 0.220303 0.970906
3 0.288367 0.020775 3.044033e-08 0.037229 0.219301 0.969651
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 2.092061 0.020415 1.695521e-08 0.026388 0.163474 0.983628
1 3.886085 0.020336 1.608949e-08 0.025591 0.159185 0.984510
2 3.139501 0.018859 1.568497e-08 0.024835 0.157191 0.984911
3 2.174824 0.019262 1.564160e-08 0.025174 0.157007 0.984975
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 2.416939 0.020222 1.845747e-08 0.026647 0.170446 0.982446
1 2.573307 0.023474 1.944738e-08 0.027711 0.175156 0.981318
2 2.215120 0.020397 2.298371e-08 0.031759 0.190348 0.977907
3 4.603554 0.024197 2.113483e-08 0.029089 0.182413 0.979561
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.159491 0.009697 7.038815e-09 0.016727 0.105321 0.993244
1 0.168774 0.008625 7.455789e-09 0.017688 0.108384 0.992798
2 0.173598 0.008576 7.269189e-09 0.016573 0.106973 0.993004
3 0.016151 0.005008 8.529774e-09 0.019003 0.115970 0.991789
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.199232 0.013811 1.236204e-08 0.022582 0.139521 0.988020
1 0.399635 0.013498 1.069134e-08 0.019669 0.129794 0.989834
2 0.216942 0.013080 1.593612e-08 0.028252 0.158670 0.984731
3 0.021954 0.007244 1.265326e-08 0.021137 0.141045 0.987848
Wall time: 1min
In [97]:
# Aggregate the per-fold goodness-of-fit tables into mean/std summaries,
# then show the means for train and test.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)

for title, scores in (('Train gof mean, std', train_scores_mean),
                      ('Test gof mean, std', test_scores_mean)):
    print(title)
    display(scores)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.181819 0.057785 2.411773e-07 0.098704 0.616430 0.767816
2 0.113978 0.024484 9.025962e-08 0.060391 0.377111 0.913107
3 0.176134 0.017202 2.569310e-08 0.032221 0.201197 0.975266
4 2.823118 0.019718 1.609282e-08 0.025497 0.159214 0.984506
5 0.129503 0.007976 7.573392e-09 0.017498 0.109162 0.992709
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.187482 0.058409 2.444634e-07 0.099400 0.620538 0.765340
2 0.127075 0.027079 9.472282e-08 0.061938 0.386321 0.909109
3 0.207922 0.020918 3.086014e-08 0.035370 0.220480 0.970386
4 2.952230 0.022073 2.050584e-08 0.028801 0.179591 0.980308
5 0.209441 0.011908 1.291069e-08 0.022910 0.142258 0.987609
In [98]:
# Plot train/test goodness-of-fit (mean +/- std band) against the number of
# Gaussian components, one figure per metric.
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0;
# extract the colour list from 'axes.prop_cycle' instead (same colours).
prop_cycle = mpl.rcParams['axes.prop_cycle'].by_key()['color']
gaussian_number_range = train_scores_mean.index
# Raw strings avoid invalid-escape warnings in the LaTeX labels
# (the rendered text is unchanged).
for column, column_name in zip(['R_square','K_S','Chi_square'],[r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [99]:
# Side-by-side 2-D density panels: KDE estimate (left) vs fitted GMM (right).
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
ax2 = fig.add_subplot(1,2,2) 

for panel_ax, density_Z in ((ax1, kde_Z), (ax2, pdf_Z)):
    plot_2d_prob_density(X, Y, density_Z, ax=panel_ax,
                         xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
    panel_ax.grid(False)

# The panels share a y-scale, so hide the right panel's duplicate axis.
ax2.get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures from earlier sections in one place.
fitting_figures = [fig_hist, fig_kde, fig_em, fig_gmm]
variability_figures = [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
                       fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull]
for fig in fitting_figures + variability_figures:
    display(fig)
In [ ]:
import time

# Save the notebook first so the exported HTML reflects the latest run;
# the short sleep gives the asynchronous save time to complete.
save_notebook()
time.sleep(3)

location_name = get_location_name(file_path)
print(location_name)

# Export this notebook to an HTML report named after the station location.
current_file = 'GMM.ipynb'
output_file = './output_HTML/' + location_name + '.html'
output_HTML(current_file, output_file)