Modeling the Joint Distribution of Wind Speed and Direction using Gaussian Mixture Models

OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?" http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
# Notebook environment setup: inline plotting plus auto-reload of local modules.
%matplotlib inline
%load_ext autoreload
%autoreload 2

# NOTE(review): star imports pull the whole analysis namespace (plt, mpl, np, pd,
# arange, folium, sns, neighbors, plot helpers, ...) into the notebook —
# presumably defined in import_file.py / parallel_helper.py; verify there.
from import_file import *
from helpers.parallel_helper import *
load_libs()

# Tight axes rounded to whole numbers, and visible edges on histogram bars.
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4 
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# Active dataset: offshore NCDC buoy near Boston, with hand-picked KDE bandwidth.
# The commented lines in this cell are alternative stations kept for reference.
file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good 
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwdith = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
In [3]:
# Load the raw data into a DataFrame with the canonical columns
# [date, HrMn, type, dir, speed, wind_type]. Each branch also sets the
# source-specific flags consumed by later cells (integer_data,
# convert_to_knot, knot_unit, cartesian).
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    # NCDC: keep HrMn as a zero-padded string for later datetime parsing.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    # NCDC values are integer-coded; the unit is auto-detected in a later cell.
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    # Reanalysis data (e.g. ECMWF): wind given as (U, V) components.
    # Components are negated — presumably to switch between the "blowing to"
    # and "coming from" conventions; confirm against the data source docs.
    # Timestamps are hours since 1979-01-01.
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    # Flag that 'dir' is already in polar convention, not windrose.
    cartesian = True
In [4]:
# Quick look at the parsed frame (rich display; last expression in the cell).
df
Out[4]:
date HrMn type dir speed wind_type
0 19850312 1400 FM-18 130 13.0 N
1 19850312 1500 FM-18 130 12.0 N
2 19850312 1600 FM-18 130 12.0 N
3 19850312 1700 FM-18 160 7.0 N
4 19850312 1800 FM-18 170 8.0 N
5 19850312 1900 FM-18 190 7.0 N
6 19850312 2000 FM-18 210 6.0 N
7 19850312 2100 FM-18 190 4.0 N
8 19850312 2200 FM-18 170 6.0 N
9 19850312 2300 FM-18 270 8.0 N
10 19850313 0000 FM-18 270 7.0 N
11 19850313 0100 FM-18 240 6.0 N
12 19850313 0200 FM-18 230 6.0 N
13 19850313 0300 FM-18 220 4.0 N
14 19850313 0400 FM-18 230 6.0 N
15 19850313 0500 FM-18 240 6.0 N
16 19850313 0600 FM-18 250 6.0 N
17 19850313 0700 FM-18 240 8.0 N
18 19850313 0800 FM-18 260 10.0 N
19 19850313 0900 FM-18 240 10.0 N
20 19850313 1000 FM-18 260 10.0 N
21 19850313 1100 FM-18 250 10.0 N
22 19850313 1200 FM-18 260 11.0 N
23 19850313 1300 FM-18 260 10.0 N
24 19850313 1400 FM-18 290 11.0 N
25 19850313 1500 FM-18 290 12.0 N
26 19850313 1600 FM-18 310 13.0 N
27 19850313 1800 FM-18 300 11.0 N
28 19850313 1900 FM-18 290 11.0 N
29 19850313 2000 FM-18 290 12.0 N
... ... ... ... ... ... ...
235836 20170331 1800 FM-13 110 2.0 N
235837 20170331 1900 FM-13 999 999.9 C
235838 20170331 2000 FM-13 80 4.0 N
235839 20170331 2100 FM-13 90 4.0 N
235840 20170331 2200 FM-13 70 6.0 N
235841 20170331 2300 FM-13 80 6.0 N
235842 20170401 0000 FM-13 90 8.0 N
235843 20170401 0100 FM-13 80 8.0 N
235844 20170401 0200 FM-13 80 10.0 N
235845 20170401 0300 FM-13 90 11.0 N
235846 20170401 0400 FM-13 80 11.0 N
235847 20170401 0500 FM-13 80 11.0 N
235848 20170401 0600 FM-13 80 11.0 N
235849 20170401 0700 FM-13 80 12.0 N
235850 20170401 0800 FM-13 80 12.0 N
235851 20170401 0900 FM-13 70 11.0 N
235852 20170401 1000 FM-13 70 12.0 N
235853 20170401 1100 FM-13 70 12.0 N
235854 20170401 1200 FM-13 60 13.0 N
235855 20170401 1300 FM-13 50 12.0 N
235856 20170401 1400 FM-13 50 12.0 N
235857 20170401 1500 FM-13 40 13.0 N
235858 20170401 1600 FM-13 40 14.0 N
235859 20170401 1700 FM-13 30 14.0 N
235860 20170401 1800 FM-13 30 14.0 N
235861 20170401 1900 FM-13 30 14.0 N
235862 20170401 2000 FM-13 30 14.0 N
235863 20170401 2100 FM-13 30 15.0 N
235864 20170401 2200 FM-13 30 14.0 N
235865 20170401 2300 FM-13 10 14.0 N

235866 rows × 6 columns

In [5]:
# For NCDC stations, look up the coordinates and show the station on a map.
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
42.35 -70.69
In [6]:
# Build a DatetimeIndex from date + HrMn, then drop invalid rows and restrict
# to 1970-2016. Note: 999 is the missing-direction sentinel, so "dir <= 999"
# deliberately keeps it; speed 999.9 (missing) is removed by "speed < 100".
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
# Marginal distributions of speed and direction (project helper).
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360] => [0,350]: fold 360 onto 0, leaving the 999 sentinel untouched.
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert windrose coordinates to polar coordinates
# (windrose: 0 = north, clockwise; polar: 0 = east, counter-clockwise).
if 'cartesian' in globals():
    # Reanalysis data: 'dir' is already polar; derive the windrose column.
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 2.334610e+05 233461.000000 233461.000000 233461.000000 233461.000000
mean 2.000936e+07 1147.748875 196.144380 6.089394 207.081881
std 9.282218e+04 692.306640 136.124931 3.331242 139.864066
min 1.985031e+07 0.000000 0.000000 0.000000 0.000000
25% 1.993041e+07 500.000000 120.000000 4.000000 120.000000
50% 2.000113e+07 1100.000000 190.000000 6.000000 210.000000
75% 2.009073e+07 1700.000000 260.000000 8.000000 280.000000
max 2.016123e+07 2300.000000 999.000000 26.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x4ff23c8>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
# Unit detection: knots converted to m/s leave many non-integer values, so a
# large share of fractional parts >= 0.2 suggests the file is in m/s and
# should be converted to knots (1 m/s = 1.943845 knot).
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    if 'knot_unit' not in globals():
        knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
False
In [10]:
# Axis-label unit suffixes used by the later plots.
dir_unit_text = ' (degree)'
speed_unit_text = ' (knot)' if knot_unit else ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
# Pick the dominant report type from the recent data (after year 2000) and
# keep only records of that type, so mixed report formats do not skew the fit.
sample_type = df.query('date > 20000000')['type']
type_counts = sample_type.value_counts()
type_counts.plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

# idxmax() returns the label of the most frequent type; Series.argmax's
# label-returning behavior is deprecated, so use the explicit spelling.
report_type_most_used = type_counts.idxmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
# Compare sampling-time coverage before/after the record's midpoint year to
# spot changes in the observation schedule.
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
# Keep only the dominant minute-of-hour sampling slot (e.g. :00) so records
# are evenly spaced; a slot qualifies with > 2000 records since 2000, and the
# first (most frequent) qualifying slot is used.
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xd66f5f8>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
# Artefact check: integer datasets should report directions in multiples of 10
# degrees; display and drop any that are not (the 999 sentinel is kept).
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time

sudden increase in speed

In [15]:
# Sudden-increase artefacts: speed change relative to both neighbours.
# 'incre' is the jump from the previous record, 'incre_reverse' from the next;
# boundary NaNs are treated as no change.
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-11 18:00:00 19921211 1800 FM-18 10 22.0 N 80 2.0 2.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-13 02:00:00 19921213 200 FM-18 40 22.0 N 50 1.0 3.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xd5894a8>
In [16]:
# A record is a spike if it exceeds the threshold jump relative to BOTH
# neighbours; the kept set is the De Morgan complement of that condition.
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 1
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-03-13 23:00:00 19930313 2300 FM-18 20 26.0 N 70 3.0 5.0
1985-09-27 20:00:00 19850927 2000 FM-18 280 24.0 N 170 6.0 2.0
1993-03-13 22:00:00 19930313 2200 FM-18 20 23.0 N 70 2.0 -3.0
1992-12-12 14:00:00 19921212 1400 FM-18 50 23.0 N 40 2.0 1.0
1986-12-19 11:00:00 19861219 1100 FM-18 70 22.0 N 20 2.0 1.0
2003-12-07 03:00:00 20031207 300 FM-18 60 22.0 N 30 0.0 2.0
1992-12-12 16:00:00 19921212 1600 FM-18 50 22.0 N 40 0.0 3.0
1985-09-27 21:00:00 19850927 2100 FM-18 270 22.0 N 180 -2.0 2.0
1992-12-12 15:00:00 19921212 1500 FM-18 40 22.0 N 50 -1.0 0.0
2003-12-07 02:00:00 20031207 200 FM-18 60 22.0 N 30 2.0 0.0

1.4.2 Direction re-aligment

For some datasets, the 16 sectors are not recorded properly,

e.g. the sectors are [0, 20, 50, ...]; the angles need to be redistributed onto the regular 22.5-degree grid, e.g. [0, 22.5, 45, ...]

In [17]:
# Distribution of recorded directions; 999 is the missing-direction sentinel.
display(df['dir'].value_counts().sort_index())

# Directions actually in use: > 30 records, excluding the 999 sentinel.
# The original built the >30 mask from the UNfiltered value_counts (which
# contains the 999 label) and applied it to the 999-excluded series — an
# unalignable boolean mask; build the mask from the same series instead.
valid_dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = valid_dir_counts[valid_dir_counts > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0      3268
10     2973
20     3265
30     3460
40     3945
50     4485
60     4641
70     4754
80     4396
90     4144
100    3936
110    4522
120    5801
130    6521
140    7334
150    7647
160    7957
170    8295
180    8399
190    6724
200    6296
210    6639
220    6665
230    7142
240    7168
250    7362
260    6459
270    5605
280    4961
290    5359
300    6069
310    5671
320    4648
330    4157
340    3575
350    3125
999    3636
Name: dir, dtype: int64
36 10.0
In [18]:
# Redistribute irregularly recorded sectors onto a regular grid (project helper).
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
# Decide whether calm (0-speed) records are over-represented (project helper,
# judged on data since 2005); if so, drop them before fitting.
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0104086353123
In [20]:
# Speeds of the dir == 999 records (per the printed output, all are calms),
# then assign them directions via the project helper.
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0.0    3636
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
# Direction histogram bin edges: 'even' centres bins on the recorded sectors
# (-5..355), 'round_up' aligns edges to 0..360.
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
# Same first-half / second-half comparison for direction.
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
# Show, then drop, any rows whose direction became NaN in earlier processing.
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
# (project helper; renders one panel per multi-year period).
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1986 - 1990
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1991 - 1995
1996 - 2000
2001 - 2004
2006 - 2010
2011 - 2013
In [25]:
# Annual and monthly mean speed, anchored at 0 so trends are comparable.
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 10.0)
In [26]:
%%time
# Per-year speed/dir histograms overlaid on the all-years histogram, to spot
# years whose distribution deviates from the pooled one.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2  # shared y-limit so the panels are comparable
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:  # skip years with too few records
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 16.8 s
In [27]:
# Stationarity check: R-squared between each 3-year window's histogram and the
# all-years histogram, per column; values near 1 indicate a stable distribution.
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        # 3-year centred window [year-1, year+1].
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            # R^2 of the window's density against the pooled density.
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    # Clamp the lower y-limit into [0, 0.85] so small fluctuations stay visible.
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> 45 ~ 55, to make the KDE result better

In [28]:
# Spread integer-coded directions uniformly within their sector so the KDE
# does not see artificial spikes (project helper).
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
# Likewise spread integer-coded speeds within their 1-unit bin; redistribute
# downward when calms were deleted, upward otherwise (e.g. 0 -> [0,1]).
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# Project (speed, dir) onto Cartesian wind components.
# Notes: many zero speeds produce a dense dot at the origin, and matplotlib
# scatter plots of this many points render slowly.
angle_rad = df['dir'] * pi / 180.0
df['x'] = df['speed'] * cos(angle_rad)
df['y'] = df['speed'] * sin(angle_rad)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

# Keep the full record for later across-year comparison, then restrict the
# working frame to 2011-2015.
df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-18
Sampling time used: [0]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 1.433600e+04 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000
mean 2.011598e+07 1145.619420 186.837329 6.386756 203.242397 -1.347841 -0.019738
std 6.716762e+03 692.151673 92.232399 3.292966 123.793493 4.939837 5.041621
min 2.011010e+07 0.000000 -4.928937 0.012049 0.000000 -18.387649 -18.251777
25% 2.011053e+07 500.000000 121.160226 3.963283 130.000000 -4.628845 -3.799711
50% 2.011103e+07 1100.000000 191.464250 5.739531 210.000000 -1.276468 -0.645959
75% 2.012100e+07 1700.000000 263.105775 8.445066 280.000000 2.047118 3.484122
max 2.013043e+07 2300.000000 354.984408 21.850382 999.000000 21.712824 20.818823
In [33]:
# Speed time series of the selected 2011-2015 subset.
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1af366d8>
In [34]:
# Accumulation by month: record counts per month, to spot coverage gaps.
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0xd62e208>
In [35]:
# Overall wind rose (windrose convention: 90 degree is east).
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
# For very large selections, downsample to 500k rows to keep later steps fast;
# the overlaid histograms check that resampling preserves the direction shape.
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
# Weibull fit of the speed marginal (project helper), compared with the data:
# (1) histogram vs fitted PDF, (2) Weibull plot ln(-ln(1-P)) vs ln(V), which
# is linear for a true Weibull sample. The divide-by-zero warnings in the
# output come from log(0) at the first grid point.
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
# Scatter of the Cartesian components; equal aspect keeps directions undistorted.
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
# Bin width for the per-direction views: 22.5 degrees for 16-sector data,
# 10 degrees otherwise.
rebinned_angle = 22.5 if len(effective_column) == 16 else 10
In [40]:
%%time
# One speed histogram per direction sector, all sharing the same axis ranges
# so the sector shapes are directly comparable.
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    # Sector is centred on `angle`, spanning +/- incre/2.
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 7.2 s

2.3 Overview by Month

In [41]:
%%time
# Monthly wind roses (speeds >= 1 only) to inspect seasonality.
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 19.3 s
In [42]:
# Final sanity check of the selected frame before fitting.
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 1.433600e+04 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000 14336.000000
mean 2.011598e+07 1145.619420 186.837329 6.386756 203.242397 -1.347841 -0.019738
std 6.716762e+03 692.151673 92.232399 3.292966 123.793493 4.939837 5.041621
min 2.011010e+07 0.000000 -4.928937 0.012049 0.000000 -18.387649 -18.251777
25% 2.011053e+07 500.000000 121.160226 3.963283 130.000000 -4.628845 -3.799711
50% 2.011103e+07 1100.000000 191.464250 5.739531 210.000000 -1.276468 -0.645959
75% 2.012100e+07 1700.000000 263.105775 8.445066 280.000000 2.047118 3.484122
max 2.013043e+07 2300.000000 354.984408 21.850382 999.000000 21.712824 20.818823

3. Create input data and configuration

In [43]:
# Model inputs and configuration for the GMM/KDE fits.
SPEED_SET = array(list(zip(df.x, df.y)))  # (N, 2) array of (x, y) samples
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
# NOTE(review): name is misspelled ("BANDWDITH") but referenced by later
# cells, so it is kept unchanged for compatibility.
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
# Square grid of integer (x, y) evaluation points covering the central 95%
# of observed speeds.
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

# Cartesian product of the axis with itself, row-major like the original loop.
FITTING_RANGE = [[i, j] for i in fitting_axis_range for j in fitting_axis_range]
[-13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4
   5   6   7   8   9  10  11  12  13]
In [45]:
# Plot axis covers the central 95% of observed speeds.
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
# KDE configuration: Gaussian kernel on the (x, y) samples.
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
# Choose the KDE bandwidth: keep a preset per-station value when the
# data-selection cell defined one, otherwise cross-validate over a grid.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    # NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20;
    # modern environments must use sklearn.model_selection.GridSearchCV.
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    # (candidate range depends on the speed unit)
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1.1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    # so cap the search sample at 50,000 rows drawn with replacement
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    # 4-fold CV over the candidate bandwidths, parallelised across all cores
    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.9
Wall time: 0 ns
In [48]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.9 729
[  1.95719167e-10   4.81077136e-09   4.10618731e-08   1.60513691e-07
   1.06274942e-06]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()

4.1 Bootstrap GOF limit

In [50]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [51]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(20)) 
Wall time: 8.75 s
In [52]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()

4.2 Bivariate Empirical Limit

In [53]:
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))

for year_length in [5, 10]:
    start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):'2015']
    speed_ = array(list(zip(df_standard.x, df_standard.y)))
    kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
    kde_result_standard = exp(kde_current.score_samples(points))
    gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config) 
                                   for start_year in arange(start_year, end_year+1)) 
    gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))

    gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
    gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set_bivar.append(gofs_mean)
    
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 0.068082 0.040320 1.353014e-07 0.041649 0.272139 0.954268
(1986, 1996) 0.039296 0.034205 8.345238e-08 0.039417 0.208740 0.966743
Wall time: 39.7 s

4.3 Univariate GOF Limit

In [54]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
    """Goodness-of-fit of one historical window against the reference distributions.

    Parameters
    ----------
    df_all_years : frame with a datetime index and 'speed' / 'dir' columns
    start_year, end_year : inclusive year bounds of the window to score
    density : reference speed histogram (density=True) over bins ``x``
    y_ecdf : reference speed ECDF evaluated at ``x``
    x : speed bin edges / evaluation grid
    density_dir : reference direction histogram over 10-degree sectors

    Returns
    -------
    dict with the window's start year, speed R^2, speed K-S statistic and
    direction R^2.
    """
    window = df_all_years[str(start_year):str(end_year)]
    speeds = window['speed']

    # 1. Speed: PDF agreement (R^2) against the reference histogram
    hist_window, _ = np.histogram(speeds, bins=x, density=True)
    r_square = sector_r_square(density, hist_window)

    # Speed: CDF agreement (Kolmogorov-Smirnov statistic)
    ecdf_window = sm.distributions.ECDF(speeds)(x)
    k_s = np.abs(y_ecdf - ecdf_window).max()

    # 2. Direction: PDF agreement over 10-degree sectors
    hist_dir_window, _ = dir_hist(window['dir'], bins=arange(-5, 370, 10), density=True)
    r_square_dir = sector_r_square(density_dir, hist_dir_window)

    return {'year': start_year, 'r_square': r_square,
            'k_s': k_s, 'r_square_dir': r_square_dir}
In [55]:
%%time
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []

for year_length in [5, 7, 10]:
    start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
    df_standard = df_all_years[str(2015-year_length+1):str(2015)]
    density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
    density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir) 
            for start_year in arange(start_year, end_year+1)]

    gofs = pd.DataFrame(gofs)
    gofs.set_index(['year'], inplace=True)    
    if len(gofs)>0:
        ax1.plot(gofs.r_square, label=year_length)
        ax2.plot(gofs.k_s, label=year_length)
        ax3.plot(gofs.r_square_dir, label=year_length)
    year_lim = end_year-year_length-10, end_year-year_length
    gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
    gofs_mean['year_lim']=year_lim
    gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
    plt_configure(ax=ax, tight='xtight')
    
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.029320 0.991129 0.646934
(1992, 2002) 0.016814 0.995331 0.755758
(1986, 1996) 0.019757 0.987923 0.795987
Wall time: 1.09 s

5. GMM by Expectation-maximization

In [56]:
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [57]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[57]:
weight mean_x mean_y sig_x sig_y corr
1 0.488 -0.941 -3.423 3.466 2.914 0.079
2 0.275 -5.922 3.699 3.807 3.813 -0.004
3 0.237 3.139 2.671 4.080 5.075 -0.077
In [58]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.487999934433 [[-0.94113898 -3.4233064 ]] [ 2.88405607  3.49094881] -77.7991458548
0.275423082355 [[-5.92249578  3.69926155]] [ 3.80286434  3.8174506 ] -145.984338421
0.236576983212 [[ 3.13905024  2.67132162]] [ 4.04627448  5.10148088] -170.358726403
In [59]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the EM-fitted GMM density.

    Both ``kde`` and ``clf`` (notebook globals) return log-density from
    ``score_samples``, so each is exponentiated before subtracting.
    """
    return exp(kde.score_samples(points)) - exp(clf.score_samples(points))

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [60]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[60]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.960 0.018 0.037 1.184629e-07 0.039 0.255
In [61]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [62]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [63]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[63]:
     fun: -16.577688727268132
     jac: array([  2.22098351e-01,   2.38418579e-07,   4.76837158e-07,
        -7.15255737e-07,   4.76837158e-07,  -1.19209290e-06,
         2.22084045e-01,  -4.76837158e-07,  -2.38418579e-07,
         4.76837158e-07,   2.38418579e-07,   4.76837158e-07,
         2.22091913e-01,   2.38418579e-07,   0.00000000e+00,
        -2.38418579e-07,  -4.76837158e-07,  -7.15255737e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 910
     nit: 45
    njev: 45
  status: 0
 success: True
       x: array([ 0.25261235, -0.03895393, -3.79043303,  2.95167123,  2.17658974,
       -0.19074757,  0.19330258,  3.50739092,  2.01772813,  2.98081238,
        4.01809797, -0.05323661,  0.55408508, -4.0697654 ,  0.84910556,
        4.29550799,  5.84886855, -0.23105052])

6.1 GMM Result

In [64]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[64]:
weight mean_x mean_y sig_x sig_y corr
1 0.554 -4.070 0.849 4.296 5.849 -0.231
2 0.253 -0.039 -3.790 2.952 2.177 -0.191
3 0.193 3.507 2.018 2.981 4.018 -0.053
In [65]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.554085076293 [[-4.0697654   0.84910556]] [ 4.06741832  6.00972216] -161.809359511
0.252612345281 [[-0.03895393 -3.79043303]] [ 2.09525434  3.00995269] -105.82936214
0.193302578427 [[ 3.50739092  2.01772813]] [ 2.9714754   4.02500779] -175.018602291

6.2 Bivariate Goodness-of-fit statistics

In [66]:
gof_df(gmm_pdf_result, kde_result)
Out[66]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.100 6.315406e-08 0.028 0.186
In [67]:
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
Out[67]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
year_lim
(1996, 2006) 0.068082 0.040320 1.353014e-07 0.041649 0.272139 0.954268
(1986, 1996) 0.039296 0.034205 8.345238e-08 0.039417 0.208740 0.966743
In [68]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    """Pointwise difference between the KDE density and the optimized GMM density.

    Re-defines the earlier helper of the same name: here the GMM side comes
    from ``mixed_model_pdf`` (already a plain density), while ``kde`` still
    returns log-density and must be exponentiated.
    """
    return exp(kde.score_samples(points)) - mixed_model_pdf(points)

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

6.3 Univariate Goodness-of-fit

In [69]:
def f(V, theta):
    """Joint (speed, direction) density of the optimized GMM in polar form.

    Evaluates ``mixed_model_pdf`` at the Cartesian point (V cos(theta),
    V sin(theta)) and multiplies by the Jacobian V of the polar transform.
    """
    cartesian_point = [[V * cos(theta), V * sin(theta)]]
    return V * mixed_model_pdf(cartesian_point)

def f_em(V, theta):
    """Same polar density as ``f`` but for the EM-fitted mixture."""
    cartesian_point = [[V * cos(theta), V * sin(theta)]]
    return V * mixed_model_pdf_em(cartesian_point)
In [70]:
%%time
x = arange(0, max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)
Wall time: 2.7 s
In [71]:
%%time
# Calculate Speed Distribution
# 1. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.97575614820879186, 0.9563360944073418, 0.96058276832349465)
Wall time: 10.3 s
In [72]:
%%time
# CDF of the fitted GMM: integrate the polar density f over speed [0, x_val]
# and all directions, for each grid speed
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

# Weibull plot (double-log axes): a true Weibull CDF is a straight line here.
# The x=0 grid point makes log() emit benign divide-by-zero RuntimeWarnings.
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

# Largest CDF discrepancy (K-S style) and the speed at which it occurs
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0274923425721 0.0362219449501
6.0 6.0
Wall time: 9.32 s
In [73]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)

plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
Direction Distribution Comparison
Out[73]:
0.93265285851466229
In [74]:
pd.DataFrame(gofs_mean_set).set_index('year_lim')
Out[74]:
k_s r_square r_square_dir
year_lim
(1996, 2006) 0.029320 0.991129 0.646934
(1992, 2002) 0.016814 0.995331 0.755758
(1986, 1996) 0.019757 0.987923 0.795987

6.4 Sectoral Comparison

In [75]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.887304747595
Wall time: 9.68 s
In [76]:
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [77]:
# Calculate Speed Distribution
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    """Sector-by-sector comparison of observed speeds vs GMM and Weibull fits.

    For every angular sector of width ``incre`` degrees: plot the observed
    speed histogram, the GMM density integrated over the sector and a
    per-sector Weibull fit, then record PDF (R^2) and CDF (max |diff|, K-S
    style) goodness-of-fit statistics.

    Parameters
    ----------
    df : frame with 'speed' and direction columns used by the sector helpers
    original_incre : sector width (degrees) of the raw directional data
    incre : rebinned sector width (degrees) used for the comparison

    Returns
    -------
    list of dict, one per sector, holding the fitted curves and GOF metrics.
    """
    # Offset the first sector centre so rebinned sectors stay aligned with
    # the raw recording sectors
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram, so every sector shares one y-range
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF by integrating f over this angular sector
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        # density=True replaces the deprecated (and later removed) normed=True;
        # identical here because the bins are equal-width
        density, _ = np.histogram(sub_df['speed'], bins=bins, density=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comparison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [78]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 360 weight 0.025111607142857144
GMM Weibull
R square 0.902579793011 0.968356866064
max diff: 0.105773546966 0.157829814936 speed value: 8.05014063662 3.45006027284 y gmm 0.858551324743
 
25.0 (15.0 - 35.0) degree
data size: 503 weight 0.03508649553571429
GMM Weibull
R square 0.942724153412 0.925006461162
max diff: 0.0535962776464 0.0558759380225 speed value: 5.31255678432 6.37506814119 y gmm 0.47904359375
 
45.0 (35.0 - 55.0) degree
data size: 544 weight 0.03794642857142857
GMM Weibull
R square 0.901442899585 0.906703024753
max diff: 0.0667149385202 0.0425808445794 speed value: 9.98370068374 6.98859047862 y gmm 0.912303173814
 
65.0 (55.0 - 75.0) degree
data size: 632 weight 0.04408482142857143
GMM Weibull
R square 0.789346460115 0.867882807402
max diff: 0.124642140026 0.0506535875886 speed value: 8.82110242745 5.51318901716 y gmm 0.805021886862
 
85.0 (75.0 - 95.0) degree
data size: 519 weight 0.03620256696428571
GMM Weibull
R square 0.897151335594 0.875340154892
max diff: 0.0608205168614 0.040770753395 speed value: 9.93517483757 7.727358207 y gmm 0.848874466765
 
105.0 (95.0 - 115.0) degree
data size: 604 weight 0.04213169642857143
GMM Weibull
R square 0.86334450936 0.974743635334
max diff: 0.118124369522 0.0355693531617 speed value: 8.22002748165 8.22002748165 y gmm 0.583862385445
 
125.0 (115.0 - 135.0) degree
data size: 995 weight 0.06940569196428571
GMM Weibull
R square 0.851113804797 0.912438187433
max diff: 0.105401641324 0.0904046603706 speed value: 10.4421821338 5.22109106689 y gmm 0.669472730536
 
145.0 (135.0 - 155.0) degree
data size: 1055 weight 0.07359095982142858
GMM Weibull
R square 0.866460832279 0.882646012079
max diff: 0.0430832314305 0.193317739741 speed value: 9.28590979785 8.25414204253 y gmm 0.60990787598
 
165.0 (155.0 - 175.0) degree
data size: 1083 weight 0.07554408482142858
GMM Weibull
R square 0.798474715408 0.815873304945
max diff: 0.0814506436858 0.12614599958 speed value: 9.13460327627 5.07477959793 y gmm 0.687175482098
 
185.0 (175.0 - 195.0) degree
data size: 908 weight 0.06333705357142858
GMM Weibull
R square 0.766940407381 0.867054191264
max diff: 0.121361467917 0.0667080031426 speed value: 5.62862115124 5.62862115124 y gmm 0.403968928559
 
205.0 (195.0 - 215.0) degree
data size: 925 weight 0.06452287946428571
GMM Weibull
R square 0.926849014682 0.938131118339
max diff: 0.070810307174 0.038958968976 speed value: 6.72434885241 5.88380524586 y gmm 0.601622125258
 
225.0 (215.0 - 235.0) degree
data size: 991 weight 0.06912667410714286
GMM Weibull
R square 0.946571883639 0.912377325191
max diff: 0.0285677437937 0.0656107240297 speed value: 11.0293047993 5.88229589298 y gmm 0.931068986781
 
245.0 (235.0 - 255.0) degree
data size: 1114 weight 0.07770647321428571
GMM Weibull
R square 0.97113132378 0.963828471043
max diff: 0.0370613446688 0.0496583861191 speed value: 6.76963393816 5.80254337557 y gmm 0.677097251312
 
265.0 (255.0 - 275.0) degree
data size: 1023 weight 0.07135881696428571
GMM Weibull
R square 0.951559353282 0.960042910043
max diff: 0.100780719048 0.0445463257222 speed value: 5.85442274506 5.85442274506 y gmm 0.581526221323
 
285.0 (275.0 - 295.0) degree
data size: 1083 weight 0.07554408482142858
GMM Weibull
R square 0.94166409694 0.936097450297
max diff: 0.0543214880892 0.0367859923407 speed value: 7.31891660078 5.48918745059 y gmm 0.782243608864
 
305.0 (295.0 - 315.0) degree
data size: 821 weight 0.05726841517857143
GMM Weibull
R square 0.911582859067 0.92247603126
max diff: 0.0798274627916 0.0486281569292 speed value: 4.9815256977 2.98891541862 y gmm 0.454886300911
 
325.0 (315.0 - 335.0) degree
data size: 563 weight 0.039271763392857144
GMM Weibull
R square 0.893260689285 0.922879635547
max diff: 0.0607137503304 0.0786369966152 speed value: 5.15843277151 2.0633731086 y gmm 0.520103301179
 
345.0 (335.0 - 355.0) degree
data size: 490 weight 0.0341796875
GMM Weibull
R square 0.830847051303 0.903364657627
max diff: 0.099423295004 0.141255130704 speed value: 7.77492670939 2.59164223646 y gmm 0.858606968473
 
Wall time: 42.7 s
In [79]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.8883756475065896 0.9125285046516485
In [80]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.07579081050011181 0.07521517772982446
In [81]:
# Compare direction weight with previous figure
display(dir_fig)

6.5 Insufficient-fit Sector Investigation

(1) Data Variability, by Bootstrap (Resampling)

In [82]:
# Sector with the worst GMM CDF fit, for the follow-up investigation below.
# .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and removed
# in pandas 1.0 (same behavior here: row by label from idxmax, column by name).
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax(), 'direction']
incre = rebinned_angle
In [83]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [84]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled: bootstrap the sector's speeds (sampling with
# replacement) to visualise sampling variability around the empirical
# distribution
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)
    # density=True replaces the deprecated (and later removed) normed=True;
    # identical here because the bins are equal-width
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, density=True)
    count_collection.append(resampled_count)

    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x)
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1:
        # Configure the axes only once to avoid 99 duplicate legend entries
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
65.0 (55.0 - 75.0) Degree Speed Distribution
0.121064810361 8.5 0.777710379981

(2) Time Variability

In [85]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        # density=True replaces the deprecated (and later removed) normed=True;
        # identical here because the bins are equal-width
        count, division = np.histogram(df_other_years_at_angle['speed'], density=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
65.0 (55.0 - 75.0) Degree Speed Distribution

(3) Adjacent Sector Variability

In [86]:
# Sector width of the re-binned direction histogram; bracket the worst-fitting
# sector (max_diff_angle) with its two immediate neighbours.
incre = rebinned_angle
angle_group = [max_diff_angle + offset for offset in (-incre, 0, incre)]
In [87]:
# Compare the GMM and Weibull fits of the worst sector against its two
# neighbouring sectors: 3D histogram (ax1), scaled PDFs (ax2/ax3).
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
# Fix: 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0
# (see the UserWarning in the captured output); build the same iterator of
# colors from 'axes.prop_cycle' instead.
prop_cycle = iter([c['color'] for c in mpl.rcParams['axes.prop_cycle']])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    # Fitted curves for this sector; stored directions are modulo 360.
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    # Solid line for the sector of interest, dotted for its neighbours.
    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    # Scale the densities by the sector sample size so the panels show counts.
    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    # Attach the 3D legend only once; the loop replots the same labels.
    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
# Shared y-limit so the GMM (ax2) and Weibull (ax3) panels are comparable.
# NOTE(review): this mixes ax1 (3D axes) with ax3 — confirm ax2 was not intended.
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
65.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [88]:
# Fall back to module-level defaults for any tuning knob the station-specific
# setup cell (section 1.2) did not define.
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
# Bundle the fitting configuration passed to the (re)fitting helpers below.
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

# Echo the effective settings so the rendered notebook records them.
print(bandwidth, FIT_METHOD)
0.9 square_error

7.1 Variability of the Result

In [89]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.585 -3.789 0.731 4.285 5.788 -0.256
2 0.224 -0.038 -3.756 2.881 2.131 -0.168
3 0.191 3.595 2.000 2.942 4.245 -0.052
GMM Plot Result
0.585275056185 [[-3.78949846  0.73107592]] [ 4.00711994  5.98336208] -160.032475744
0.223577885506 [[-0.03806631 -3.75617764]] [ 2.06750037  2.9268044 ] -104.393756209
0.191147058308 [[ 3.59499622  1.99978871]] [ 2.93453648  4.24989194] -176.014570636
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.012 0.075 6.515037e-08 0.030 0.189
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.010 0.089 6.669950e-08 0.029 0.191

weight mean_x mean_y sig_x sig_y corr
1 0.537 -4.156 0.753 4.267 5.769 -0.247
2 0.268 0.143 -3.806 3.060 2.174 -0.051
3 0.195 3.445 2.513 3.352 3.819 -0.125
GMM Plot Result
0.537041072458 [[-4.15564859  0.75275112]] [ 4.00734022  5.95189801] -160.555151707
0.267701276031 [[ 0.14321608 -3.80570361]] [ 2.16812771  3.06377668] -94.1761757919
0.195257651511 [[ 3.44475921  2.51284591]] [ 3.25463723  3.90236337] -158.133239245
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.010 0.055 6.105985e-08 0.029 0.183
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.011 0.077 6.706755e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.564 -4.139 0.880 4.296 5.819 -0.210
2 0.236 0.034 -3.915 2.750 2.174 -0.218
3 0.200 3.622 1.687 2.900 4.194 0.003
GMM Plot Result
0.564128500156 [[-4.13903841  0.88032532]] [ 4.10288116  5.95691834] -162.868244903
0.235983591339 [[ 0.03414763 -3.91488862]] [ 2.05411414  2.84029568] -111.282581331
0.199887908505 [[ 3.6224492   1.68660327]] [ 2.89984854  4.1938875 ] 179.778996038
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.975 0.012 0.116 7.381730e-08 0.030 0.201
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.089 6.745737e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.594 -3.736 0.764 4.462 5.869 -0.213
2 0.231 -0.010 -3.682 2.859 2.132 -0.216
3 0.175 3.670 1.848 2.728 4.156 0.007
GMM Plot Result
0.593690234736 [[-3.73636951  0.76437452]] [ 4.24515525  6.02801585] -161.252999148
0.231194610939 [[-0.01007175 -3.68155491]] [ 2.0290204   2.93234142] -107.985229282
0.175115154324 [[ 3.67041823  1.84767326]] [ 2.72746032  4.15624971] 179.541700979
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.147 6.225226e-08 0.028 0.185
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.015 0.116 6.642564e-08 0.029 0.191

weight mean_x mean_y sig_x sig_y corr
1 0.510 -4.429 0.865 4.169 5.885 -0.292
2 0.285 0.037 -3.726 3.136 2.325 -0.207
3 0.206 3.390 2.281 3.271 3.833 -0.193
GMM Plot Result
0.509575982233 [[-4.42927022  0.86546725]] [ 3.84678959  6.10050811] -160.143801801
0.284545725378 [[ 0.03683799 -3.72553083]] [ 2.22214902  3.20916734] -107.136498432
0.205878292389 [[ 3.39045887  2.28097379]] [ 3.09135204  3.97924038] -154.772097204
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.225 6.737818e-08 0.031 0.192
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.013 0.140 6.700925e-08 0.029 0.192

weight mean_x mean_y sig_x sig_y corr
1 0.549 -4.046 0.865 4.313 5.833 -0.247
2 0.252 -0.144 -3.817 2.982 2.162 -0.183
3 0.200 3.411 1.682 3.101 4.025 0.004
GMM Plot Result
0.54853149273 [[-4.04646199  0.86507174]] [ 4.05039036  6.01803666] -160.561596261
0.251739494753 [[-0.14445275 -3.81673916]] [ 2.08947138  3.03347593] -104.587030638
0.199729012517 [[ 3.41146644  1.68183065]] [ 3.10119173  4.02491008] 179.593085398
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.011 0.074 6.938954e-08 0.030 0.195
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.016 0.081 6.491990e-08 0.029 0.189

weight mean_x mean_y sig_x sig_y corr
1 0.693 -2.155 1.319 5.734 5.382 -0.123
2 0.236 -0.934 -3.736 2.961 2.278 -0.291
3 0.071 3.005 -1.284 1.869 3.717 0.606
GMM Plot Result
0.69273027989 [[-2.15545172  1.31873141]] [ 5.16193858  5.933189  ] -121.391656718
0.236194363057 [[-0.93380271 -3.73578383]] [ 2.07953072  3.10389244] -113.795960664
0.0710753570531 [[ 3.00461387 -1.28411397]] [ 1.41240582  3.91292851] 160.400701903
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.981 0.021 0.041 5.755669e-08 0.028 0.178
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.023 0.040 6.009261e-08 0.028 0.182

weight mean_x mean_y sig_x sig_y corr
1 0.529 -4.146 1.023 4.360 5.919 -0.232
2 0.261 -0.105 -3.827 2.979 2.185 -0.162
3 0.210 3.201 1.909 3.191 4.184 -0.001
GMM Plot Result
0.528980907687 [[-4.14608472  1.0228419 ]] [ 4.12533484  6.0854021 ] -161.608901653
0.261006871925 [[-0.1053296 -3.8267578]] [ 2.12580024  3.02174843] -103.566113578
0.210012220388 [[ 3.20081989  1.90876707]] [ 3.19146515  4.18374125] -179.891879379
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.012 0.075 6.918625e-08 0.029 0.195
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.018 0.067 6.566461e-08 0.029 0.190

weight mean_x mean_y sig_x sig_y corr
1 0.717 -2.330 0.973 5.476 5.518 -0.114
2 0.205 -0.621 -3.741 2.996 2.104 -0.323
3 0.078 2.973 -0.961 2.107 3.837 0.686
GMM Plot Result
0.717188638885 [[-2.33035051  0.97297992]] [ 5.17475204  5.80108618] -136.900462083
0.205099455423 [[-0.62125816 -3.74117042]] [ 1.9099792   3.12315781] -110.905136854
0.0777119056918 [[ 2.97342377 -0.96083056]] [ 1.4206983  4.1400584] 156.412513007
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.017 0.048 5.933648e-08 0.027 0.180
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.018 0.045 6.206471e-08 0.028 0.185

weight mean_x mean_y sig_x sig_y corr
1 0.537 -4.147 0.748 4.160 5.822 -0.252
2 0.258 0.013 -3.783 2.971 2.188 -0.123
3 0.206 3.281 2.175 3.145 3.892 -0.064
GMM Plot Result
0.536671216091 [[-4.14658654  0.74825117]] [ 3.91199853  5.99173776] -161.82468658
0.257784305924 [[ 0.01323741 -3.78334486]] [ 2.15325063  2.99693152] -100.785203838
0.205544477985 [[ 3.2812947   2.17518594]] [ 3.12705425  3.90638673] -171.743118506
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.011 0.094 6.360415e-08 0.029 0.187
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.014 0.096 6.521023e-08 0.029 0.189

weight mean_x mean_y sig_x sig_y corr
1 0.665 -2.379 1.748 5.902 5.238 -0.151
2 0.267 -0.958 -3.779 3.040 2.340 -0.222
3 0.068 2.811 -1.506 2.071 3.507 0.679
GMM Plot Result
0.665064564052 [[-2.37858918  1.74813315]] [ 5.01836667  6.09032229] -115.807612913
0.266739923444 [[-0.958354   -3.77948628]] [ 2.21393645  3.13331092] -110.009623011
0.0681955125041 [[ 2.81064244 -1.50642918]] [ 1.39388601  3.82729061] 154.54323977
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.982 0.021 0.044 5.378699e-08 0.026 0.172
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.019 0.043 6.097353e-08 0.028 0.183

weight mean_x mean_y sig_x sig_y corr
1 0.479 -4.756 0.671 4.034 6.097 -0.318
2 0.266 0.015 -3.822 2.959 2.150 -0.158
3 0.255 2.756 2.278 3.634 4.102 -0.124
GMM Plot Result
0.478718876701 [[-4.75609935  0.67132736]] [ 3.69792425  6.3061423 ] -161.605567866
0.266227423263 [[ 0.01548589 -3.82233502]] [ 2.09528367  2.99739501] -102.962273704
0.255053700036 [[ 2.75637605  2.27830848]] [ 3.52501671  4.19573633] -157.179318622
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.977 0.013 0.064 6.615672e-08 0.029 0.191
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.013 0.062 6.652270e-08 0.029 0.191
Wall time: 15.7 s

7.2 Cross-validation, to select the number of Gaussian components

In [90]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
Number of train/test dataset
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
 10752.0 3584.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.143144 0.071273 5.304168e-07 0.081678 0.539624 0.821352
1 0.150828 0.070472 5.163950e-07 0.083938 0.532304 0.824238
2 0.148059 0.072403 5.465168e-07 0.080609 0.547461 0.816369
3 0.153187 0.073965 5.341887e-07 0.082831 0.541250 0.819718
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.181460 0.070359 5.429837e-07 0.081208 0.545256 0.816953
1 0.161861 0.069157 5.907109e-07 0.078389 0.569159 0.806979
2 0.172339 0.076068 5.268045e-07 0.089801 0.537927 0.820641
3 0.148941 0.072530 5.656833e-07 0.082002 0.557428 0.810150
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.045866 0.023568 9.805185e-08 0.034522 0.232051 0.967295
1 0.061912 0.024010 7.988715e-08 0.032433 0.209314 0.972968
2 0.057215 0.023471 9.113579e-08 0.034256 0.223701 0.969081
3 0.058146 0.026487 9.475013e-08 0.034993 0.227826 0.967830
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067306 0.023844 1.131488e-07 0.040024 0.248778 0.960772
1 0.099382 0.027758 1.565775e-07 0.043016 0.293249 0.947888
2 0.059787 0.033193 1.169749e-07 0.038151 0.253005 0.961332
3 0.060008 0.026278 1.040571e-07 0.035652 0.239469 0.965638
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.101845 0.010159 7.127204e-08 0.029594 0.197806 0.975849
1 0.114976 0.010174 6.482695e-08 0.029226 0.188567 0.978182
2 0.044533 0.018961 6.465406e-08 0.028927 0.188353 0.977967
3 0.083290 0.010472 5.473927e-08 0.026698 0.173245 0.981698
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.071203 0.022975 8.288001e-08 0.034201 0.213031 0.972562
1 0.147877 0.024974 1.014193e-07 0.034802 0.235966 0.965716
2 0.059131 0.020182 8.542011e-08 0.032780 0.216425 0.972171
3 0.147308 0.017976 1.387808e-07 0.040854 0.276175 0.952259
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067546 0.010622 3.440547e-08 0.021292 0.137493 0.988306
1 0.024946 0.007180 3.041639e-08 0.019662 0.129098 0.989705
2 0.038613 0.012183 3.370247e-08 0.020672 0.136008 0.988559
3 0.044208 0.010706 3.493869e-08 0.021134 0.138401 0.988376
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.062152 0.025649 7.873375e-08 0.030659 0.207364 0.974175
1 0.051316 0.018237 8.554044e-08 0.031909 0.217041 0.971717
2 0.051083 0.024698 7.238031e-08 0.030904 0.199142 0.976120
3 0.125748 0.023417 7.415992e-08 0.031114 0.201920 0.974034
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.047036 0.008117 2.321246e-08 0.017308 0.112908 0.992199
1 0.047961 0.009534 2.713720e-08 0.018514 0.121960 0.990846
2 0.030713 0.008620 1.966211e-08 0.016036 0.103898 0.993270
3 0.030899 0.019391 2.779159e-08 0.018756 0.123430 0.990683
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.081780 0.011522 5.255007e-08 0.025585 0.169531 0.982088
1 0.050058 0.012345 4.811613e-08 0.024538 0.162701 0.983814
2 0.050326 0.013342 6.208785e-08 0.027126 0.184367 0.979972
3 0.057651 0.019657 6.191830e-08 0.028838 0.184533 0.978820
Wall time: 47.1 s
In [91]:
# Aggregate the per-fold goodness-of-fit tables into mean/std per number of
# Gaussians. The four *_scores_* frames are reused by the plotting cell below.
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.148804 0.072028 5.318793e-07 0.082264 0.540160 0.820419
2 0.055785 0.024384 9.095623e-08 0.034051 0.223223 0.969294
3 0.086161 0.012442 6.387308e-08 0.028611 0.186993 0.978424
4 0.043828 0.010173 3.336575e-08 0.020690 0.135250 0.988737
5 0.039152 0.011416 2.445084e-08 0.017653 0.115549 0.991750
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.166150 0.072028 5.565456e-07 0.082850 0.552443 0.813681
2 0.071621 0.027768 1.226896e-07 0.039211 0.258625 0.958907
3 0.106380 0.021527 1.021250e-07 0.035659 0.235399 0.965677
4 0.072575 0.023000 7.770360e-08 0.031146 0.206367 0.974011
5 0.059954 0.014216 5.616809e-08 0.026522 0.175283 0.981173
In [92]:
# Fix: 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0
# (see the UserWarning in the captured output); take the color list from
# 'axes.prop_cycle' instead.
prop_cycle = [c['color'] for c in mpl.rcParams['axes.prop_cycle']]
gaussian_number_range = train_scores_mean.index
# Learning-curve style plots: train vs. test goodness-of-fit against the
# number of mixture components, with a +/- one-std band around each curve.
# (Raw strings avoid the invalid '\w' escape warning; the values are unchanged.)
for column, column_name in zip(['R_square','K_S','Chi_square'],
                               [r"$\ R^2$", "K-S", r"$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
         '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
         '-', label = 'test', color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                     test_scores_mean[column] - test_scores_std[column],
                     test_scores_mean[column] + test_scores_std[column], 
                     alpha=0.2, color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    # R^2 is bounded above by 1; K-S and chi-square are non-negative.
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [93]:
# Side-by-side comparison on the same grid: KDE density (left) vs. fitted
# GMM density (right).
fig = plt.figure(figsize=(4.2,2.4))
axes = [fig.add_subplot(1, 2, pos) for pos in (1, 2)]
for panel_ax, density_Z in zip(axes, (kde_Z, pdf_Z)):
    plot_2d_prob_density(X, Y, density_Z, ax=panel_ax,
                         xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text,
                         colorbar=False)
    panel_ax.grid(False)
# The right panel shares the y scale with the left one; hide its y axis.
axes[1].get_yaxis().set_visible(False)
In [ ]:
# Re-display the key figures together (fit pipeline first, then the two
# variability studies) for easier visual comparison in the rendered report.
summary_figures = [fig_hist, fig_kde, fig_em, fig_gmm,
                   fig_time_variability_3d, fig_time_variability_cdf,
                   fig_time_variability_weibull,
                   fig_adjecent_variability_3d, fig_adjecent_variability_cdf,
                   fig_adjecent_variability_weibull]
for fig in summary_figures:
    display(fig)
In [ ]:
import time

# Save the notebook, give the write a moment to flush to disk, then export
# the rendered notebook to a per-location HTML report.
save_notebook()
time.sleep(3)

location_name = get_location_name(file_path)
print(location_name)

notebook_path = 'GMM.ipynb'
html_output_path = './output_HTML/' + location_name + '.html'
output_HTML(notebook_path, html_output_path)