OEN Method: Harris & Cook, "The parent wind speed distribution: Why Weibull?", http://www.sciencedirect.com/science/article/pii/S0167610514001056
Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html
%matplotlib inline
%load_ext autoreload
%autoreload 2
from import_file import *
from helpers.parallel_helper import *
load_libs()
plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True
# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth= './data/NCDC/europe/uk/tiree/dat.txt', 1.9
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" #
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path, convert_to_knot= './data/NCDC/europe/france/pau_pyrenees/dat.txt', True # unit shift, 2; force using knot
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt" # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt" # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit
# MidEast
file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt'
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt'
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt'
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable
# file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt"
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt"
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" #
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009 may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem
# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9 # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak
# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type
# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 0.7, 4 #good
# file_path, bandwidth = 'data/ECMWF/venezuela/8N67W/dat.csv', 0.7 # good, but the data might be problematic.
# file_path, bandwidth = 'data/ECMWF/chile/52S75W/dat.csv', 1.9 # good
# file_path, bandwidth= 'data/ECMWF/iceland/65N17W/dat.csv', 1.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/germany/49N9E/dat.csv', 1.1, 4 # miss peak
# file_path, bandwidth = 'data/ECMWF/sudan/18N32E/dat.csv', 1.1 # good
# file_path, bandwidth = 'data/ECMWF/china/24N121E/dat.csv', 0.9 # good
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 0.7, 4 # miss the peak, force bandwidth 0.7
if "cn_database" in file_path:
df = read_cn_database(file_path)
elif 'NCDC' in file_path:
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
df = df[['date','HrMn','type','dir','speed','wind_type' ]]
df.dropna(subset=['dir','speed'], inplace=True)
integer_data = True
elif 'NDAWN' in file_path:
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df['type']='default'
df['wind_type']='default'
df = df.dropna()
convert_to_knot = False
integer_data = False
elif 'asos' in file_path:
# ASOS
df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
df['type']='default'
df['wind_type']='default'
df = df.dropna()
convert_to_knot = False
integer_data = False
knot_unit = True
else:
df = pd.read_csv(file_path, header=0, skipinitialspace=True)
df.rename(columns={'U':'x','V':'y'}, inplace=True)
df.x=-df.x
df.y=-df.y
df['speed']=np.sqrt(df.x**2+df.y**2)
df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
df['date']=df['time'].dt.strftime('%Y%m%d')
df['date']=df['date'].astype(int)
df['HrMn']=df['time'].dt.strftime('%H00')
df['type']='default'
df['wind_type']='default'
convert_to_knot = True
integer_data = False
cartesian = True
df
if 'NCDC' in file_path:
lat, long = get_lat_long(file_path)
print(lat,long)
map_osm = folium.Map(location=[lat, long], zoom_start=4)
folium.Marker([lat, long]).add_to(map_osm)
display(map_osm)
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
plot_speed_and_angle_distribution(df.speed, df.dir)
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x)
# Convert windrose coordinates to polar coordinates
if 'cartesian' in globals():
df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
df['dir_windrose'] = df['dir']
df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))
# If a large share of speeds has a sizeable fractional part, the data was likely
# recorded in knots and later converted to m/s, so convert it back to knots
if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
if convert_to_knot:
knot_unit = True
df['speed'] = df['speed'] * 1.943845
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='knot')
# needs more elaboration: some values are not near an integer
if integer_data:
df['speed'] = df['speed'].apply(lambda x: int(round(x)))
plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
if 'knot_unit' not in globals():
knot_unit = False
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
dir_unit_text = ' (degree)'
if knot_unit == True:
speed_unit_text = ' (knot)'
else:
speed_unit_text = ' (m/s)'
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Type Composition', figsize=(4,3))
report_type_most_used = sample_type.value_counts().idxmax()
df = df.query("type==@report_type_most_used")
MID_YEAR = int(np.average(df.index.year))
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
kind='bar', alpha=0.5, label='>= %s' % MID_YEAR )
plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4),
title = 'Sampling Time Distribution, Overall and > %s ' % MID_YEAR)
df['sample_time'] = df.HrMn % 100
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)
df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
Wrong direction records
if integer_data:
display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
sudden increase in speed
# sudden increase
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)
display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')
# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
For some datasets, the 16 sectors are not recorded properly,
e.g. the sectors are [0, 20, 50, ...]; the angles need to be redistributed onto an even 22.5-degree grid, e.g. [0, 22.5, 45, ...].
display(df['dir'].value_counts().sort_index())
dir_counts = df.query('dir < 999')['dir'].value_counts()
effective_column = dir_counts[dir_counts > 30].sort_index()
if integer_data:
SECTOR_LENGTH = 360/len(effective_column)
else:
SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
df=realign_direction(df, effective_column)
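A hypothetical sketch of the kind of realignment realign_direction (used above) performs: snap each recorded direction onto an evenly spaced grid of 360/len(effective_column) degrees. The actual imported helper may differ in details.
def realign_direction_sketch(df, effective_column):
    # assumed behaviour: round each valid direction to the nearest even sector centre
    sector_length = 360.0 / len(effective_column)
    df = df.copy()
    mask = df['dir'] < 999
    df.loc[mask, 'dir'] = (np.round(df.loc[mask, 'dir'] / sector_length) * sector_length) % 360
    return df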
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
DIR_BIN = arange(-5, 360, 10)
elif DIR_REDISTRIBUTE == 'round_up':
DIR_BIN = arange(0, 360+10, 10)
# Comparison between mid_year, looking for:
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)
df[str(MID_YEAR+1):]['speed'].plot(
kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)
plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
df[:str(MID_YEAR)]['dir'].plot(
kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)
df[str(MID_YEAR+1):]['dir'].plot(
kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)
plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
%%time
for column in ['speed', 'dir']:
if column == 'speed':
bins = arange(0, df[column].max()+1, 1)
else:
bins = arange(0, 361, 10)
den, _ = np.histogram(df[column], bins=bins, density=True)
y_top=max(den)*1.2
for year in arange(1980, 2016):
end_year = year
sub_df = df[str(year):str(end_year)]
if len(sub_df) > 1000:
plt.figure()
df[column].hist(bins=bins, alpha=0.3, normed=True)
sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
plt.gca().set_ylim(top=y_top)
plt_configure(title=str(year))
align_figures()
for column in ['speed', 'dir']:
if column == 'speed':
bins = arange(0, df[column].max()+1, 1)
else:
bins = arange(0, 361, 10)
density_all, _ = np.histogram(df[column], bins=bins, density=True)
df[column].hist(bins=bins, figsize=(5,3))
R_squares = []
years = []
for year in arange(1980, 2016):
start_year, end_year = year-1, year+1
sub_df = df[str(start_year):str(end_year)]
if len(sub_df) > 1000:
density, _ = np.histogram(sub_df[column], bins=bins, density=True)
y_mean = np.mean(density_all)
SS_tot = np.sum(np.power(density_all - y_mean, 2))
SS_res = np.sum(np.power(density_all - density, 2))
R_square = 1 - SS_res / SS_tot
R_squares.append(R_square)
years.append(year)
plt.figure()
plot(years, R_squares)
ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
plt.gca().set_ylim(bottom=ylim, top=1)
plt_configure(figsize=(5,3))
align_figures()
Randomize the integer-recorded directions within their sectors, e.g. Dir 50 -> 45 ~ 55, to make the KDE result better
if integer_data:
df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
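A minimal sketch of this direction randomization, assuming the 'even' scheme spreads each recorded value uniformly over its sector (e.g. 50 -> 45 ~ 55); the imported randomize_angle helper may treat the schemes differently.
def randomize_angle_sketch(df, sector_length):
    df = df.copy()
    # uniform jitter of +/- half a sector so the KDE does not see integer spikes
    jitter = np.random.uniform(-sector_length / 2, sector_length / 2, size=len(df))
    df['dir'] = (df['dir'] + jitter) % 360
    return df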
if integer_data:
if delete_zero:
redistribute_method = 'down'
else:
redistribute_method = 'up'
df, speed_redistribution_info = randomize_speed(df, redistribute_method)
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
# There might be a small dot at the centre, caused by too many zero-speed records
# Scatter plots in matplotlib have performance issues; rendering is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)
## Summary of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
print('Speed redistribution info:', speed_redistribution_info )
df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
df.plot(y='speed',legend=True,figsize=(20,5))
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
if len(df) > 1000000:
bins=arange(0,362)
df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
df = df_all_years.sample(n=500000, replace=True)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resampled')
plt_configure(legend=True, figsize=(20,4))
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)
# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)
plot(x, y_weibull, '-', color='black',label='Weibull')
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)
# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
if len(effective_column) == 16:
rebinned_angle = 22.5
else:
rebinned_angle = 10
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360
max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]
for angle in arange(start, end, incre):
start_angle, end_angle = angle-incre/2, angle+incre/2
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
fig = plt.figure()
sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df))
plt.axis(plot_range)
plt_configure(figsize=(3,1.5), title=title)
align_figures()
%%time
current_df = df.query('speed>=1')
for month in arange(1, 13):
sub_df = current_df[current_df.index.month == month]
ax = WindroseAxes.from_ax()
ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
df.describe()
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)
FITTING_RANGE = []
for i in fitting_axis_range:
for j in fitting_axis_range:
FITTING_RANGE.append([i,j])
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
%%time
if 'bandwidth' not in globals():
bandwidth = DEFAULT_BANDWDITH
from sklearn.grid_search import GridSearchCV
# from sklearn.model_selection import GridSearchCV ## too slow
# The bandwidth selected by grid search can sometimes be too extreme
if knot_unit:
bandwidth_range = arange(0.7,2,0.2)
else:
bandwidth_range = arange(0.4,1.1,0.1)
# Grid search cannot handle too much data (it would take a long time)
if len(sample) > 50000:
df_resample=df.sample(n=50000, replace=True)
bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
else:
bandwidth_search_sample = sample
grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
{'bandwidth': bandwidth_range}, n_jobs=-1, cv=4)
grid.fit(bandwidth_search_sample)
bandwidth = grid.best_params_['bandwidth']
print(bandwidth)
if 'bandwidth' not in globals():
bandwidth = DEFAULT_BANDWDITH
kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)
points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000 points
# KDE returns log probability; convert it back with exp
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Passing generate_Z_from_X_Y(X, Y, exp(kde.score_samples())) does not work; a lambda is needed
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()
plot_3d_prob_density(X,Y,kde_Z)
fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)
with sns.axes_style({'axes.grid' : False}):
from matplotlib import ticker
fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
_,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
ax2.set_aspect('equal')
cb = plt.colorbar(image)
tick_locator = ticker.MaxNLocator(nbins=6)
cb.locator = tick_locator
cb.update_ticks()
plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth,
'fitting_range': FITTING_RANGE,
'fit_limit': fit_limit,
'kde_kernel': KDE_KERNEL}
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config)
for i in arange(20))
for gof_name in [ 'R_square', 'K_S','Chi_square']:
plt.figure(figsize=(4,3))
pd.DataFrame(gof_kde)[gof_name].hist()
plt_configure(title=gof_name)
align_figures()
gofs_bivar
%%time
gofs_mean_set_bivar = []
fig1, ax1 = plt.subplots(figsize=(4,3))
fig2, ax2 = plt.subplots(figsize=(4,3))
for year_length in [5, 10]:
start_year, end_year = df_all_years.index.year[0], 2015-year_length+1
df_standard = df_all_years[str(2015-year_length+1):'2015']
speed_ = array(list(zip(df_standard.x, df_standard.y)))
kde_current = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_)
kde_result_standard = exp(kde_current.score_samples(points))
gofs_bivar=Parallel(n_jobs=-1)(delayed(kde_gofs)(df_all_years, start_year, start_year+year_length-1, kde_result_standard, config)
for start_year in arange(start_year, end_year+1))
gofs_bivar=pd.DataFrame(gofs_bivar, index=arange(start_year, end_year+1))
if len(gofs_bivar)>0:
gofs_bivar.plot(y='R_square', ax=ax1, label=year_length)
gofs_bivar.plot(y='K_S', ax=ax2, label=year_length)
year_lim = end_year-year_length-10, end_year-year_length
gofs_mean = gofs_bivar.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
gofs_mean['year_lim']=year_lim
gofs_mean_set_bivar.append(gofs_mean)
align_figures()
display(pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim'))
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x, density_dir):
df_previous = df_all_years[str(start_year):str(end_year)]
# 1. Speed
density_expected, _ = np.histogram(df_previous['speed'], bins=x, density=True)
r_square = sector_r_square(density, density_expected)
y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
k_s = max(np.abs(y_ecdf - y_ecdf_previous))
# 2. Direction
density_dir_expected, _ = dir_hist(df_previous['dir'], bins=arange(-5,370,10), density=True)
r_square_dir = sector_r_square(density_dir, density_dir_expected)
return {'year': start_year, 'r_square': r_square, 'k_s': k_s, 'r_square_dir': r_square_dir}
%%time
x = arange(0, df.speed.max() + 1)
fig = plt.figure(figsize=(9,2.5))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
gofs_mean_set = []
for year_length in [5, 7, 10]:
start_year, end_year =df_all_years.index.year[0], 2015-year_length+1
df_standard = df_all_years[str(2015-year_length+1):str(2015)]
density, _ = np.histogram(df_standard['speed'], bins=x, density=True)
density_dir, _ = dir_hist(df_standard['dir'], bins=arange(-5,370,10), density=True)
y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)
gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x, density_dir)
for start_year in arange(start_year, end_year+1)]
if len(gofs)>0:
gofs = pd.DataFrame(gofs)
gofs.set_index(['year'], inplace=True)
ax1.plot(gofs.r_square, label=year_length)
ax2.plot(gofs.k_s, label=year_length)
ax3.plot(gofs.r_square_dir, label=year_length)
year_lim = end_year-year_length-10, end_year-year_length
gofs_mean = gofs.query('index >= @year_lim[0] & index <= @year_lim[1]').mean().to_dict()
gofs_mean['year_lim']=year_lim
gofs_mean_set.append(gofs_mean)
plt.tight_layout()
plt.legend()
for ax in [ax1, ax2, ax3]:
plt_configure(ax=ax, tight='xtight')
display(pd.DataFrame(gofs_mean_set).set_index('year_lim'))
sample=SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))
def residule_between_kde_and_gmm(points):
kde_vals = exp(kde.score_samples(points))
gmm_vals = exp(clf.score_samples(points))
return kde_vals - gmm_vals
residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)
plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
# Initial guess taken from the GMM fitted by EM
# GMM parameter format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result
cons = [
    # the weights (every 6th element, i.e. the fraction of each Gaussian) must sum to 1
{'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
    # # limit the width/height ratio of the ellipse, optional
# {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
# {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]
bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
(0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)
result = sp.optimize.minimize(
lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
x0,
bounds = bonds,
constraints=cons,
tol = 0.000000000001,
options = {"maxiter": 500})
result
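For reference, a hypothetical sketch of what a 'square_error' fit score could look like: the squared difference between the GMM density and the KDE density over the fitting grid. The imported GMM_fit_score used above may weight or regularize this differently.
def gmm_square_error_sketch(param_array, kde_density, grid_points):
    # rebuild a pdf from the flat parameter array using the notebook's own helpers
    grouped = group_gmm_param_from_gmm_param_array(param_array, sort_group=False)
    gmm_pdf = generate_gmm_pdf_from_grouped_gmm_param(grouped)
    # sum of squared deviations from the KDE reference density
    return np.sum((gmm_pdf(grid_points) - kde_density) ** 2)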
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
gof_df(gmm_pdf_result, kde_result)
pd.DataFrame(gofs_mean_set_bivar).set_index('year_lim')
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument
def residule_between_kde_and_gmm(points):
kde_vals = exp(kde.score_samples(points))
gmm_vals = mixed_model_pdf(points)
return kde_vals - gmm_vals
residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)
plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
# Joint pdf in polar form; the extra factor V is the Jacobian of the
# Cartesian-to-polar transformation (dx dy = V dV dtheta)
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V
def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
%%time
x = arange(0, max_speed, 0.5)
# 1. Fit Weibull
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x=x)
# 2. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02
# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibull*len(df.speed), '--', color='black', label='Weibull')
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)
# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])
# 5.2. CDF Comparison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))
plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max())
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])
density, _ = dir_hist(df['dir'], bins=arange(-5, 370, 10), density=True)
plt.bar(arange(0, 360, 10), density*10*len(df['dir']), width=10, alpha=0.5, label='Data')
plot(x/pi*180, y*len(df['dir']) ,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency',
legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
sector_r_square(density*10, y[:-1])
pd.DataFrame(gofs_mean_set).set_index('year_lim')
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre)
for angle in arange(0, 360, incre))
# This R square is computed as in the paper
# "Comparison of bivariate distribution construction approaches for analysing wind speed and direction data"
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
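The formula is the same R square used earlier in this notebook; a minimal sketch is given below, with the assumption that true_R_square pools the per-sector observed and modelled densities from density_collection before applying it.
def r_square_sketch(y_observed, y_modelled):
    # standard R square: 1 - SS_res / SS_tot
    y_observed, y_modelled = np.asarray(y_observed), np.asarray(y_modelled)
    ss_res = np.sum((y_observed - y_modelled) ** 2)
    ss_tot = np.sum((y_observed - np.mean(y_observed)) ** 2)
    return 1 - ss_res / ss_tot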
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
# (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
start, end = -original_incre/2 + incre/2, 360
curve_collection = []
max_speed = df.speed.max()
# Find a max count for plotting histogram
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]
for angle in arange(start, end, incre):
angle_radian, incre_radian = np.radians([angle, incre])
start_angle, end_angle = angle-incre/2, angle+incre/2
# 0. Select data from observation
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
data_size = len(sub_df.speed)
# 1. Get Weibull and ECDF
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
# 2. Get GMM PDF, CDF
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
# 3. R square for GMM, Weibull
bins = arange(0, sub_df.speed.max()+1)
density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]])
for x_ in bins[:-1]]
density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
R_square_gmm = sector_r_square(density, density_expected_gmm)
density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params)
R_square_weibull = sector_r_square(density, density_expected_weibull)
# 4. K-S for GMM, Weibull
cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
# 5. Make Plots
fig = plt.figure(figsize=(10,1.9))
# 5.1. Frequency Comparison
ax1 = fig.add_subplot(1,3,1)
sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')
plot(x, y_gmm*data_size,'-', color='black', label='GMM')
plot(x, y_weibull*data_size, '--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
plt.axis(plot_range)
    # 5.2. CDF Comparison
ax2 = fig.add_subplot(1,3,2)
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt.gca().set_xlim(right = max_speed)
plt_configure(xlabel = "V", ylabel='P', legend=True)
curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x,
'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(),
'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
curve_collection.append(curves)
plt.tight_layout()
plt.show()
print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
print('GMM', 'Weibull')
print('R square', R_square_gmm, R_square_weibull)
print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(),
'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
print(' ')
return curve_collection
%%time
if len(effective_column) == 16:
rebinned_angle = 22.5
else:
rebinned_angle = 20
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
diff_df = pd.DataFrame(curve_collection)
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull,
diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull,
diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
# Compare direction weight with previous figure
display(dir_fig)
angle = max_diff_angle = diff_df.loc[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
FRACTION = 1
# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)
# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)
# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')
# 3. Weibull
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')
# 4. Data Resampled
count_collection = []
for i in range(1,100):
sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)
resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True)
count_collection.append(resampled_count)
ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
y_ecdf = ecdf(x)
ax2.plot(x, y_ecdf,':', label='Data Resampled')
ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
if i == 1:
# plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')
fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))
ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')
# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
end_time = start_time + 4
df_other_years = df_all_years[str(start_time):str(end_time)]
df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
if len(df_other_years_at_angle) > 0 :
ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
y_ecdf = ecdf(x)
ax2.plot(x, y_ecdf,':', label = start_time)
ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
bins=arange(0, sub_max_speed_other_year))
ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x',
color=next(prop_cycle), alpha=0.8)
x_3d = start_time*np.ones_like(x)
ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM' if start_time == 2011 else '')
ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})
ax1.set_zlim(bottom = 0)
align_figures()
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))
legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
curve_df = pd.DataFrame(curve_collection)
for angle in angle_group:
curves = curve_df.query('direction == @angle%360').T.to_dict()
curves = curves[list(curves)[0]]
data_size, x = curves['datasize'], curves['x']
y_gmm, y_cdf_gmm = curves['gmm_pdf'], curves['gmm_cdf']
y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'], curves['weibull_cdf'], curves['ecdf']
linestyle = '-' if angle == max_diff_angle else ':'
alpha = 0.7 if angle == max_diff_angle else 0.3
ax2.plot(x, y_gmm*data_size, linestyle, label=angle)
ax3.plot(x, y_weibull*data_size, linestyle, label=angle)
start_angle, end_angle = angle-incre/2, angle+incre/2
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
x_3d = angle*np.ones_like(x)
ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')
count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)
if legend_3d == False:
ax1.legend()
legend_3d = True
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)
print(max_diff_angle)
print('GMM, Weibull, Histogram')
align_figures()
if 'bandwidth' not in globals():
bandwidth = DEFAULT_BANDWDITH
if 'FIT_METHOD' not in globals():
FIT_METHOD = 'square_error'
if 'KDE_KERNEL' not in globals():
KDE_KERNEL = 'gaussian'
config = {'bandwidth': bandwidth,
'fitting_range': FITTING_RANGE,
'fit_limit': fit_limit,
'kde_kernel': KDE_KERNEL}
print(bandwidth, FIT_METHOD)
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(12))
for result in results:
display(pretty_print_gmm(result['gmm']))
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
plt.show()
display(gof_df(result['gmm_pdf_result'], result['kde_result']))
display(gof_df(result['gmm_pdf_result'], kde_result))
print('')
%%time
from sklearn.cross_validation import train_test_split, KFold
## k-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold)
for number_of_gaussian in gaussian_number_range:
print( ' ')
print('Number of gaussian', number_of_gaussian)
kf = KFold(len(df), n_folds=number_of_fold, shuffle=True)
CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)
CV_result_train, CV_result_test = list(zip(*CV_result))
CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
CV_result_train_all.append(CV_result_train)
CV_result_test_all.append(CV_result_test)
print('Train')
pretty_pd_display(CV_result_train)
print('Test')
pretty_pd_display(CV_result_test)
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)
test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
plot(gaussian_number_range, train_scores_mean[column],
'--', label = 'training', color=prop_cycle[0])
plt.fill_between(gaussian_number_range,
train_scores_mean[column] - train_scores_std[column],
train_scores_mean[column] + train_scores_std[column],
alpha=0.2, color=prop_cycle[0])
plot(gaussian_number_range, test_scores_mean[column],
'-', label = 'test',color=prop_cycle[1])
plt.fill_between(gaussian_number_range,
test_scores_mean[column] - test_scores_std[column],
test_scores_mean[column] + test_scores_std[column],
alpha=0.2,color=prop_cycle[1])
plt.xticks(gaussian_number_range)
print(column)
plt.locator_params(axis='y', nbins=5)
plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name,
figsize=(3,2), legend={'loc':'best'})
if column == 'R_square':
plt.gca().set_ylim(top=1)
if column == 'K_S' or column == 'Chi_square':
plt.gca().set_ylim(bottom=0)
plt.show()
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1)
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2)
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull,
fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
display(fig)
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html'
output_HTML(current_file, output_file)