<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta charset="utf-8" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="generator" content="pandoc" /> <meta name="author" content="Robert W Schlegel" /> <meta name="date" content="2019-03-19" /> <title>Assessing the effects of time series duration</title> <script src="site_libs/jquery-1.11.3/jquery.min.js"></script> <meta name="viewport" content="width=device-width, initial-scale=1" /> <link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" /> <script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script> <script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script> <link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" /> <script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script> <script src="site_libs/navigation-1.1/tabsets.js"></script> <link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" /> <script src="site_libs/highlightjs-9.12.0/highlight.js"></script> <link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" /> <script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script> <script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script> <style type="text/css">code{white-space: pre;}</style> <style type="text/css"> pre:not([class]) { background-color: white; } </style> <script type="text/javascript"> if (window.hljs) { hljs.configure({languages: []}); hljs.initHighlightingOnLoad(); if (document.readyState && document.readyState === "complete") { window.setTimeout(function() { hljs.initHighlighting(); }, 0); } } </script> <style type="text/css"> h1 { font-size: 34px; } h1.title { font-size: 38px; } h2 { font-size: 30px; } h3 { font-size: 24px; } h4 { font-size: 18px; } h5 { font-size: 16px; } h6 { 
font-size: 12px; } .table th:not([align]) { text-align: left; } </style> </head> <body> <style type = "text/css"> .main-container { max-width: 940px; margin-left: auto; margin-right: auto; } code { color: inherit; background-color: rgba(0, 0, 0, 0.04); } img { max-width:100%; height: auto; } .tabbed-pane { padding-top: 12px; } .html-widget { margin-bottom: 20px; } button.code-folding-btn:focus { outline: none; } </style> <style type="text/css"> /* padding for bootstrap navbar */ body { padding-top: 51px; padding-bottom: 40px; } /* offset scroll position for anchor links (for fixed navbar) */ .section h1 { padding-top: 56px; margin-top: -56px; } .section h2 { padding-top: 56px; margin-top: -56px; } .section h3 { padding-top: 56px; margin-top: -56px; } .section h4 { padding-top: 56px; margin-top: -56px; } .section h5 { padding-top: 56px; margin-top: -56px; } .section h6 { padding-top: 56px; margin-top: -56px; } </style> <script> // manage active state of menu based on current page $(document).ready(function () { // active menu anchor href = window.location.pathname href = href.substr(href.lastIndexOf('/') + 1) if (href === "") href = "index.html"; var menuAnchor = $('a[href="' + href + '"]'); // mark it active menuAnchor.parent().addClass('active'); // if it's got a parent navbar menu mark it active as well menuAnchor.closest('li.dropdown').addClass('active'); }); </script> <div class="container-fluid main-container"> <!-- tabsets --> <script> $(document).ready(function () { window.buildTabsets("TOC"); }); </script> <!-- code folding --> <script> $(document).ready(function () { // move toc-ignore selectors from section div to header $('div.section.toc-ignore') .removeClass('toc-ignore') .children('h1,h2,h3,h4,h5').addClass('toc-ignore'); // establish options var options = { selectors: "h1,h2,h3", theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); }, ignoreSelector: 
".toc-ignore", scrollTo: 0 }; options.showAndHide = true; options.smoothScroll = true; // tocify var toc = $("#TOC").tocify(options).data("toc-tocify"); }); </script> <style type="text/css"> #TOC { margin: 25px 0px 20px 0px; } @media (max-width: 768px) { #TOC { position: relative; width: 100%; } } .toc-content { padding-left: 30px; padding-right: 40px; } div.main-container { max-width: 1200px; } div.tocify { width: 20%; max-width: 260px; max-height: 85%; } @media (min-width: 768px) and (max-width: 991px) { div.tocify { width: 25%; } } @media (max-width: 767px) { div.tocify { width: 100%; max-width: none; } } .tocify ul, .tocify li { line-height: 20px; } .tocify-subheader .tocify-item { font-size: 0.90em; padding-left: 25px; text-indent: 0; } .tocify .list-group-item { border-radius: 0px; } </style> <!-- setup 3col/9col grid for toc_float and main content --> <div class="row-fluid"> <div class="col-xs-12 col-sm-4 col-md-3"> <div id="TOC" class="tocify"> </div> </div> <div class="toc-content col-xs-12 col-sm-8 col-md-9"> <div class="navbar navbar-default navbar-fixed-top" role="navigation"> <div class="container"> <div class="navbar-header"> <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar"> <span class="icon-bar"></span> <span class="icon-bar"></span> <span class="icon-bar"></span> </button> <a class="navbar-brand" href="index.html">MHWdetection</a> </div> <div id="navbar" class="navbar-collapse collapse"> <ul class="nav navbar-nav"> <li> <a href="index.html">Home</a> </li> <li> <a href="about.html">About</a> </li> <li> <a href="license.html">License</a> </li> </ul> <ul class="nav navbar-nav navbar-right"> <li> <a href="https://github.com/jdblischak/workflowr"> <span class="fa fa-github"></span> </a> </li> </ul> </div><!--/.nav-collapse --> </div><!--/.container --> </div><!--/.navbar --> <!-- Add a small amount of space between sections. 
--> <style type="text/css"> div.section { padding-top: 12px; } </style> <div class="fluid-row" id="header"> <h1 class="title toc-ignore">Assessing the effects of time series duration</h1> <h4 class="author"><em>Robert W Schlegel</em></h4> <h4 class="date"><em>2019-03-19</em></h4> </div> <p><strong>Last updated:</strong> 2019-03-19</p> <strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small> <ul> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p> <p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p> <p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(20190319)</code> </summary></p> <p>The command <code>set.seed(20190319)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p> <p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/robwschlegel/MHWdetection/tree/64ac134076a04088c834291ce86c6405eedaf672" target="_blank">64ac134</a> </summary></p> Great! 
You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated: <pre><code> Ignored files: Ignored: .Rhistory Ignored: .Rproj.user/ Ignored: vignettes/data/sst_ALL_clim_only.Rdata Ignored: vignettes/data/sst_ALL_event_aov_tukey.Rdata Ignored: vignettes/data/sst_ALL_flat.Rdata Ignored: vignettes/data/sst_ALL_miss.Rdata Ignored: vignettes/data/sst_ALL_miss_cat_chi.Rdata Ignored: vignettes/data/sst_ALL_miss_clim_event_cat.Rdata Ignored: vignettes/data/sst_ALL_miss_clim_only.Rdata Ignored: vignettes/data/sst_ALL_repl.Rdata Ignored: vignettes/data/sst_ALL_smooth.Rdata Ignored: vignettes/data/sst_ALL_smooth_aov_tukey.Rdata Ignored: vignettes/data/sst_ALL_smooth_event.Rdata Ignored: vignettes/data/sst_ALL_trend.Rdata Ignored: vignettes/data/sst_ALL_trend_clim_event_cat.Rdata Untracked files: Untracked: analysis/bibliography.bib Untracked: analysis/data/ Untracked: code/functions.R Untracked: code/workflow.R Untracked: data/sst_WA.csv Untracked: docs/figure/ Unstaged changes: Modified: NEWS.md </code></pre> Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. 
</details> </li> </ul> <details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary> <ul> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> File </th> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Message </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/robwschlegel/MHWdetection/blob/64ac134076a04088c834291ce86c6405eedaf672/analysis/time_series_duration.Rmd" target="_blank">64ac134</a> </td> <td style="text-align:left;"> robwschlegel </td> <td style="text-align:left;"> 2019-03-19 </td> <td style="text-align:left;"> Publish analysis files </td> </tr> </tbody> </table> </ul> <p></details></p> <hr /> <div id="overview" class="section level2"> <h2>Overview</h2> <p>It has been shown that the length (duration) of a time series may be one of the most important factors in determining the usefulness of those data for any number of applications <span class="citation">(Schlegel and Smit 2016)</span>. For the detection of MHWs it is recommended that one has at least 30 years of data in order to accurately detect the events therein. This is no longer an issue for most ocean surfaces because many high quality SST products, such as NOAA OISST, are now in exceedence of the 30-year minimum. However, besides issues relating to the coarseness of these older products, there are many other reasons why one would want to detect MHWs in products with shorter records. <em>In situ</em> measurements along the coast are one good example of this, another being the use of the newer higher resolution SST products, a third being the use of reanalysis/model products for the detection of events in 3D. 
It is therefore necessary to quantify the effects that shorter time series duration (< 30 years) has on the accurate detection of events. Once any discrepancies have been accounted for, best practices must be developed to allow users to improve the precision of the detection of events in their shorter time series.</p> </div> <div id="time-series-shortening" class="section level2"> <h2>Time series shortening</h2> <p>A time series derives it’s usefulness from it’s length. This is generally because the greater the number of observations (larger sample size), the more confident one can be about the resultant statistics. This is particularly true for decadal scale measurements and is why the WMO recommends a minimum of 30 years of data before drawing any conclusions about decadal (long-term) trends observed in a time series. We however want to look not at decadal scale trends, but rather at how observed MHWs compare against one another when they have been detected using longer or shorter time series. In order to to quantify this effect we will use the minimum and maximum range of time series available to us. The maximum length will be 37 years, as this is the full extent of the NOAA OISST data included in <strong><code>heatwaveR</code></strong>. The minimum length for comparing the seasonal signal and 90th percentile thresholds will be three years as his is the minimum number of years required to calculate a climatology. For comparing MHWs the minimum will be ten years so that we may still have enough events detected between shortened time series to produce robust statistics.</p> <p>Before we can discuss shortening techniques, we must first consider the inherent decadal trend in the time series themselves. This is usually the primary driver for much of the event detection changes over time <span class="citation">(Oliver et al. 2018)</span>. 
Therefore, if we truly want to understand the effect that a shortened time series may have on event detection, apart from the effect of the decadal trend, we must de-trend the time series first. To this end there is an entire separate <a href="https://robwschlegel.github.io/MHWdetection/articles/trend.html">vignette</a> that quantifies the effect of long-term trends on the detection of events.</p> <p>The time series shortening technique proposed here is <a href="https://robwschlegel.github.io/MHWdetection/articles/Short_climatologies.html">re-sampling</a>. This methodology takes the full 37 year de-trended time series and randomly samples <em>n</em> years from it to simulate a 3 – 37 year time series. One then uses the jumbled up, randomly selected years of data to create the seasonal signal and 90th percentile threshold (hereafter referred to as “the climatologies”). Re-sampling the time series in this way is useful because it allows us to replicate random climatology creation 100 (or more) times for each time series length in order to produce a more confident estimation of how likely climatologies generated from certain time series lengths are to impact the accuracy of event detection. It also allows us to see how consistent the size of MHWs are in a given time series, or if they very quite a bit depending on the years used.</p> <p>The WMO recommends that the period used to create a climatology be 30 years in length, starting on the first year of a decade (e.g. 1981), however; because we are going to be running these analyses on many time series shorter than 30 years, for the sake of consistency we will use all of the years present in each re-sampled time series them as the climatology period, regardless of length. For example, if a time series spans 1982 – 2018, that will be the period used for calculating the climatologies. 
Likewise, should the time series only span the years 2006 – 2018, that will be the period used.</p> <p>The effect this has on the climatologies will be quantified through the following statistics:</p> <ul> <li>An ANOVA will compare the climatologies created from different lengths;</li> <li>A post-hoc Tukey test to determine where the difference in time series length for climatology creation become significant;</li> <li>Correlations between time series length/SD and seas/thresh mean/SD</li> <li>Kolmogorov-Smirnov test for where the climatologies diverge significantly from the 30 year climatologies</li> </ul> <p>The effect this has on the MHW metrics will be quantified with the following statistics:</p> <ul> <li>ANOVA comparing the primary metrics for MHWs detected using different time series lengths;</li> <li>A post-hoc Tukey test to show where the difference in time series length for event detection become significant;</li> <li>Confidence intervals (CIs) for the measured metrics</li> <li>Correlations between time series length/SD and MHW metrics</li> </ul> <p>In addition to checking all of the statistics mentioned above, it is also necessary to record what the overall relationship is with the reduction of time series length and the resultant climatologies/metrics. For example, are MHWs on average more intense in shorter time series or less? We must also look into how the categories of the events are affected. This will form part of the best practice advice later.</p> </div> <div id="re-sampling" class="section level2"> <h2>Re-sampling</h2> <p>Below we use <a href="https://robwschlegel.github.io/MHWdetection/articles/Short_climatologies.html">re-sampling</a> to simulate 100 time series at each length of 3 – 37 years on de-trended data. This is presently being done with the three pre-packaged time series available in the <strong><code>heatwaveR</code></strong> package and the python distribution, but eventually will be run on the global data. 
The original data are appended to these 100 re-samples as <code>rep = 0</code>.</p> <pre class="r"><code>library(tidyverse) library(broom) library(heatwaveR, lib.loc = "~/R-packages/") # cat(paste0("heatwaveR version = ",packageDescription("heatwaveR")$Version)) library(lubridate) # This is intentionally activated after data.table library(fasttime) library(ggpubr) library(boot) library(FNN) library(mgcv) library(doMC); doMC::registerDoMC(cores = 50) MHW_colours <- c( "I Moderate" = "#ffc866", "II Strong" = "#ff6900", "III Severe" = "#9e0000", "IV Extreme" = "#2d0000" )</code></pre> <pre class="r"><code># First put all of the data together and create a site column sst_ALL <- rbind(sst_Med, sst_NW_Atl, sst_WA) %>% mutate(site = rep(c("Med", "NW_Atl", "WA"), each = nrow(sst_WA))) save(sst_ALL, file = "data/sst_ALL.Rdata") # Function for creating a re-sample from 37 years of data sample_37 <- function(rep){ year_filter <- c(1,2,3) while(length(unique(year_filter)) != 37){ year_filter <- factor(sample(1982:2018, size = 37, replace = F)) } res <- data.frame() for(i in 1:length(year_filter)){ df_sub <- sst_ALL %>% filter(year(t) == year_filter[i], # Remove leap-days here paste0(month(t),"-",day(t)) != "2-29") %>% mutate(year = seq(1982,2018)[i], month = month(t), day = day(t), year_orig = year(t)) %>% mutate(t = as.Date(fastPOSIXct(paste(year, month, day, sep = "-")))) %>% select(-year, -month, -day) res <- rbind(res, df_sub) } res$rep <- as.character(rep) return(res) } # Re-sample the data 100 times doMC::registerDoMC(cores = 50) sst_ALL_repl <- plyr::ldply(2:100, sample_37, .parallel = T) # Add the original data as rep = "1" sst_ALL_1 <- sst_ALL %>% mutate(year_orig = year(t), rep = "1") sst_ALL_repl <- rbind(sst_ALL_1, sst_ALL_repl) # Save and clear save(sst_ALL_repl, file = "data/sst_ALL_repl.Rdata") rm(sst_ALL_1, sst_ALL_repl); gc()</code></pre> <pre class="r"><code># Load randomly sample time series load("data/sst_ALL_repl.Rdata") # Function to extract the residuals 
around the linear trend # This is the de-trended anomaly time series detrend <- function(df){ resids <- broom::augment(lm(temp ~ t, df)) res <- df %>% mutate(temp = temp - resids$.fitted) %>% select(-year_orig) return(res) } # Create de-trended anomaly time series from all re-samples doMC::registerDoMC(cores = 50) sst_ALL_flat <- plyr::ddply(sst_ALL_repl, c("site", "rep"), detrend) # Save and clear save(sst_ALL_flat, file = "data/sst_ALL_flat.Rdata") rm(sst_ALL_repl, sst_ALL_flat); gc()</code></pre> <pre class="r"><code># Load de-trended data load("data/sst_ALL_flat.Rdata") # Calculate climatologies, events, and categories on shrinking time series shrinking_results <- function(year_begin){ res <- sst_ALL_flat %>% filter(year(t) >= year_begin) %>% mutate(year_index = 2018-year_begin+1) %>% group_by(rep, site, year_index) %>% nest() %>% mutate(clims = map(data, ts2clm, climatologyPeriod = c(paste0(year_begin,"-01-01"), "2018-12-31")), events = map(clims, detect_event), cats = map(events, category)) %>% select(-data, -clims) return(res) } # Run this preferably with 35 cores for speed, RAM allowing doMC::registerDoMC(cores = 35) sst_ALL_smooth <- plyr::ldply(1982:2016, shrinking_results, .parallel = T) save(sst_ALL_smooth, file = "data/sst_ALL_smooth.Rdata") rm(sst_ALL_flat, sst_ALL_smooth); gc()</code></pre> </div> <div id="climatology-statistics" class="section level2"> <h2>Climatology statistics</h2> <p>We will now see if the 100 re-sampled time series produce significantly different climatologies as they are shortened. 
If so, we also want to see at what point the climatologies begin to diverge.</p> <pre class="r"><code>load("data/sst_ALL_smooth.Rdata") # Extract climatology values only sst_ALL_clim_only <- sst_ALL_smooth %>% unnest(events) %>% filter(row_number() %% 2 == 1) %>% unnest(events) %>% select(rep:doy, seas:thresh) %>% unique() %>% arrange(site, rep, -year_index, doy) save(sst_ALL_clim_only, file = "data/sst_ALL_clim_only.Rdata") rm(sst_ALL_smooth, sst_ALL_clim_only); gc()</code></pre> <div id="anova-p-values" class="section level3"> <h3>ANOVA <em>p</em>-values</h3> <pre class="r"><code># Run an ANOVA on each metric of the combined event results and get the p-value # df <- res aov_p <- function(df){ aov_models <- df[ , -grep("year_index", names(df))] %>% map(~ aov(.x ~ df$year_index)) %>% map_dfr(~ broom::tidy(.), .id = 'metric') %>% mutate(p.value = round(p.value, 4)) %>% filter(term != "Residuals") %>% select(metric, p.value) return(aov_models) } # Run an ANOVA on each metric and then a Tukey test aov_tukey <- function(df){ aov_tukey <- df[ , -grep("year_index", names(df))] %>% map(~ TukeyHSD(aov(.x ~ df$year_index))) %>% map_dfr(~ broom::tidy(.), .id = 'metric') %>% mutate(p.value = round(adj.p.value, 4)) %>% # filter(term != "Residuals") %>% select(metric, comparison, adj.p.value) %>% # filter(adj.p.value <= 0.05) %>% arrange(metric, adj.p.value) return(aov_tukey) } # Quick wrapper for getting results for ANOVA and Tukey on clims # df <- sst_ALL_clim_only %>% # filter(rep == "1", site == "WA") %>% # select(-rep, -site) clim_aov_tukey <- function(df){ res <- df %>% select(year_index, seas, thresh) %>% mutate(year_index = as.factor(year_index)) %>% nest() %>% mutate(aov = map(data, aov_p), tukey = map(data, aov_tukey)) %>% select(-data) return(res) }</code></pre> <pre class="r"><code># This file is not uploaded to GitHub as it is too large # One must first run the above code locally to generate and save the file load("data/sst_ALL_clim_only.Rdata") 
doMC::registerDoMC(cores = 50) sst_ALL_smooth_aov_tukey <- plyr::ddply(sst_ALL_clim_only, c("site", "rep"), clim_aov_tukey, .parallel = T) save(sst_ALL_smooth_aov_tukey, file = "data/sst_ALL_smooth_aov_tukey.Rdata") rm(sst_ALL_smooth, sst_ALL_smooth_aov_tukey); gc()</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_smooth_aov_tukey.Rdata") sst_ALL_smooth_aov <- sst_ALL_smooth_aov_tukey %>% select(-tukey) %>% unnest() label_count_aov_p <- sst_ALL_smooth_aov %>% group_by(site, metric) %>% filter(p.value <= 0.05) %>% summarise(count = n()) knitr::kable(label_count_aov_p, col.names = c("Time series", "Climatology", "Significantly different"), caption = "Table 1: This table shows the results of 100 ANOVAs on the real and re-sampled time series with systematically shortened lengths from 37 -- 3 years. Note that the climatologies for the Med time series were never significantly different and so aren't shown here.")</code></pre> <table> <caption>Table 1: This table shows the results of 100 ANOVAs on the real and re-sampled time series with systematically shortened lengths from 37 – 3 years. 
Note that the climatologies for the Med time series were never significantly different and so aren’t shown here.</caption> <thead> <tr class="header"> <th align="left">Time series</th> <th align="left">Climatology</th> <th align="right">Significantly different</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">Med</td> <td align="left">thresh</td> <td align="right">1</td> </tr> <tr class="even"> <td align="left">NW_Atl</td> <td align="left">seas</td> <td align="right">1</td> </tr> <tr class="odd"> <td align="left">NW_Atl</td> <td align="left">thresh</td> <td align="right">23</td> </tr> <tr class="even"> <td align="left">WA</td> <td align="left">seas</td> <td align="right">52</td> </tr> <tr class="odd"> <td align="left">WA</td> <td align="left">thresh</td> <td align="right">75</td> </tr> </tbody> </table> <pre class="r"><code>rm(sst_ALL_smooth_aov, label_count_aov_p); gc()</code></pre> <pre><code> used (Mb) gc trigger (Mb) max used (Mb) Ncells 1943008 103.8 3969330 212.0 2487694 132.9 Vcells 4457481 34.1 10146329 77.5 5920704 45.2</code></pre> <p>The above table shows that there are no significant differences in the climatologies for the Mediterannean and relatively few for the North West Atlantic time series. 
We do however see that of the 100 re-samples for the Western Australia time series, 52 of the seasonal signals and 75 of the thresholds did differ significantly at some point during the shortening.</p> </div> <div id="post-hoc-tukey-test" class="section level3"> <h3>Post-hoc Tukey test</h3> <p>To get a more precise pictures of where these differences are occurring we will need to look at the post-hoc Tukey test results.</p> <pre class="r"><code>sst_ALL_smooth_tukey <- sst_ALL_smooth_aov_tukey %>% select(-aov) %>% unnest() clim_tukey_sig <- sst_ALL_smooth_tukey %>% filter(adj.p.value <= 0.05) %>% separate(comparison, into = c("year_long", "year_short"), sep = "-") %>% mutate(year_long = as.numeric(year_long), year_short = as.numeric(year_short)) %>% mutate(year_dif = year_long - year_short) %>% # select(-adj.p.value, -rep) %>% group_by(site, metric, year_long, year_short, year_dif) %>% summarise(count = n()) %>% ungroup() %>% arrange(-year_long, -year_dif) %>% mutate(year_index = factor(paste0(year_long,"-",year_dif)), year_index = factor(year_index, levels = unique(year_index))) # levels(clim_tukey_sig$year_index) ggplot(clim_tukey_sig, aes(x = year_long, y = year_short)) + # geom_point() + # geom_errorbarh(aes(xmin = year_long, xmax = year_short)) + geom_tile(aes(fill = count)) + facet_grid(metric~site) + scale_x_reverse(breaks = seq(3, 36, 3)) + scale_y_reverse(breaks = seq(4 , 16, 2)) + scale_fill_viridis_c("count out of \n100 re-samples") + labs(x = "Length (years) of long time series", y = "Length (years) of short time series")</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/clim-tukey-heat-1.png" alt="Heatmap showing the distribution of the difference in years for climatologies that were significantly different within the same time series when different lengths of the time series were used to calculate the thresholds. 
The y axis shows the range of shorter time series lengths that produced significant results when compared to longer time series lengths, which are shown on the x axis. The fill at each pixel shows the count out of 100 re-samples in which the comparison of these two time series lengths were significantly different." width="768" /> <p class="caption"> Heatmap showing the distribution of the difference in years for climatologies that were significantly different within the same time series when different lengths of the time series were used to calculate the thresholds. The y axis shows the range of shorter time series lengths that produced significant results when compared to longer time series lengths, which are shown on the x axis. The fill at each pixel shows the count out of 100 re-samples in which the comparison of these two time series lengths were significantly different. </p> </div> <pre class="r"><code>rm(sst_ALL_smooth_aov_tukey, sst_ALL_smooth_tukey, clim_tukey_sig); gc()</code></pre> <pre><code> used (Mb) gc trigger (Mb) max used (Mb) Ncells 2080314 111.2 3969330 212.0 3969330 212.0 Vcells 3876957 29.6 10146329 77.5 10146085 77.5</code></pre> <p>The above figure shows us that as few as a four year difference in time series length was able to create significantly different thresholds, but that the most common difference was 14 – 17 years with the distribution left-skewed.</p> </div> <div id="kolmogorov-smirnov-tests" class="section level3"> <h3>Kolmogorov-Smirnov tests</h3> <p>Whereas the above figure does reveal to us that shorter time series are more often significantly different than longer time series, it takes quie a while to decipher this information and it doesn’t seem very helpful to know this level of detail. 
Rather I think it would be more useful to simply run a series of Kolmogorov-Smirnov tests for each length of each re-sampled time series to see how often the climatology distributions are significantly different from the climatologies at the 30 year length. This is not only a more appropriate test for these climatology data than an ANOVA, it is also a more simplified way of understanding the results.</p> <pre class="r"><code>load("data/sst_ALL_clim_only.Rdata") # Function for running a pairwise KS test against the KS_sub <- function(df_2, df_1){ suppressWarnings( # Suppress warnings about perfect matches res <- data.frame(seas = round(ks.test(df_1$seas, df_2$seas)$p.value, 4), thresh = round(ks.test(df_1$thresh, df_2$thresh)$p.value, 4), year_1 = df_1$year_index[1], year_2 = df_2$year_index[1]) ) return(res) } # Wrapper function to run all pairwise KS tests # df <- sst_ALL_clim_only %>% # filter(rep == "30", site == "WA") KS_all <- function(df){ # The 30 year clims against which all are compared df_30 <- df %>% filter(year_index == 30) # The results res <- df %>% mutate(year_index2 = year_index) %>% group_by(year_index2) %>% nest() %>% mutate(KS = map(data, KS_sub, df_1 = df_30)) %>% select(-data, -year_index2) %>% unnest() %>% tidyr::gather(key = "metric", value = "p.value", -year_1, -year_2) return(res) } doMC::registerDoMC(cores = 50) sst_ALL_KS <- plyr::ddply(sst_ALL_clim_only, KS_all, .variables = c("rep", "site"), .parallel = T) save(sst_ALL_KS, file = "data/sst_ALL_KS.Rdata") rm(sst_ALL_clim_only, sst_ALL_KS); gc()</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_KS.Rdata") # Filter non-significant results KS_sig <- sst_ALL_KS %>% filter(p.value <= 0.05) %>% group_by(site, metric, year_2) %>% summarise(count = n()) %>% ungroup() # Pad in NA for years with no significant differences KS_sig <- padr::pad_int(x = KS_sig, by = "year_2", group = c("site", "metric")) %>% select(site, metric, year_2, count) %>% rbind(data.frame(site = 
"Med", metric = "seas", year_2 = 25:37, count = NA)) ggplot(KS_sig, aes(x = year_2, y = site)) + geom_line(aes(colour = count), size = 3) + geom_vline(aes(xintercept = 30), alpha = 0.7, linetype = "dotted") + facet_wrap(~metric, ncol = 1) + scale_colour_viridis_c() + scale_x_reverse(breaks = seq(5, 35, 5)) + labs(x = "Climatology period (years)", y = NULL, colour = "Sig. dif.\nout of 100")</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/KS-clims-1.png" alt="Line plots showing the results of pairwise Kolmogorov-Smirnoff tests for the seasonal signals (top panel) and 90th percentile thresholds (bottom panel) from the same time series at differing lengths. The colour of the line shows how many times out of 100 re-samples that the climatologies were significantly different from the control. The dotted verticle line denotes the 30 year climatology mark, against which all othe climatologies were compared. If no re-samplings were significantly different this is shown with a grey line." width="768" /> <p class="caption"> Line plots showing the results of pairwise Kolmogorov-Smirnoff tests for the seasonal signals (top panel) and 90th percentile thresholds (bottom panel) from the same time series at differing lengths. The colour of the line shows how many times out of 100 re-samples that the climatologies were significantly different from the control. The dotted verticle line denotes the 30 year climatology mark, against which all othe climatologies were compared. If no re-samplings were significantly different this is shown with a grey line. </p> </div> <pre class="r"><code>rm(sst_ALL_KS, KS_sig); gc()</code></pre> <pre><code> used (Mb) gc trigger (Mb) max used (Mb) Ncells 2102663 112.3 3969330 212.0 3969330 212.0 Vcells 3891511 29.7 10146329 77.5 10146085 77.5</code></pre> <p>The above figure shows that the results of the KS tests were more often significantly different than the ANOVAs. 
This means that the actual shape of the distributions changes rather quickly, as shown by how it only takes a few years for the KS test to show the results as being different. The ANOVAs report fewer significant differences because that is a test of central tendency. Meaning that the shape of the climatologies changes rather quickly depending on the climatology period used, but the overall range of values within a climatology does not differ significantly as quickly.</p> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_clim_only.Rdata") sst_ALL_clim_1 <- sst_ALL_clim_only %>% filter(rep == "1") %>% arrange(year_index) ggplot(data = sst_ALL_clim_1, aes(x = doy, y = thresh)) + geom_line(aes(group = year_index, colour = year_index), alpha = 0.8) + facet_wrap(~site, ncol = 1) + scale_colour_viridis_c(direction = -1)</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/clim-ts-all-1.png" alt="Time series of each climatology period used from the original data shown overlapping one another to visualise how the climatologies differ depending on the length of the climatology period used." width="768" /> <p class="caption"> Time series of each climatology period used from the original data shown overlapping one another to visualise how the climatologies differ depending on the length of the climatology period used.
</p> </div> <p>The above figure shows us that the larger the seasonal signal is, the less effect deviations away from it will likely have.</p> </div> <div id="linear-relationships-r2" class="section level3"> <h3>Linear relationships (R^2)</h3> <p>It is also of interest to see what the relationship in the mean and SD of the seasonal signal and thresholds may be against the length of the time series.</p> <pre class="r"><code>load("data/sst_ALL_clim_only.Rdata") # Summary stats for correlation sst_ALL_smooth_R2_base <- sst_ALL_clim_only %>% group_by(rep, site, year_index) %>% summarise(seas_mean = mean(seas), seas_sd = sd(seas), thresh_mean = mean(thresh), thresh_sd = sd(thresh)) %>% # select(-year_index) %>% group_by(rep, site) save(sst_ALL_smooth_R2_base, file = "data/sst_ALL_smooth_R2_base.Rdata") rm(sst_ALL_clim_only, sst_ALL_smooth_R2_base); gc()</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_smooth_R2_base.Rdata") sst_ALL_smooth_R2_base_long <- sst_ALL_smooth_R2_base %>% gather(key = metric, value = val, -year_index, -site, -rep) ggplot(sst_ALL_smooth_R2_base_long, aes(x = year_index, y = val)) + geom_point() + geom_smooth(method = "lm") + facet_grid(metric~site, scales = "free_y")</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/clim-R2-base-scatter-plot-1.png" alt="Scatterplot showing the relationship between time series length (x-axis) and a range of metrics (y-axis). The clear pattern is that the variance of the resulting relationships diminishes rapidly after at least 10 years of data are being used to determine the seasonal signal and threshold. The data shown here are all 100 re-samples." width="768" /> <p class="caption"> Scatterplot showing the relationship between time series length (x-axis) and a range of metrics (y-axis). 
The clear pattern is that the variance of the resulting relationships diminishes rapidly after at least 10 years of data are being used to determine the seasonal signal and threshold. The data shown here are all 100 re-samples. </p> </div> <pre class="r"><code># Load base data load("data/sst_ALL_smooth_R2_base.Rdata") # Linear model function for nesting linear_vals <- function(df, val){ val_vector <- as.vector(as.matrix(df[,colnames(df) == val])) year_vector <- df$year_index l_mod <- lm(val_vector ~ year_vector) res <- glance(l_mod) %>% mutate(metric = val, adj.r.squared = round(adj.r.squared, 2), p.value = round(p.value, 4)) %>% select(metric, adj.r.squared, p.value) row.names(res) <- NULL return(res) } # Simple wrapper linear_vals_wrap <- function(df){ seas_mean <- linear_vals(df, "seas_mean") seas_sd <- linear_vals(df, "seas_sd") thresh_mean <- linear_vals(df, "thresh_mean") thresh_sd <- linear_vals(df, "thresh_sd") res <- rbind(seas_mean, seas_sd, thresh_mean, thresh_sd) return(res) } # Calculate all R2 and p values sst_ALL_smooth_R2 <- sst_ALL_smooth_R2_base %>% group_by(site, rep) %>% nest() %>% mutate(lm_res = map(data, linear_vals_wrap)) %>% select(-data) %>% unnest() %>% mutate(adj.r.squared = ifelse(adj.r.squared < 0, 0, adj.r.squared)) # Save and clear save(sst_ALL_smooth_R2, file = "data/sst_ALL_smooth_R2.Rdata")</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_smooth_R2.Rdata") ggplot(sst_ALL_smooth_R2, aes(x = metric, y = adj.r.squared, fill = metric)) + geom_boxplot() + scale_fill_viridis_d() + facet_wrap(~site)</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/clim-R2-fig-1.png" alt="Boxplots showing the distribution of coefficients of determination (R2) for the linear relationship between the mean and SD of the climatologies for all the varying lengths of the 100 re-samples of each of the three reference time series." 
width="768" /> <p class="caption"> Boxplots showing the distribution of coefficients of determination (R2) for the linear relationship between the mean and SD of the climatologies for all the varying lengths of the 100 re-samples of each of the three reference time series. </p> </div> <pre class="r"><code>rm(sst_ALL_smooth_R2)</code></pre> <!-- The above figure shows us that all three reference time series respond relatively similarly with regards to the correlations of their climatology means and SD to the length and SD of the base temperatures from which the climatologies were derived. Though the patterns observed otherwise tell us little, it is an important finding to know that this dud pattern is consistent across the different reference time series. The other important pattern this figure shows is that the length of a time series appears to not have a consistent relationship with the resultant climatologies. Instead we see that it is the variance in the time series that tends to have a positive relationship with the variance, and to a lesser extent the means, in the climatologies. --> <!-- The pattern that is beginning to emerge from the results for the climatologies is that what matters most for the difference in the results is how much variance is present in the base time series, meaning how large is the seasonal signal relative to the temperatures in the time series. This variance could also be a representation of the presence of extreme temperature differences. Relating to the reseach question here this means large MHWs. It must therefore be investigated what the relationship is between categories of MHWs and the significant differences detected in the thresholds in the WA time series. It should also be taken into consideration that whenever the re-sampled time series have the year 2011, the massive MHW, these will likely be the ones producing significantly different results. 
--> </div> </div> <div id="mhw-metrics" class="section level2"> <h2>MHW metrics</h2> <p>With the effects of time series duration on the climatologies now mapped out it is time to apply a similar methodology to the MHW metrics. In order to produce more robust statistics we will now use only results from time series of lengths 37 –10 years and will only compare events that occurred in the most recent 10 years of each time series.</p> <pre class="r"><code># Quick wrapper for getting results for ANOVA on events # df <- sst_ALL_smooth %>% # filter(rep == "1", site == "WA") %>% # select(-rep, -site) event_aov_tukey <- function(df){ res <- df %>% unnest(events) %>% filter(row_number() %% 2 == 0) %>% unnest(events) %>% filter(year_index >= 10) %>% filter(date_start >= "2009-01-02", date_end <= "2018-12-31") %>% select(year_index, duration, intensity_mean, intensity_max, intensity_cumulative) %>% mutate(year_index = as.factor(year_index)) %>% nest() %>% mutate(aov = map(data, aov_p), tukey = map(data, aov_tukey)) %>% select(-data) return(res) }</code></pre> <div id="anova-p-values-1" class="section level3"> <h3>ANOVA <em>p</em>-values</h3> <pre class="r"><code>load("data/sst_ALL_smooth.Rdata") doMC::registerDoMC(cores = 50) sst_ALL_event_aov_tukey <- plyr::ddply(sst_ALL_smooth, c("rep", "site"), event_aov_tukey, .parallel = T) save(sst_ALL_event_aov_tukey, file = "data/sst_ALL_event_aov_tukey.Rdata")</code></pre> <pre class="r"><code># This file is not uploaded to GitHub as it is too large # One must first run the above code locally to generate and save the file load("~/MHWdetection/analysis/data/sst_ALL_event_aov_tukey.Rdata") sst_ALL_event_aov <- sst_ALL_event_aov_tukey %>% select(-tukey) %>% unnest() label_count_event_aov_p <- sst_ALL_event_aov %>% group_by(site, metric) %>% filter(p.value <= 0.05) %>% summarise(count = n()) knitr::kable(label_count_event_aov_p, col.names = c("Time series", "MHW metric", "Significantly different"), caption = "Table 1: This table shows 
the results of 100 ANOVAs on event metrics detected in the real and re-sampled time series with systematically shortened lengths from 37 -- 10 years.")</code></pre> <table> <caption>Table 1: This table shows the results of 100 ANOVAs on event metrics detected in the real and re-sampled time series with systematically shortened lengths from 37 – 10 years.</caption> <thead> <tr class="header"> <th align="left">Time series</th> <th align="left">MHW metric</th> <th align="right">Significantly different</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left">NW_Atl</td> <td align="left">duration</td> <td align="right">1</td> </tr> <tr class="even"> <td align="left">NW_Atl</td> <td align="left">intensity_mean</td> <td align="right">2</td> </tr> <tr class="odd"> <td align="left">WA</td> <td align="left">intensity_max</td> <td align="right">1</td> </tr> <tr class="even"> <td align="left">WA</td> <td align="left">intensity_mean</td> <td align="right">5</td> </tr> </tbody> </table> <p>The above table shows us that generally the length of a time series has no significant effect on the MHWs detected within it. We see that the MHWs in the Mediterranean time series are not affected at all in any of the 100 replicates. Surprisingly the events in the North West Atlantic time series are significantly different more often than the Western Australia time series. This must mean that the event metrics are more sensitive to a number of longer, larger events than a single large one, as is the case in the WA time series. This also means that the thresholds between shortened time series may not need to be significantly different in order to produce significantly different events.
It must now be investigated just which years are significantly different from one another.</p> </div> <div id="post-hoc-tukey-test-1" class="section level3"> <h3>Post-hoc Tukey test</h3> <pre class="r"><code>sst_ALL_event_tukey <- sst_ALL_event_aov_tukey %>% select(-aov) %>% unnest() event_tukey_sig <- sst_ALL_event_tukey %>% filter(adj.p.value <= 0.05)# %>% # separate(comparison, into = c("year_long", "year_short"), sep = "-") %>% # mutate(year_long = as.numeric(year_long), # year_short = as.numeric(year_short)) %>% # mutate(year_dif = year_long - year_short) %>% # # select(-adj.p.value, -rep) %>% # group_by(site, metric, year_long, year_short, year_dif) %>% # summarise(count = n()) %>% # ungroup() %>% # arrange(-year_long, -year_dif) %>% # mutate(year_index = factor(paste0(year_long,"-",year_dif)), # year_index = factor(year_index, levels = unique(year_index))) # ggplot(event_tukey_sig, aes(x = year_long, y = year_short)) + # # geom_point() + # # geom_errorbarh(aes(xmin = year_long, xmax = year_short)) + # geom_tile(aes(fill = count)) + # facet_grid(~metric) + # scale_x_reverse(breaks = seq(3, 36, 3)) + # scale_y_reverse(breaks = seq(3 , 7, 1)) + # scale_fill_viridis_c("count out of \n100 re-samples", breaks = seq(2,12,2)) + # labs(x = "Length (years) of long time series", # y = "Length (years) of short time series")</code></pre> <p>The post-hoc Tukey test revealed that only 1 rep out of 100 for the <code>WA</code> produced a significant difference when comparing the 10 year time series against 28, 29, 36, and 37 year time series.
This shows that time series length has a negligible effect on the detection of events.</p> </div> <div id="linear-relationships-r2-1" class="section level3"> <h3>Linear relationships (R^2)</h3> <pre class="r"><code>load("data/sst_ALL_smooth.Rdata") sst_ALL_event_cor <- sst_ALL_smooth %>% unnest(events) %>% filter(row_number() %% 2 == 0) %>% unnest(events) %>% filter(year_index >= 10) %>% filter(date_start >= "2009-01-01", date_end <= "2018-12-31") %>% select(rep, site, year_index, duration, intensity_mean, intensity_max, intensity_cumulative) %>% group_by(site, rep, year_index) %>% summarise_all(.funs = c("mean", "sd")) %>% group_by(site, rep) %>% do(data.frame(t(cor(.[,4:11], .[,3])))) %>% gather(key = y, value = r, -rep, -site) %>% mutate(x = "length") save(sst_ALL_event_cor, file = "data/sst_ALL_event_cor.Rdata")</code></pre> <pre class="r"><code>load("data/sst_ALL_event_cor.Rdata") ggplot(sst_ALL_event_cor, aes(x = y, y = r, fill = y)) + geom_boxplot() + geom_hline(aes(yintercept = 0), colour = "grey20", size = 1.2, linetype = "dashed") + scale_fill_viridis_d() + facet_wrap(~site, ncol = 1) + labs(x = NULL, y = "Pearson r") + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "bottom")</code></pre> <!-- The figure above shows us that the relationship between the length of all three reference time series and the resultant MHW metrics are similar. Specifically we see that mean duration and intensities of events has a strong (r~-0.5) relationship to the length of the time series. Meaning that longer the climatology period used is, the shorter and less intense MHWs tend to be. This was to be expected as the original hypothesis was that longer time series would have lower/smoother thresholds and so events detected with them would be longer and more intense. This pattern is a bit less clear for mean intensities. 
--> </div> <div id="confidence-intervals" class="section level3"> <h3>Confidence intervals</h3> <p>After looking at these correlations it seems as though somehow calculating confidence intervals on something would be worthwhile…</p> <pre class="r"><code># Calculate CIs using a bootstrapping technique to deal with the non-normal small sample sizes # df <- sst_ALL_both_event %>% # filter(lubridate::year(date_peak) >= 2012, # site == "WA", # trend == "detrended") %>% # select(year_index, duration, intensity_mean, intensity_max, intensity_cumulative) #%>% # select(site, trend, year_index, duration, intensity_mean, intensity_max, intensity_cumulative) #%>% # nest(-site, -trend) Bmean <- function(data, indices) { d <- data[indices] # allows boot to select sample return(mean(d)) } event_aov_CI <- function(df){ df_conf <- gather(df, key = "metric", value = "value", -year_index) %>% group_by(year_index, metric) %>% # ggplot(aes(x = value)) + # geom_histogram(aes(fill = metric)) + # facet_wrap(~metric, scales = "free_x") summarise(lower = boot.ci(boot(data=value, statistic=Bmean, R=1000), type = "basic")$basic[4], mid = mean(value), upper = boot.ci(boot(data=value, statistic=Bmean, R=1000), type = "basic")$basic[5], n = n()) %>% mutate_if(is.numeric, round, 4) return(df_conf) }</code></pre> </div> </div> <div id="categories" class="section level2"> <h2>Categories</h2> <p>In this section we want to look at how the categories in the different time periods compare. I’ll start out with doing basic calculations and comparisons of the categories of events with the different time periods. After that we will run <em>chi</em>-squared tests. 
Lastly a post-hoc test of sorts will be used to pull out exactly which categories differ significantly in count with different time series lengths.</p> <div id="event-count" class="section level3"> <h3>Event count</h3> <pre class="r"><code>load("data/sst_ALL_smooth.Rdata") suppressWarnings( sst_ALL_category <- sst_ALL_smooth %>% select(-events) %>% unnest(cats) %>% filter(year_index >= 10, peak_date >= "2009-01-01", peak_date <= "2018-12-31") ) # Indeed... but how to compare qualitative data? # Let's start out by counting the occurrence of the categories sst_ALL_clim_category_count <- sst_ALL_category %>% group_by(site, rep, year_index, category) %>% summarise(count = n()) %>% mutate(category = factor(category, levels = c("I Moderate", "II Strong", "III Severe", "IV Extreme"))) save(sst_ALL_clim_category_count, file = "data/sst_ALL_clim_category_count.Rdata")</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_clim_category_count.Rdata") # total_category_count <- sst_ALL_clim_category_count %>% # ungroup() %>% # select(-rep) %>% # group_by(site, category) %>% # summarise(count = sum(count)) ggplot(sst_ALL_clim_category_count, aes(x = year_index, y = count, fill = category)) + geom_bar(stat = "identity") + # geom_label(data = label_category_count, show.legend = F, fill = "white", # aes(label = count, y = 1000)) + scale_fill_manual(values = MHW_colours) + facet_wrap(~site, ncol = 1) + labs(x = "Tim series length (years)", y = "Total count") + theme(legend.position = "bottom")</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/cat-compare-1.png" alt="Bar plots showing the counts of the different categories of events faceted in a grid with different sites along the top and the different category classifications down the side. The colours of the bars denote the different climatology periods used. 
Note that the general trend is that more 'smaller' events are detected with a 10 year clim period, and more 'larger' events detected with the 30 year clim. The 20 year clim tends to rest in the middle." width="768" /> <p class="caption"> Bar plots showing the counts of the different categories of events faceted in a grid with different sites along the top and the different category classifications down the side. The colours of the bars denote the different climatology periods used. Note that the general trend is that more ‘smaller’ events are detected with a 10 year clim period, and more ‘larger’ events detected with the 30 year clim. The 20 year clim tends to rest in the middle. </p> </div> <p>The results in the figure above show that we see more larger events when the climatologies are based on 30 years of data, rather than 10, but that we tend to see more events overall when we use only 10 years of data. This is about what would be expected if there were a trend present in the data. So it is interesting that it still comes through.</p> </div> <div id="chi-squared" class="section level3"> <h3><em>chi</em>-squared</h3> <p>In this section we will use pairwise chi-squared tests to see if the count of events in the different re-sampled lengths differ significantly from the same re-sampled time series of 30 years (this serves as the control).
We will combine categories 3 and 4 together into one category for a more robust test and because we are interested in how these larger events stand out together against the smaller events.</p> <pre class="r"><code># Load and extract category data load("data/sst_ALL_smooth.Rdata") suppressWarnings( # Suppress warning about category levels not all being present sst_ALL_cat <- sst_ALL_smooth %>% select(-events) %>% unnest() ) # Prep the data for better chi-squared use sst_ALL_cat_prep <- sst_ALL_cat %>% filter(year_index >= 10, peak_date >= "2009-01-01", peak_date <= "2018-12-31") %>% select(year_index, rep, site, category) %>% mutate(category = ifelse(category == "III Severe", "III & IV", category), category = ifelse(category == "IV Extreme", "III & IV", category)) # Extract the true climatologies sst_ALL_cat_only_30 <- sst_ALL_cat_prep %>% filter(year_index == 30) %>% select(site, year_index, rep, category) # chi-squared pairwise function # Takes one rep of missing data and compares it against the complete data # df <- sst_ALL_miss_cat_prep %>% # filter(site == "WA", rep == "20", miss == 0.1) # df <- unnest(slice(sst_ALL_miss_cat_chi,1)) %>% # select(-site2, -rep, -miss2) chi_pair <- function(df){ # Prep data df_comp <- sst_ALL_cat_only_30 %>% filter(site == df$site[1], rep == df$rep[1]) df_joint <- rbind(df, df_comp) # Run tests res <- chisq.test(table(df_joint$year_index, df_joint$category), correct = FALSE) res_broom <- broom::augment(res) %>% mutate(p.value = broom::tidy(res)$p.value) return(res_broom) } # The pairwise chai-squared results suppressWarnings( # Suppress poor match warnings sst_ALL_cat_chi <- sst_ALL_cat_prep %>% filter(year_index != 30) %>% # mutate(miss = as.factor(miss)) %>% mutate(site2 = site, year_index2 = year_index, rep2 = rep) %>% group_by(site2, year_index2, rep2) %>% nest() %>% mutate(chi = map(data, chi_pair)) %>% select(-data) %>% unnest() %>% dplyr::rename(site = site2, year_index = year_index2, rep = rep2, year_comp = Var1, category 
= Var2) ) save(sst_ALL_cat_chi, file = "data/sst_ALL_cat_chi.Rdata")</code></pre> <pre class="r"><code>load("data/sst_ALL_cat_chi.Rdata") sst_ALL_cat_chi_sig <- sst_ALL_cat_chi %>% filter(p.value <= 0.05) %>% select(site, year_index, p.value) %>% group_by(site, year_index) %>% unique() %>% summarise(count = n()) knitr::kable(sst_ALL_cat_chi_sig, caption = "Table showing the count out of 100 for significant differences (_p_-value) for the count of different categories of events for each site based on the amount of missing data present.")</code></pre> <pre class="r"><code>load("~/MHWdetection/analysis/data/sst_ALL_cat_chi.Rdata") # Prep data for plotting chi_sig <- sst_ALL_cat_chi %>% filter(p.value <= 0.05) %>% select(site, year_index, p.value) %>% group_by(site, year_index) %>% unique() %>% summarise(count = n()) ggplot(chi_sig, aes(x = as.numeric(year_index), y = site)) + geom_line(aes(colour = count, group = site)) + geom_point(aes(colour = count)) + # facet_wrap(~metric, ncol = 1) + scale_colour_viridis_c() + scale_x_reverse() + labs(x = "Time series length (years)", y = NULL, colour = "Sig. dif.\nout of 100") + theme(axis.text.x = element_text(angle = 20))</code></pre> <div class="figure" style="text-align: center"> <img src="figure/time_series_duration.Rmd/chi-pair-plot-1.png" alt="Line graph showing the count of times out of 100 random replicates when a given time series length led to significant differences in the count of categories of MHWs as determined by a _chi_-squared test." width="768" /> <p class="caption"> Line graph showing the count of times out of 100 random replicates when a given time series length led to significant differences in the count of categories of MHWs as determined by a <em>chi</em>-squared test. </p> </div> <p>The line plot above shows at what point the category counts begin to differ significantly from the matching 30 year time series. It was expected that the <code>WA</code> time series would show sig. dif. 
more quickly than the other two, though I had expected the <code>Med</code> to be the slowest to change, not the <code>NW_Atl</code>. Over a sample of 100, there were never more than five replicates showing significant differences in count of categories though so it is relatively safe to say that the length of a time series has little effect on the count of events.</p> </div> <div id="post-hoc-chi-squared" class="section level3"> <h3>Post-hoc <em>chi</em>-squared</h3> <p>It is also necessary to see which of the categories specifically are differing significantly. Unfortunately there is no proper post-hoc <em>chi</em>-squared test. Instead it is possible to look at the standardised residuals for each category within the pairwise comparisons being made.</p> <p>I still need to do this…</p> </div> <div id="correlations" class="section level3"> <h3>Correlations</h3> <p>It would also be useful to look at the correlations between consecutive missing days of data and significantly different counts of categories.</p> </div> </div> <div id="can-we-fix-it" class="section level2"> <h2>Can we fix it?</h2> <div id="fourier-transform-climatologies" class="section level3"> <h3>Fourier transform climatologies</h3> <p>Lastly we will now go about reproducing all of the checks made above, but based on a climatology derived from a Fourier transformation, and not the default climatology creation method.</p> <p>(RWS: I’ve not gotten to the Fourier transformations yet either. I’m also not convinced that this is going to be the way forward. I am thinking that it may be better to just increase the smoothing window width to produce a more sinusoidal climatology line. But I suppose that in order to know that it must be tested.)</p> <p>(ECJO)[One advantage of the Fourier transform method (or, I think equivalently harmonic regression) is that it is VERY fast. Much faster than the method used in the MHW code. 
I also think you can probably get away with shorter time series using this method than with the “standard MHW climatology” method]</p> <p>(RWS)[I suppose then that I will need to look into this.]</p> </div> <div id="wider-windows" class="section level3"> <h3>Wider windows</h3> <p>Talk a bit here about changing the arguments for climatology creation to get them closer to the control.</p> </div> </div> <div id="best-practices" class="section level2"> <h2>Best practices</h2> <p>(RWS: I envision this last section distilling everything learned above into a nice bullet list. These bulletted items for each different vignette would then all be added up in one central <a href="https://robwschlegel.github.io/MHWdetection/articles/best_practices.html">best practices</a> vignette that would be ported over into the final write-up in the discussion/best-practices section of the paper. Ideally these best-practices could also be incorporated into the R/python distributions of the MHW detection code to allow users to make use of the findings from the paper in their own work.)</p> <div id="options-for-improving-climatology-precision-in-short-time-series" class="section level3"> <h3>Options for improving climatology precision in short time series</h3> </div> <div id="options-for-improving-mhw-metric-precision-in-short-time-series" class="section level3"> <h3>Options for improving MHW metric precision in short time series</h3> </div> </div> <div id="references" class="section level2"> <h2>References</h2> <div id="refs"> <div id="ref-Oliver2018"> <p>Oliver, Eric C.J., Markus G. Donat, Michael T. Burrows, Pippa J. Moore, Dan A. Smale, Lisa V. Alexander, Jessica A. Benthuysen, et al. 2018. “Longer and more frequent marine heatwaves over the past century.” <em>Nature Communications</em> 9 (1). doi:<a href="https://doi.org/10.1038/s41467-018-03732-9">10.1038/s41467-018-03732-9</a>.</p> </div> <div id="ref-Schlegel2016"> <p>Schlegel, Robert W., and Albertus J. Smit. 2016. 
“Climate Change in Coastal Waters: Time Series Properties Affecting Trend Estimation.” <em>Journal of Climate</em> 29 (24): 9113–24. doi:<a href="https://doi.org/10.1175/JCLI-D-16-0014.1">10.1175/JCLI-D-16-0014.1</a>.</p> </div> </div> </div> <div id="session-information" class="section level2"> <h2>Session information</h2> <pre class="r"><code>sessionInfo()</code></pre> <pre><code>R version 3.5.3 (2019-03-11) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 16.04.5 LTS Matrix products: default BLAS: /usr/lib/openblas-base/libblas.so.3 LAPACK: /usr/lib/libopenblasp-r0.2.18.so locale: [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8 [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8 [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] parallel stats graphics grDevices utils datasets methods [8] base other attached packages: [1] bindrcpp_0.2.2 doMC_1.3.5 iterators_1.0.10 [4] foreach_1.4.4 mgcv_1.8-24 nlme_3.1-137 [7] FNN_1.1.2.1 boot_1.3-20 ggpubr_0.1.8 [10] magrittr_1.5 fasttime_1.0-2 lubridate_1.7.4 [13] heatwaveR_0.3.6.9003 data.table_1.11.6 broom_0.5.0 [16] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6 [19] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1 [22] tibble_1.4.2 ggplot2_3.0.0 tidyverse_1.2.1 loaded via a namespace (and not attached): [1] Rcpp_0.12.18 lattice_0.20-35 assertthat_0.2.0 [4] rprojroot_1.3-2 digest_0.6.16 R6_2.2.2 [7] cellranger_1.1.0 plyr_1.8.4 backports_1.1.2 [10] evaluate_0.11 highr_0.7 httr_1.3.1 [13] pillar_1.3.0 rlang_0.2.2 lazyeval_0.2.1 [16] readxl_1.1.0 rstudioapi_0.7 whisker_0.3-2 [19] R.utils_2.7.0 R.oo_1.22.0 Matrix_1.2-14 [22] rmarkdown_1.10 labeling_0.3 htmlwidgets_1.2 [25] munsell_0.5.0 compiler_3.5.3 modelr_0.1.2 [28] pkgconfig_2.0.2 htmltools_0.3.6 tidyselect_0.2.4 [31] workflowr_1.1.1 codetools_0.2-15 viridisLite_0.3.0 [34] crayon_1.3.4 withr_2.1.2 R.methodsS3_1.7.1 [37] grid_3.5.3 jsonlite_1.5 
gtable_0.2.0 [40] git2r_0.23.0 scales_1.0.0 cli_1.0.0 [43] stringi_1.2.4 reshape2_1.4.3 padr_0.4.1 [46] xml2_1.2.0 tools_3.5.3 glue_1.3.0 [49] hms_0.4.2 yaml_2.2.0 colorspace_1.3-2 [52] rvest_0.3.2 plotly_4.8.0 knitr_1.20 [55] bindr_0.1.1 haven_1.1.2 </code></pre> </div> <!-- Adjust MathJax settings so that all math formulae are shown using TeX fonts only; see http://docs.mathjax.org/en/latest/configuration.html. This will make the presentation more consistent at the cost of the webpage sometimes taking slightly longer to load. Note that this only works because the footer is added to webpages before the MathJax javascript. --> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <hr> <p> This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a> analysis was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1 </p> <hr> </div> </div> </div> <script> // add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); }); </script> <!-- dynamically load mathjax for compatibility with self-contained --> <script> (function () { var script = document.createElement("script"); script.type = "text/javascript"; script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })(); </script> </body> </html>