library(heatmaply)
library(dendextend)

1 Measles

1.1 How the data was prepared

Here eval=FALSE as we have the final version in the heatmaplyExamples package.

Preparing the data:

# measles
# bil gates (20 Feb 2015): https://twitter.com/BillGates/status/568749655112228865
# http://graphics.wsj.com/infectious-diseases-and-vaccines/#b02g20t20w15
# http://www.opiniomics.org/recreating-a-famous-visualisation/
# http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r-2/




# source: http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r/
# measles <- read.csv("https://raw.githubusercontent.com/blmoore/blogR/master/data/measles_incidence.csv", header=T, 
#                     skip=2, stringsAsFactors=F)
# dir.create("data")
# write.csv(measles, file = "data\\measles.csv")
# measles[measles == "-"] <- NA
measles[,-(1:2)] <- apply(measles[,-(1:2)], 2, as.numeric)

dim(measles)
# head(measles)

d2 <- aggregate(measles, list(measles[, "YEAR"]), "sum", na.rm = T)
rownames(measles) <- d2[,1]
d2 <- d2[, -c(1:4)]
d2 <- t(measles)
# head(measles)

dim(measles)
d2[1:5,1:5]

# head(md2)
# 
# # per week
# d22 <- aggregate(measles, list(measles$YEAR, measles$WEEK), "sum", na.rm = T)
# rownames(measles2) <- paste(measles2[,1], d22[,2], sep = "_")
# d22 <- d22[, -c(1:5)]
# d22 <- t(measles2)
# md22 <- reshape2::melt(measles2)
# dim(measles2) # 51 3952
# pander::pander(sqrt(measles))
if(!file.exists("data\\sqrt_d2.csv")) write.csv(sqrt(measles), file = "data\\sqrt_d2.csv")

Let’s have measles in a long format.

library(heatmaplyExamples)
data(measles)
dim(measles)
#> [1] 59 72
md2 <- reshape2::melt(measles)
head(md2)
#>             Var1 Var2 value
#> 1        ALABAMA 1930  4389
#> 2         ALASKA 1930     0
#> 3 AMERICAN.SAMOA 1930     0
#> 4        ARIZONA 1930  2107
#> 5       ARKANSAS 1930   996
#> 6     CALIFORNIA 1930 43585

1.2 Which transformation to use?

We can look at the general timeline trend with different transformations.

library(ggplot2)
ggplot(md2, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(md2, aes(x = Var2, y = sqrt(value))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(md2, aes(x = Var2, y = log(value+.1))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# ggplot(md22, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) # + geom_smooth(size = 2)

#
# we miss the per state information and the patters of similarity

We can try several heatmaps settings until we get one that works. Notice the effect of the color scheme.

library(gplots)
library(viridis)
heatmap.2(measles)

heatmap.2(measles, margins = c(3, 9))

heatmap.2(measles,trace = "none", margins = c(3, 9))

heatmap.2(measles, trace = "none", margins = c(3, 9), dendrogram = "row", Colv = NULL)

# heatmap.2(sqrt(measles), Colv = NULL, trace = "none", margins = c(3, 9))
library(viridis)
heatmap.2(measles, Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(measles, Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.

heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.

# hist(measles, col = )

library(ggplot2)
library(viridis)
hp <- qplot(x = as.numeric(measles), fill = ..count.., geom = "histogram")
hp + scale_fill_viridis(direction = -1) + xlab("Measles incedence") + theme_bw()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using scaling doesn’t make much sense in this data.

heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9),
          col = viridis(200), 
          scale = "row"
          )

heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9),
          col = viridis(200), 
          scale = "column"
          )

1.3 Clustering

Decisions for clustering (highlight_branches helps identify the topology of the dendrogram, it colors each branch based on its height):

DATA <- measles
d <- dist(sqrt(DATA))

library(dendextend)
dend_expend(d)[[3]]
#>   dist_methods hclust_methods     optim
#> 1      unknown         ward.D 0.8435986
#> 2      unknown        ward.D2 0.8499247
#> 3      unknown         single 0.8600853
#> 4      unknown       complete 0.8656224
#> 5      unknown        average 0.8774877
#> 6      unknown       mcquitty 0.8447603
#> 7      unknown         median 0.8682800
#> 8      unknown       centroid 0.8508883
# average seems to make the most sense

dend_row <- d %>% hclust(method = "average") %>% as.dendrogram 
dend_row %>% highlight_branches %>% plot

What would be the optimal number of clusters? (it seems that 3 would be good)

library(dendextend)

dend_k <- find_k(dend_row)
plot(dend_k)

Let’s plot it:

Rowv <- dend_row %>% color_branches(k = 3)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.

Rowv <- dend_row %>% color_branches(k = 3) %>% seriate_dendrogram(x=d)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.

1.4 heatmaply

All of this work (other then finding that the hclust method should be average) can simply be done using the following code to produce an interactive heatmap using heatmaply:

library(heatmaply)
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average", 
          fontsize_row = 8,fontsize_col = 6,
          k_row = NA, margins = c(60,170, 70,40),
          xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine"
        ) 

We can also use plot_method = “plotly”, row_dend_left = TRUE to make the plot more similar to heatmap.2 using the plotly engine (this is a bit more experimental. The default, plot_method = “ggplot”, is often safer to use - but it does not yet support row_dend_left = TRUE properly).

heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average", 
          fontsize_row = 8,fontsize_col = 6,
          k_row = NA, margins = c(60,50, 70,90),
          xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine",
          plot_method = "plotly", row_dend_left = TRUE
        ) 
# save.image("example.Rdata")

2 sessionInfo

sessionInfo()
#> R version 3.4.1 (2017-06-30)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 7 x64 (build 7601) Service Pack 1
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=Hebrew_Israel.1255  LC_CTYPE=Hebrew_Israel.1255   
#> [3] LC_MONETARY=Hebrew_Israel.1255 LC_NUMERIC=C                  
#> [5] LC_TIME=Hebrew_Israel.1255    
#> 
#> attached base packages:
#> [1] parallel  stats     graphics  grDevices datasets  utils     methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] gplots_3.0.1            limma_3.22.7           
#>  [3] ALL_1.7.1               Biobase_2.26.0         
#>  [5] BiocGenerics_0.12.1     dendextend_1.6.0       
#>  [7] glmnet_2.0-10           foreach_1.4.3          
#>  [9] Matrix_1.2-10           bindrcpp_0.2           
#> [11] heatmaplyExamples_0.1.0 heatmaply_0.11.0       
#> [13] viridis_0.4.0           viridisLite_0.2.0      
#> [15] plotly_4.7.1            ggplot2_2.2.1.9000     
#> [17] knitr_1.16              installr_0.19.0        
#> [19] stringr_1.2.0          
#> 
#> loaded via a namespace (and not attached):
#>  [1] httr_1.2.1         tidyr_0.6.3        jsonlite_1.5      
#>  [4] gtools_3.5.0       shiny_1.0.3        assertthat_0.2.0  
#>  [7] stats4_3.4.1       yaml_2.1.14        robustbase_0.92-7 
#> [10] backports_1.1.0    lattice_0.20-35    glue_1.1.1        
#> [13] digest_0.6.12      RColorBrewer_1.1-2 colorspace_1.3-2  
#> [16] httpuv_1.3.5       htmltools_0.3.6    plyr_1.8.4        
#> [19] pkgconfig_2.0.1    devtools_1.13.2    xtable_1.8-2      
#> [22] purrr_0.2.2.2      mvtnorm_1.0-6      scales_0.5.0      
#> [25] gdata_2.18.0       whisker_0.3-2      tibble_1.3.4      
#> [28] mgcv_1.8-17        withr_2.0.0        nnet_7.3-12       
#> [31] lazyeval_0.2.0     mime_0.5           magrittr_1.5      
#> [34] mclust_5.3         memoise_1.1.0      evaluate_0.10.1   
#> [37] nlme_3.1-131       MASS_7.3-47        class_7.3-14      
#> [40] tools_3.4.1        registry_0.3       data.table_1.10.4 
#> [43] trimcluster_0.1-2  kernlab_0.9-25     munsell_0.4.3     
#> [46] cluster_2.0.6      fpc_2.1-10         compiler_3.4.1    
#> [49] caTools_1.17.1     rlang_0.1.2        grid_3.4.1        
#> [52] iterators_1.0.8    htmlwidgets_0.9    crosstalk_1.0.0   
#> [55] labeling_0.3       bitops_1.0-6       rmarkdown_1.6     
#> [58] gtable_0.2.0       codetools_0.2-15   flexmix_2.3-14    
#> [61] TSP_1.1-5          reshape2_1.4.2     R6_2.2.2          
#> [64] seriation_1.2-2    gridExtra_2.2.1    prabclus_2.2-6    
#> [67] dplyr_0.7.3.9000   bindr_0.1          rprojroot_1.2     
#> [70] KernSmooth_2.23-15 modeltools_0.2-21  stringi_1.1.5     
#> [73] Rcpp_0.12.12       gclus_1.3.1        DEoptimR_1.0-8    
#> [76] diptest_0.75-7