Mash vs. Flash

Last updated: 2018-06-11

workflowr checks: (Click a bullet for more information)

✔ R Markdown file: up-to-date

Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.
✔ Environment: empty

Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.
✔ Seed: set.seed(20180609)

The command set.seed(20180609) was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.
✔ Session information: recorded

Great job! Recording the operating system, R version, and package versions is critical for reproducibility.
✔ Repository version: 485717c
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated.

Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use wflow_publish or wflow_git_commit). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
```
Ignored files:
    Ignored:    .DS_Store
    Ignored:    .Rproj.user/
```
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes.

Expand here to see past versions:

File	Version	Author	Date	Message
Rmd	485717c	Jason Willwerscheid	2018-06-11	wflow_publish(“analysis/mashvflash.Rmd”)
html	a1180b5	Jason Willwerscheid	2018-06-11	Build site.
Rmd	7628520	Jason Willwerscheid	2018-06-11	wflow_publish(“analysis/mashvflash.Rmd”)
html	624de1d	Jason Willwerscheid	2018-06-11	Build site.
Rmd	ea2721a	Jason Willwerscheid	2018-06-11	wflow_publish(“analysis/mashvflash.Rmd”)
html	9f81f06	Jason Willwerscheid	2018-06-11	Build site.
Rmd	f5750b8	Jason Willwerscheid	2018-06-11	wflow_publish(“analysis/mashvflash.Rmd”)
html	17f4dfb	Jason Willwerscheid	2018-06-11	Build site.
Rmd	b47adc3	Jason Willwerscheid	2018-06-11	wflow_publish(“analysis/mashvflash.Rmd”)
html	6508f2b	Jason Willwerscheid	2018-06-09	Build site.
Rmd	861d4b3	Jason Willwerscheid	2018-06-09	readd mashvflash analysis
html	861d4b3	Jason Willwerscheid	2018-06-09	readd mashvflash analysis
Rmd	a76f3cf	Jason Willwerscheid	2018-06-09	add mashvflash rmd
html	a76f3cf	Jason Willwerscheid	2018-06-09	add mashvflash rmd

Code

Setup code.

# FLASH v MASH ------------------------------------------------------
flash_v_mash <- function(Y, true_Y, nfactors) {
  data <- flash_set_data(Y, S = 1)
  res <- list()

  t <- Sys.time()
  fl <- fit_flash(data, nfactors)
  res$fl_time <- Sys.time() - t

  t <- Sys.time()
  m <- fit_mash(Y)
  res$m_time <- Sys.time() - t

  # Sample from FLASH fit
  fl_sampler <- flash_lf_sampler(Y, fl, ebnm_fn=ebnm_pn, fixed="factors")

  nsamp <- 200
  fl_samp <- fl_sampler(nsamp)

  res$fl_mse <- flash_pm_mse(fl_samp, true_Y)
  res$m_mse <- mash_pm_mse(m, true_Y)
  res$fl_ci <- flash_ci_acc(fl_samp, true_Y)
  res$m_ci <- mash_ci_acc(m, true_Y)
  res$fl_lfsr <- flash_lfsr(fl_samp, true_Y)
  res$m_lfsr <-  mash_lfsr(m, true_Y)
  res
}

plot_res <- function(res) {
  old_par <- par("mfrow")
  par(mfrow=c(1, 2))
  x <- seq(0.025, 0.475, by=0.05)
  plot(x, res$fl_lfsr, type='l', ylim=c(0, 0.6), xlab="FLASH", ylab="lfsr")
  abline(0, 1)
  plot(x, res$m_lfsr, type='l', ylim=c(0, 0.6), xlab="MASH", ylab="lfsr")
  abline(0, 1)
  par(mfrow=old_par)
}


# Fit using FLASH ---------------------------------------------------
fit_flash <- function(data, nfactors) {
  p <- ncol(data$Y)
  fl <- flash_add_greedy(data, nfactors, var_type = "zero")
  fl <- flash_add_fixed_f(data, diag(rep(1, p)), fl)
  flash_backfit(data, fl, nullcheck = F, var_type = "zero")
}

# Fit using MASH ---------------------------------------------------
fit_mash <- function(Y) {
  data <- mash_set_data(Y)
  U.c = cov_canonical(data)
  m.1by1 <- mash_1by1(data)
  strong <- get_significant_results(m.1by1, 0.05)
  U.pca <- cov_pca(data, 5, strong)
  U.ed <- cov_ed(data, U.pca, strong)
  mash(data, c(U.c,U.ed))
}


# MSE of posterior means (FLASH) ------------------------------------
flash_pm_mse <- function(fl_samp, true_Y) {
  n <- nrow(true_Y)
  p <- ncol(true_Y)
  nsamp <- length(fl_samp)

  post_means <- matrix(0, nrow=n, ncol=p)
  for (i in 1:nsamp) {
    post_means <- post_means + fl_samp[[i]]
  }
  post_means <- post_means / nsamp
  sum((post_means - true_Y)^2) / (n * p)
}
# Compare with just using FLASH LF:
# sum((flash_get_lf(fl)- true_flash_Y)^2) / (n * p)


# MSE for MASH ------------------------------------------------------
mash_pm_mse <- function(m, true_Y) {
  n <- nrow(true_Y)
  p <- ncol(true_Y)
  sum((get_pm(m) - true_Y)^2) / (n * p)
}


# CI coverage for FLASH ---------------------------------------------
flash_ci_acc <- function(fl_samp, true_Y) {
  n <- nrow(true_Y)
  p <- ncol(true_Y)
  nsamp <- length(fl_samp)

  flat_samp <- matrix(0, nrow=n*p, ncol=nsamp)
  for (i in 1:nsamp) {
    flat_samp[, i] <- as.vector(fl_samp[[i]])
  }
  CI <- t(apply(flat_samp, 1, function(x) {quantile(x, c(0.025, 0.975))}))
  sum((as.vector(true_Y) > CI[, 1])
      & (as.vector(true_Y < CI[, 2]))) / (n * p)
}

# CI coverage for MASH ----------------------------------------------
mash_ci_acc <- function(m, true_Y) {
  sum((true_Y > get_pm(m) - 1.96 * get_psd(m))
      & (true_Y < get_pm(m) + 1.96 * get_psd(m))) / (n * p)
}


# LFSR for FLASH ----------------------------------------------------
flash_lfsr <- function(fl_samp, true_Y, step=0.05) {
  n <- nrow(true_Y)
  p <- ncol(true_Y)
  nsamp <- length(fl_samp)

  lfsr <- matrix(0, nrow=n, ncol=p)
  for (i in 1:nsamp) {
    lfsr <- lfsr + (fl_samp[[i]] > 0) + 0.5*(fl_samp[[i]] == 0)
  }
  signs <- lfsr >= nsamp / 2
  correct_signs <- true_Y > 0
  gotitright <- signs == correct_signs
  lfsr <- pmin(lfsr, 100 - lfsr) / 100

  nsteps <- floor(.5 / step)
  fsr_by_lfsr <- rep(0, nsteps)
  for (k in 1:nsteps) {
    idx <- (lfsr >= (step * (k - 1)) & lfsr < (step * k))
    fsr_by_lfsr[k] <- ifelse(sum(idx) == 0, 0,
                             1 - sum(gotitright[idx]) / sum(idx))
  }
  fsr_by_lfsr
}


# LFSR for MASH -----------------------------------------------------
mash_lfsr <- function(m, true_Y, step=0.05) {
  lfsr <- get_lfsr(m)
  signs <- get_pm(m) > 0
  correct_signs <- true_Y > 0
  gotitright <- signs == correct_signs

  nsteps <- floor(.5 / step)
  fsr_by_lfsr <- rep(0, nsteps)
  for (k in 1:nsteps) {
    idx <- (lfsr >= (step * (k - 1)) & lfsr < (step * k))
    fsr_by_lfsr[k] <- ifelse(sum(idx) == 0, 0,
                             1 - sum(gotitright[idx]) / sum(idx))
  }
  fsr_by_lfsr
}

Augmented FLASH simulation.

# Simulate from FLASH model -----------------------------------------
n <- 1000
p <- 10
flash_factors <- 5

# Use one factor of all ones and one more interesting factor
nfactors <- 2
k <- p + nfactors
ff <- matrix(0, nrow=k, ncol=p)
ff[1, ] <- rep(10, p)
ff[2, ] <- c(seq(10, 2, by=-2), rep(0, p - 5))
diag(ff[3:k, ]) <- 3
ll <- matrix(rnorm(n * k), nrow=n, ncol=k)
true_flash_Y <- ll %*% ff
flash_Y <- true_flash_Y + rnorm(n*p)
# RESULTS
flash_res <- flash_v_mash(flash_Y, true_flash_Y, flash_factors)

fitting factor/loading 1

fitting factor/loading 2

fitting factor/loading 3

fitting factor/loading 4

fitting factor/loading 5

 - Computing 1000 x 463 likelihood matrix.
 - Likelihood calculations took 0.12 seconds.
 - Fitting model with 463 mixture components.
 - Model fitting took 0.26 seconds.
 - Computing posterior matrices.
 - Computation allocated took 0.03 seconds.

FLASH simulation.

# Simulate from basic FLASH model -----------------------------------
ff <- ff[1:nfactors, ]
ll <- matrix(rnorm(n * nfactors), nrow=n, ncol=nfactors)
true_basic_Y <- ll %*% ff
basic_Y <- true_basic_Y + rnorm(n*p)
# RESULTS
basic_res <- flash_v_mash(basic_Y, true_basic_Y, flash_factors)

fitting factor/loading 1

fitting factor/loading 2

fitting factor/loading 3

 - Computing 1000 x 463 likelihood matrix.
 - Likelihood calculations took 0.12 seconds.
 - Fitting model with 463 mixture components.

Warning in REBayes::KWDual(A, rep(1, k), normalize(w), control = control): estimated mixing distribution has some negative values:
               consider reducing rtol

Warning in mixIP(matrix_lik = structure(c(4.36775456036503e-15, 0, 0,
0, : Optimization step yields mixture weights that are either too small,
or negative; weights have been corrected and renormalized after the
optimization.

 - Model fitting took 0.24 seconds.
 - Computing posterior matrices.
 - Computation allocated took 0.29 seconds.

MASH simulation.

# Simulate from MASH model ------------------------------------------
Sigma <- list()
Sigma[[1]] <- matrix(1, nrow=p, ncol=p)
Sigma[[2]] <- matrix(0, nrow=p, ncol=p)
for (i in 1:p) {
  for (j in 1:p) {
    Sigma[[2]][i, j] <- max(1 - abs(i - j) / 4, 0)
  }
}
for (k in 1:p) {
  Sigma[[k + 2]] <- matrix(0, nrow=p, ncol=p)
  Sigma[[k + 2]][k, k] <- 1
}
which_sigma <- sample(1:12, 1000, T, prob=c(.3, .3, rep(.4/p, p)))
true_mash_Y <- matrix(0, nrow=n, ncol=p)
for (i in 1:n) {
  true_mash_Y[i, ] <- 5*mvrnorm(1, rep(0, p), Sigma[[which_sigma[i]]])
}
mash_Y <- true_mash_Y + rnorm(n * p)
# RESULTS
mash_res <- flash_v_mash(mash_Y, true_mash_Y, flash_factors)

fitting factor/loading 1

fitting factor/loading 2

fitting factor/loading 3

fitting factor/loading 4

fitting factor/loading 5

 - Computing 1000 x 400 likelihood matrix.
 - Likelihood calculations took 0.11 seconds.
 - Fitting model with 400 mixture components.
 - Model fitting took 0.58 seconds.
 - Computing posterior matrices.
 - Computation allocated took 0.03 seconds.

Summary

In each case below, I follow the vignettes to produce a MASH fit (I use both canonical and data-driven covariance matrices). I fit a FLASH object (fixing the standard errors) by adding up to 10 factors greedily, then adding \(p\) fixed one-hot vectors, and finally backfitting.

The two fits perform similarly. The MASH fit does somewhat better on data generated from the MASH model; more surprisingly, it performs comparably to FLASH on data generated from both the standard two-factor FLASH model. Both do poorly on the “augmented FLASH model” (described below), with MSEs near 1 (which would be obtained by simply using \(Y\) as an estimate).

Flash Model

First I simulate from the basic FLASH model \(Y = LF + E\) with \(E_{ij} \sim N(0, 1)\). Here, \(Y \in \mathbb{R}^{1000 \times 10}\), \(L \in \mathbb{R}^{1000 \times 2}\) has i.i.d. \(N(0, 1)\) entries, and \(F\) is as follows:

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]   10   10   10   10   10   10   10   10   10    10
[2,]   10    8    6    4    2    0    0    0    0     0

The MSE of the FLASH fit is 0.2, vs. 0.21 for the MASH fit. The proportion of 95% confidence intervals that contain the true value \(LF_{ij}\) is 0.94 for FLASH and 0.96 for MASH. The true false sign rate vs lfsr appears as follows:

Expand here to see past versions of lfsr1-1.png:

Version	Author	Date
624de1d	Jason Willwerscheid	2018-06-11
9f81f06	Jason Willwerscheid	2018-06-11
17f4dfb	Jason Willwerscheid	2018-06-11
6508f2b	Jason Willwerscheid	2018-06-09
861d4b3	Jason Willwerscheid	2018-06-09
a76f3cf	Jason Willwerscheid	2018-06-09

The FLASH fit took 0.61 s. The MASH fit took 8.86 s.

Augmented Flash Model

Next I simulate from the “augmented” FLASH model \[ Y = L \begin{pmatrix} F \\ 3I_{10} \end{pmatrix} + E \] with \(F\) as above.

The MSE of the FLASH fit is 0.93, vs. 1.05 for the MASH fit. The proportion of 95% confidence intervals that contain the true value is 0.94 for FLASH and 0.93 for MASH. The true false sign rate vs lfsr appears as follows:

Expand here to see past versions of lfsr2-1.png:

Version	Author	Date
a1180b5	Jason Willwerscheid	2018-06-11
624de1d	Jason Willwerscheid	2018-06-11
9f81f06	Jason Willwerscheid	2018-06-11
17f4dfb	Jason Willwerscheid	2018-06-11
6508f2b	Jason Willwerscheid	2018-06-09
861d4b3	Jason Willwerscheid	2018-06-09
a76f3cf	Jason Willwerscheid	2018-06-09

The FLASH fit took 18.36 s. The MASH fit took 3.57 s.

MASH Model

Finally I simulate from the MASH model \[ X \sim \sum \pi_i N(0, \Sigma_i),\ Y = X + E \] with \(E_{ij} \sim N(0, 1)\). I set \(\Sigma_1\) to be the all ones matrix, \(\Sigma_2\) to be a banded covariance matrix with non-zero entries on the first three off-diagonals, and \(\Sigma_3\) through \(\Sigma_{12}\) to have a single non-zero entry (corresponding to tissue-specific effects). \(\pi\) is set to \((0.3, 0.3, 0.04, 0.04, \ldots, 0.04)\).

The MSE of the FLASH fit is 0.56, vs. 0.43 for the MASH fit. The proportion of 95% confidence intervals that contain the true value is 0.9 for FLASH and 0.94 for MASH. The true false sign rate vs lfsr appears as follows:

Expand here to see past versions of lfsr3-1.png:

Version	Author	Date
a1180b5	Jason Willwerscheid	2018-06-11
624de1d	Jason Willwerscheid	2018-06-11
9f81f06	Jason Willwerscheid	2018-06-11
6508f2b	Jason Willwerscheid	2018-06-09
861d4b3	Jason Willwerscheid	2018-06-09
a76f3cf	Jason Willwerscheid	2018-06-09

The FLASH fit took 27.86 s. The MASH fit took 3.38 s.

Session information

sessionInfo()

R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] MASS_7.3-48  mashr_0.2-7  ashr_2.2-7   flashr_0.5-8

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.16             pillar_1.2.1
 [3] plyr_1.8.4               compiler_3.4.3
 [5] git2r_0.21.0             workflowr_1.0.1
 [7] R.methodsS3_1.7.1        R.utils_2.6.0
 [9] iterators_1.0.9          tools_3.4.3
[11] testthat_2.0.0           digest_0.6.15
[13] tibble_1.4.2             evaluate_0.10.1
[15] memoise_1.1.0            gtable_0.2.0
[17] lattice_0.20-35          rlang_0.2.0
[19] Matrix_1.2-12            foreach_1.4.4
[21] commonmark_1.4           yaml_2.1.17
[23] parallel_3.4.3           mvtnorm_1.0-7
[25] ebnm_0.1-11              withr_2.1.1.9000
[27] stringr_1.3.0            roxygen2_6.0.1.9000
[29] xml2_1.2.0               knitr_1.20
[31] REBayes_1.2              devtools_1.13.4
[33] rprojroot_1.3-2          grid_3.4.3
[35] R6_2.2.2                 rmarkdown_1.8
[37] rmeta_3.0                ggplot2_2.2.1
[39] magrittr_1.5             whisker_0.3-2
[41] backports_1.1.2          scales_0.5.0
[43] codetools_0.2-15         htmltools_0.3.6
[45] assertthat_0.2.0         softImpute_1.4
[47] colorspace_1.3-2         stringi_1.1.6
[49] Rmosek_7.1.3             lazyeval_0.2.1
[51] munsell_0.4.3            doParallel_1.0.11
[53] pscl_1.5.2               truncnorm_1.0-8
[55] SQUAREM_2017.10-1        ExtremeDeconvolution_1.3
[57] R.oo_1.21.0

This reproducible R Markdown analysis was created with workflowr 1.0.1