<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta charset="utf-8" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="generator" content="pandoc" /> <meta name="author" content="Briana Mittleman" /> <title>ChromHMM analysis</title> <script src="site_libs/jquery-1.11.3/jquery.min.js"></script> <meta name="viewport" content="width=device-width, initial-scale=1" /> <link href="site_libs/bootstrap-3.3.5/css/journal.min.css" rel="stylesheet" /> <script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script> <script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script> <link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" /> <script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script> <script src="site_libs/navigation-1.1/tabsets.js"></script> <link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" /> <script src="site_libs/highlightjs-9.12.0/highlight.js"></script> <link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" /> <script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script> <script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script> <style type="text/css">code{white-space: pre;}</style> <style type="text/css"> pre:not([class]) { background-color: white; } </style> <script type="text/javascript"> if (window.hljs) { hljs.configure({languages: []}); hljs.initHighlightingOnLoad(); if (document.readyState && document.readyState === "complete") { window.setTimeout(function() { hljs.initHighlighting(); }, 0); } } </script> <style type="text/css"> h1 { font-size: 34px; } h1.title { font-size: 38px; } h2 { font-size: 30px; } h3 { font-size: 24px; } h4 { font-size: 18px; } h5 { font-size: 16px; } h6 { font-size: 12px; } .table th:not([align]) { text-align: left; } </style> </head> <body> <style type = "text/css"> .main-container { max-width: 940px; margin-left: auto; margin-right: auto; } code { color: inherit; background-color: rgba(0, 0, 0, 0.04); } img { max-width:100%; height: auto; } .tabbed-pane { padding-top: 12px; } .html-widget { margin-bottom: 20px; } button.code-folding-btn:focus { outline: none; } </style> <style type="text/css"> /* padding for bootstrap navbar */ body { padding-top: 51px; padding-bottom: 40px; } /* offset scroll position for anchor links (for fixed navbar) */ .section h1 { padding-top: 56px; margin-top: -56px; } .section h2 { padding-top: 56px; margin-top: -56px; } .section h3 { padding-top: 56px; margin-top: -56px; } .section h4 { padding-top: 56px; margin-top: -56px; } .section h5 { padding-top: 56px; margin-top: -56px; } .section h6 { padding-top: 56px; margin-top: -56px; } </style> <script> // manage active state of menu based on current page $(document).ready(function () { // active menu anchor href = window.location.pathname href = href.substr(href.lastIndexOf('/') + 1) if (href === "") href = "index.html"; var menuAnchor = $('a[href="' + href + '"]'); // mark it active menuAnchor.parent().addClass('active'); // if it's got a parent navbar menu mark it active as well menuAnchor.closest('li.dropdown').addClass('active'); }); </script> <div class="container-fluid main-container"> <!-- tabsets --> <script> $(document).ready(function () { window.buildTabsets("TOC"); }); </script> <!-- code folding --> <script> $(document).ready(function () { // move toc-ignore selectors from section div to header $('div.section.toc-ignore') .removeClass('toc-ignore') .children('h1,h2,h3,h4,h5').addClass('toc-ignore'); // establish options var options = { selectors: "h1,h2,h3", theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); }, ignoreSelector: ".toc-ignore", scrollTo: 0 }; options.showAndHide = true; options.smoothScroll = true; // tocify var toc = $("#TOC").tocify(options).data("toc-tocify"); }); </script> <style type="text/css"> #TOC { margin: 25px 0px 20px 0px; } @media (max-width: 768px) { #TOC { position: relative; width: 100%; } } .toc-content { padding-left: 30px; padding-right: 40px; } div.main-container { max-width: 1200px; } div.tocify { width: 20%; max-width: 260px; max-height: 85%; } @media (min-width: 768px) and (max-width: 991px) { div.tocify { width: 25%; } } @media (max-width: 767px) { div.tocify { width: 100%; max-width: none; } } .tocify ul, .tocify li { line-height: 20px; } .tocify-subheader .tocify-item { font-size: 0.90em; padding-left: 25px; text-indent: 0; } .tocify .list-group-item { border-radius: 0px; } </style> <!-- setup 3col/9col grid for toc_float and main content --> <div class="row-fluid"> <div class="col-xs-12 col-sm-4 col-md-3"> <div id="TOC" class="tocify"> </div> </div> <div class="toc-content col-xs-12 col-sm-8 col-md-9"> <div class="navbar navbar-default navbar-fixed-top" role="navigation"> <div class="container"> <div class="navbar-header"> <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar"> <span class="icon-bar"></span> <span class="icon-bar"></span> <span class="icon-bar"></span> </button> <a class="navbar-brand" href="index.html">Three Prime Sequencing in Human LCLs</a> </div> <div id="navbar" class="navbar-collapse collapse"> <ul class="nav navbar-nav"> <li> <a href="index.html">Home</a> </li> <li> <a href="about.html">About</a> </li> <li> <a href="license.html">License</a> </li> </ul> <ul class="nav navbar-nav navbar-right"> <li> <a href="https://github.com/brimittleman/threeprimeseq"> <span class="fa fa-github"></span> </a> </li> </ul> </div><!--/.nav-collapse --> </div><!--/.container --> </div><!--/.navbar --> <!-- Add a small amount of space between sections. --> <style type="text/css"> div.section { padding-top: 12px; } </style> <div class="fluid-row" id="header"> <h1 class="title toc-ignore">ChromHMM analysis</h1> <h4 class="author"><em>Briana Mittleman</em></h4> <h4 class="date"><em>11/7/2018</em></h4> </div> <p><strong>Last updated:</strong> 2018-11-15</p> <strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small> <ul> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p> <p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p> <p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(12345)</code> </summary></p> <p>The command <code>set.seed(12345)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p> <p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/brimittleman/threeprimeseq/tree/acf77f83b8c8ee87e9e8d4898adb929690701512" target="_blank">acf77f8</a> </summary></p> Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated: <pre><code> Ignored files: Ignored: .DS_Store Ignored: .Rhistory Ignored: .Rproj.user/ Ignored: data/.DS_Store Ignored: output/.DS_Store Untracked files: Untracked: KalistoAbundance18486.txt Untracked: analysis/ncbiRefSeq_sm.sort.mRNA.bed Untracked: analysis/snake.config.notes.Rmd Untracked: analysis/verifyBAM.Rmd Untracked: data/18486.genecov.txt Untracked: data/APApeaksYL.total.inbrain.bed Untracked: data/ChromHmmOverlap/ Untracked: data/GM12878.chromHMM.bed Untracked: data/GM12878.chromHMM.txt Untracked: data/NuclearApaQTLs.txt Untracked: data/PeaksUsed/ Untracked: data/RNAkalisto/ Untracked: data/TotalApaQTLs.txt Untracked: data/Totalpeaks_filtered_clean.bed Untracked: data/YL-SP-18486-T-combined-genecov.txt Untracked: data/YL-SP-18486-T_S9_R1_001-genecov.txt Untracked: data/apaExamp/ Untracked: data/bedgraph_peaks/ Untracked: data/bin200.5.T.nuccov.bed Untracked: data/bin200.Anuccov.bed Untracked: data/bin200.nuccov.bed Untracked: data/clean_peaks/ Untracked: data/comb_map_stats.csv Untracked: data/comb_map_stats.xlsx Untracked: data/comb_map_stats_39ind.csv Untracked: data/combined_reads_mapped_three_prime_seq.csv Untracked: data/diff_iso_trans/ Untracked: data/ensemble_to_genename.txt Untracked: data/example_gene_peakQuant/ Untracked: data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.bed Untracked: data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.noties.bed Untracked: data/first50lines_closest.txt Untracked: data/gencov.test.csv Untracked: data/gencov.test.txt Untracked: data/gencov_zero.test.csv Untracked: data/gencov_zero.test.txt Untracked: data/gene_cov/ Untracked: data/joined Untracked: data/leafcutter/ Untracked: data/merged_combined_YL-SP-threeprimeseq.bg Untracked: data/mol_overlap/ Untracked: data/mol_pheno/ Untracked: data/nom_QTL/ Untracked: data/nom_QTL_opp/ Untracked: data/nom_QTL_trans/ Untracked: data/nuc6up/ Untracked: data/other_qtls/ Untracked: data/peakPerRefSeqGene/ Untracked: data/perm_QTL/ Untracked: data/perm_QTL_opp/ Untracked: data/perm_QTL_trans/ Untracked: data/reads_mapped_three_prime_seq.csv Untracked: data/smash.cov.results.bed Untracked: data/smash.cov.results.csv Untracked: data/smash.cov.results.txt Untracked: data/smash_testregion/ Untracked: data/ssFC200.cov.bed Untracked: data/temp.file1 Untracked: data/temp.file2 Untracked: data/temp.gencov.test.txt Untracked: data/temp.gencov_zero.test.txt Untracked: output/picard/ Untracked: output/plots/ Untracked: output/qual.fig2.pdf Unstaged changes: Modified: analysis/28ind.peak.explore.Rmd Modified: analysis/39indQC.Rmd Modified: analysis/cleanupdtseq.internalpriming.Rmd Modified: analysis/coloc_apaQTLs_protQTLs.Rmd Modified: analysis/dif.iso.usage.leafcutter.Rmd Modified: analysis/diff_iso_pipeline.Rmd Modified: analysis/explore.filters.Rmd Modified: analysis/flash2mash.Rmd Modified: analysis/overlapMolQTL.Rmd Modified: analysis/overlap_qtls.Rmd Modified: analysis/peakOverlap_oppstrand.Rmd Modified: analysis/pheno.leaf.comb.Rmd Modified: analysis/swarmPlots_QTLs.Rmd Modified: analysis/test.max2.Rmd Modified: code/Snakefile </code></pre> Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details> </li> </ul> <details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary> <ul> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> File </th> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Message </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/acf77f83b8c8ee87e9e8d4898adb929690701512/analysis/chromHmm_enrichment.Rmd" target="_blank">acf77f8</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-15 </td> <td style="text-align:left;"> add res from uniq analysis </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/89fa7d63b469f1816fa92765e1e5786ecfaa5900/docs/chromHmm_enrichment.html" target="_blank">89fa7d6</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/595cdc05302a238e9bfd56915be603e99a85ee8b/analysis/chromHmm_enrichment.Rmd" target="_blank">595cdc0</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> <td style="text-align:left;"> add code for uniq no sig permutation </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/086fbcc926b578d6d8b5a54523f7dbb7fad7e78c/docs/chromHmm_enrichment.html" target="_blank">086fbcc</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/1357eac2f73181ec1e6025eedbd790979a2601d0/analysis/chromHmm_enrichment.Rmd" target="_blank">1357eac</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> <td style="text-align:left;"> res from no sig analysis </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/ff8c9691a125898c9bf49f327c35186e5d2fc045/docs/chromHmm_enrichment.html" target="_blank">ff8c969</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-12 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/d08f3f9d5a0adb0917704ecd105ebe7830b237f9/analysis/chromHmm_enrichment.Rmd" target="_blank">d08f3f9</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-12 </td> <td style="text-align:left;"> fix code for no sig permutations </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/chromHmm_enrichment.html" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/ab79f33b0858f7059283b2764531a884b8a1c218/analysis/chromHmm_enrichment.Rmd" target="_blank">ab79f33</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> <td style="text-align:left;"> add enrichment analysis (messed up perm) </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/chromHmm_enrichment.html" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/70cf09c19343a0c37c200842b84edebe43324a4f/analysis/chromHmm_enrichment.Rmd" target="_blank">70cf09c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> <td style="text-align:left;"> add perm res </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/chromHmm_enrichment.html" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/962e39bcfc173365546499232b60d586f6470f38/analysis/chromHmm_enrichment.Rmd" target="_blank">962e39b</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> <td style="text-align:left;"> move chromhmm analysis to its own analysis </td> </tr> </tbody> </table> </ul> <p></details></p> <hr /> <p>Librarys</p> <pre class="r"><code>library(workflowr)</code></pre> <pre><code>This is workflowr version 1.1.1 Run ?workflowr for help getting started</code></pre> <pre class="r"><code>library(reshape2) library(tidyverse)</code></pre> <pre><code>── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──</code></pre> <pre><code>✔ ggplot2 3.0.0 ✔ purrr 0.2.5 ✔ tibble 1.4.2 ✔ dplyr 0.7.6 ✔ tidyr 0.8.1 ✔ stringr 1.3.1 ✔ readr 1.1.1 ✔ forcats 0.3.0</code></pre> <pre><code>── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag()</code></pre> <pre class="r"><code>library(VennDiagram)</code></pre> <pre><code>Loading required package: grid</code></pre> <pre><code>Loading required package: futile.logger</code></pre> <pre class="r"><code>library(data.table)</code></pre> <pre><code> Attaching package: 'data.table'</code></pre> <pre><code>The following objects are masked from 'package:dplyr': between, first, last</code></pre> <pre><code>The following object is masked from 'package:purrr': transpose</code></pre> <pre><code>The following objects are masked from 'package:reshape2': dcast, melt</code></pre> <pre class="r"><code>library(ggpubr)</code></pre> <pre><code>Loading required package: magrittr</code></pre> <pre><code> Attaching package: 'magrittr'</code></pre> <pre><code>The following object is masked from 'package:purrr': set_names</code></pre> <pre><code>The following object is masked from 'package:tidyr': extract</code></pre> <pre><code> Attaching package: 'ggpubr'</code></pre> <pre><code>The following object is masked from 'package:VennDiagram': rotate</code></pre> <pre class="r"><code>library(cowplot)</code></pre> <pre><code> Attaching package: 'cowplot'</code></pre> <pre><code>The following object is masked from 'package:ggpubr': get_legend</code></pre> <pre><code>The following object is masked from 'package:ggplot2': ggsave</code></pre> <p>I am continuing the analysis I started in the characterization of the APAqtl analysis. I need to run permutations to enrichment statistics.</p> <p>I created the significant SNP files in the <a href="characterizeTotalApaQtls.html">Characterize Total APAqtl analysis</a> analysis.</p> <pre class="r"><code>chromHmm=read.table("../data/ChromHmmOverlap/chromHMM_regions.txt", col.names = c("number", "name"), stringsAsFactors = F) NuclearOverlapHMM=read.table("../data/ChromHmmOverlap/Nuc_overlapHMM.bed", col.names=c("chrom", "start", "end", "sid", "significance", "strand", "number")) NuclearOverlapHMM$number=as.integer(NuclearOverlapHMM$number) NuclearOverlapHMM_names=NuclearOverlapHMM %>% left_join(chromHmm, by="number")</code></pre> <pre class="r"><code>NuclearOverlapHMM_names$number=as.character(NuclearOverlapHMM_names$number) ggplot(NuclearOverlapHMM_names, aes(x=number, fill=name)) + geom_bar() + labs(title="ChromHMM labels for Nuclear APAQtls" , y="Number of SNPs", x="Region")+theme(axis.text.x = element_text(angle = 90, hjust = 1))</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-3-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-3-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-3-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-3-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>Evaluate results for total:</p> <pre class="r"><code>TotalOverlapHMM=read.table("../data/ChromHmmOverlap/Tot_overlapHMM.bed", col.names=c("chrom", "start", "end", "sid", "significance", "strand", "number")) TotalOverlapHMM_names=TotalOverlapHMM %>% left_join(chromHmm, by="number")</code></pre> <pre class="r"><code>TotalOverlapHMM_names$number=as.character(TotalOverlapHMM_names$number) ggplot(TotalOverlapHMM_names, aes(x=number, fill=name)) + geom_bar() + labs(title="ChromHMM labels for Total APAQtls" , y="Number of SNPs", x="Region")+theme(axis.text.x = element_text(angle = 90, hjust = 1))</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-5-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-5-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-5-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-5-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <div id="pull-one-set-of-random-snps" class="section level2"> <h2>Pull one set of random snps:</h2> <p>I do still need to get 880 random snps.</p> <pre class="bash"><code>shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/randomSnps/ApaQTL_nuclear_Random880.txt </code></pre> <p>Run QTLNOMres2SigSNPbed.py with nuclear 880 and sort output</p> <pre class="bash"><code>import pybedtools RANDnuc=pybedtools.BedTool('/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/randomSnps/ApaQTL_nuclear_Random880.sort.bed') hmm=pybedtools.BedTool("/project2/gilad/briana/genome_anotation_data/GM12878.chromHMM.sort.bed") #map hmm to snps NucRnad_overlapHMM=RANDnuc.map(hmm, c=4) #save results NucRnad_overlapHMM.saveas("/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/randomSnps/ApaQTL_nuclear_Random_overlapHMM.bed") </code></pre> <pre class="r"><code>NuclearRandOverlapHMM=read.table("../data/ChromHmmOverlap/ApaQTL_nuclear_Random_overlapHMM.bed", col.names=c("chrom", "start", "end", "sid", "significance", "strand", "number")) NuclearRandOverlapHMM_names=NuclearRandOverlapHMM %>% left_join(chromHmm, by="number")</code></pre> <pre class="r"><code>ggplot(NuclearRandOverlapHMM_names, aes(x=name)) + geom_bar() + labs(title="ChromHMM labels for Nuclear APAQtls (Random)" , y="Number of SNPs", x="Region")+theme(axis.text.x = element_text(angle = 90, hjust = 1))</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-9-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-9-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>To put this on the same plot I can count the number in each then plot them next to eachother.</p> <pre class="r"><code>random_perChromHMM_nuc=NuclearRandOverlapHMM_names %>% group_by(name) %>% summarise(Random=n()) sig_perChromHMM_nuc= NuclearOverlapHMM_names %>% group_by(name) %>% summarise(Nuclear_QTLs=n()) perChrommHMM_nuc=random_perChromHMM_nuc %>% full_join(sig_perChromHMM_nuc, by="name", ) %>% replace_na(list(Random=0,Total_QTLs=0)) perChrommHMM_nuc_melt=melt(perChrommHMM_nuc, id.vars="name") names(perChrommHMM_nuc_melt)=c("Region","Set", "N_Snps" )</code></pre> <pre class="r"><code>chromenrichNuclearplot=ggplot(perChrommHMM_nuc_melt, aes(x=Region, y=N_Snps, by=Set, fill=Set)) + geom_bar(position="dodge", stat="identity") +theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title="Enrichment of Nuclear QTLs by chromatin region", y="Number of Snps", x="Chromatin Region") + scale_fill_brewer(palette="Paired") chromenrichNuclearplot</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-11-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-11-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-11-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggsave("../output/plots/ChromHmmEnrich_Nuclear.png", chromenrichNuclearplot)</code></pre> <pre><code>Saving 7 x 5 in image</code></pre> <div id="compare-enrichment-between-fractions" class="section level3"> <h3>Compare enrichment between fractions</h3> <p>I want to make a plot with the enrichment by fraction. I am first going to get an enrichemnt score for each bin naively by looking at the QTL/random in each category.</p> <pre class="r"><code>#perChrommHMM_nuc$Random= as.integer(perChrommHMM_nuc$Random) #perChrommHMM_nuc_enr=perChrommHMM_nuc %>% mutate(Nuclear=Nuclear_QTLs-Random) #perChrommHMM_tot_enr=read.table("../data/ChromHmmOverlap/perChrommHMM_Total_enr.txt",stringsAsFactors = F,header = T)</code></pre> <pre class="r"><code>#allenrich=perChrommHMM_tot_enr %>% inner_join(perChrommHMM_nuc_enr, by="name") %>% select(name, Total, Nuclear) #allenrich_melt=melt(allenrich, id.vars="name")</code></pre> <p>plot it</p> <pre class="r"><code>#chromenrichBoth=ggplot(allenrich_melt, aes(x=name, by=variable, y=value, fill=variable)) + geom_bar(stat="identity", position = "dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title="QTL-Random for each bin by fraction", y="Num QTL SNPs - Num Random SNPs") + scale_fill_manual(values=c("darkviolet", "deepskyblue3")) #ggsave("../output/plots/ChromHmmEnrich_BothFrac.png", chromenrichBoth)</code></pre> </div> </div> <div id="permutations" class="section level2"> <h2>Permutations</h2> <p>I want to permute the background snps so i can get a better expectation. To do this I need to chose random lines from the nominal file, change the lines to snp format, overlap with HMM, count how many are in each category, and append the list to a dataframe that is category by permuation.</p> <p>DO this for total first (118 snps)</p> <p>total_random118_chromHmm.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_f #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_f.out #SBATCH --error=total_random118_chromHmm_f.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {1..1000}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/randomRes_Total_118_${i}.txt done #make random for i in {1..1000}; do python randomRes2SNPbed.py Total 118 ${i} done #cat res together cat /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed/* > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed_all/randomRes_Total_118_ALLperm.bed #sort full file sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed_all/randomRes_Total_118_ALLperm.bed > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed_all/randomRes_Total_118_ALLperm.sort.bed #hmm overlap python overlap_chromHMM.py Total 118 1000 #Next I would pull this into R to do the group by and average! </code></pre> <p>pull_random_lines.py</p> <pre class="bash"><code>def main(inFile, outFile ,nsamp): nom_res= pd.read_csv(inFile, sep="\t", encoding="utf-8",header=None) out=open(outFile, "w") sample=nom_res.sample(nsamp) sample.to_csv(out, sep="\t", encoding='utf-8', index=False, header=F) out.close() if __name__ == "__main__": import sys import pandas as pd fraction = sys.argv[1] nsamp=sys.argv[2] nsamp=int(nsamp) iter=sys.argv[3] inFile = "/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_%s_NomRes.txt"%(fraction) outFile = "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/randomRes_%s_%d_%s.txt"%(fraction,fraction, nsamp, iter) main(inFile, outFile, nsamp)</code></pre> <p>randomRes2SNPbed.py</p> <pre class="bash"><code>def main(inFile, outFile): fout=open(outFile, "w") fin=open(inFile, "r") for ln in fin: pid, sid, dist, pval, slope = ln.split() chrom, pos= sid.split(":") name=sid start= int(pos)-1 end=int(pos) strand=pid.split(":")[3].split("_")[1] pval=float(pval) fout.write("%s\t%s\t%s\t%s\t%s\t%s\n"%(chrom, start, end, name, pval, strand)) fout.close() if __name__ == "__main__": import sys fraction=sys.argv[1] nsamp=sys.argv[2] nsamp=int(nsamp) iter=sys.argv[3] inFile = "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/randomRes_%s_%d_%s.txt"%(fraction,fraction, nsamp, iter) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/snp_bed/randomRes_%s_%d_%s.bed"%(fraction,fraction, nsamp, iter) main(inFile,outFile) </code></pre> <p>overlap_chromHMM.py</p> <pre class="bash"><code> def main(inFile, outFile): rand=pybedtools.BedTool(inFile) hmm=pybedtools.BedTool("/project2/gilad/briana/genome_anotation_data/GM12878.chromHMM.sort.bed") #map hmm to snps Rand_overlapHMM=rand.map(hmm, c=4) #save results Rand_overlapHMM.saveas(outFile) if __name__ == "__main__": import sys import pandas as pd import pybedtools fraction=sys.argv[1] nsamp=sys.argv[2] niter=sys.argv[3] inFile = "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/snp_bed_all/randomRes_%s_%s_ALLperm.sort.bed"%(fraction,fraction, nsamp) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/chromHMM_overlap/randomres_overlapChromHMM_%s_%s_%s.txt"%(fraction,fraction,nsamp, niter) main(inFile,outFile) </code></pre> <p>*Nuclear 880</p> <p>nuclear_random880_chromHmm.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=nuc_random880_chromHmm.out #SBATCH --error=nuc_random880_chromHmm.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {1..1000}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/randomRes_Nuclear_880_${i}.txt done #make random for i in {1..1000}; do python randomRes2SNPbed.py Nuclear 880 ${i} done #cat res together cat /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed/* > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.bed #sort full file sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.bed > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.sort.bed #hmm overlap python overlap_chromHMM.py Nuclear 880 1000 #Next I would pull this into R to do the group by and average! </code></pre> <p>Perm didnt finish: do this with less (824)</p> <p>nuclear_random880_chromHmm.sm.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_sm #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_sm.out #SBATCH --error=nuc_random880_chromHmm_sm.err #SBATCH --partition=bigmem2 #SBATCH --mem=100G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {1..824}; do python randomRes2SNPbed.py Nuclear 880 ${i} done #cat res together cat /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed/* > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.bed #sort full file sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.bed > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_all/randomRes_Nuclear_880_ALLperm.sort.bed #hmm overlap python overlap_chromHMM.py Nuclear 880 824</code></pre> <p>I need a way to make this more efficient to run 1000 permutations. Here I will look at the results from the 824 permutations.</p> <pre class="r"><code>nuclear_perm824= read.table("../data/ChromHmmOverlap/randomres_overlapChromHMM_Nuclear_880_824.txt", col.names=c("chrom", "start", "end", "sid", "significance", "strand", "number"),stringsAsFactors = F, na.strings = "NA") #924 snps are not annoated nuclear_perm824$number=as.integer(as.factor(nuclear_perm824$number)) nuclear_perm824_names=nuclear_perm824 %>% left_join(chromHmm, by="number") random_perChromHMM_nuc_PERM=nuclear_perm824_names %>% group_by(name) %>% summarise(Random=n()) %>% mutate(Random_perm=Random/824) %>% replace_na(list(name="No_annoation")) perChrommHMM_nuc_withPerm=random_perChromHMM_nuc_PERM %>% full_join(sig_perChromHMM_nuc, by="name" ) %>% replace_na(list(Random=0,Nuclear_QTLs=0)) %>% select(name,Random_perm, Nuclear_QTLs) perChrommHMM_nuc_withPerm_melt=melt(perChrommHMM_nuc_withPerm, id.vars="name") names(perChrommHMM_nuc_withPerm_melt)=c("Region","Set", "N_Snps" ) ggplot(perChrommHMM_nuc_withPerm_melt, aes(x=Region, y=N_Snps, by=Set, fill=Set)) + geom_bar(position="dodge", stat="identity") +theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title="Enrichment of Nuclear QTLs by chromatin region", y="Number of Snps", x="Chromatin Region") + scale_fill_brewer(palette="Paired")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-21-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-21-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-21-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>Enrichment is the actual/random:</p> <pre class="r"><code>perChrommHMM_nuc_withPerm_enrich = perChrommHMM_nuc_withPerm %>% mutate(Nuclear_Enrichment=(Nuclear_QTLs-Random_perm)/Random_perm, chiSq=(Nuclear_QTLs-Random_perm)^2/Random_perm) ggplot(perChrommHMM_nuc_withPerm_enrich, aes(x=name, y=Nuclear_Enrichment)) + geom_bar(stat="identity",fill="deepskyblue3")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title="ChromHMM Enrichment of Nuclear ApaQTLs \n over 824 Random Permuations", x="Region")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-22-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-22-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-22-1.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(perChrommHMM_nuc_withPerm_enrich, aes(x=name, y=chiSq)) + geom_bar(stat="identity",fill="deepskyblue3")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(title="ChromHMM ChiSq of Nuclear ApaQTLs \n over 824 Random Permuations", x="Region") </code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-22-2.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-22-2.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2ec5ffd37187beac4a5124fbb700b2ea52a1e75a/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-22-2.png" target="_blank">2ec5ffd</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>To parallelize this I will run the permutations in 4 bash scripts:</p> <p>nuc_random880_chromHmm_set1.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_set1 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_set1.out #SBATCH --error=nuc_random880_chromHmm_set1.err #SBATCH --partition=bigmem2 #SBATCH --mem=100G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {1..250}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/randomRes_Nuclear_880_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_set2.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_set2 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_set2.out #SBATCH --error=nuc_random880_chromHmm_set2.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {251..500}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/randomRes_Nuclear_880_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_set3.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_set3 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_set3.out #SBATCH --error=nuc_random880_chromHmm_set3.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {501..750}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/randomRes_Nuclear_880_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_set4.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_set4 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_set4.out #SBATCH --error=nuc_random880_chromHmm_set4.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {751..1000}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/randomRes_Nuclear_880_${i}.txt done </code></pre> <p>Same for total:</p> <p>total_random118_chromHmm_set1.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_set1 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_set1.out #SBATCH --error=total_random118_chromHmm_set1.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {1..250}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/randomRes_Total_118_${i}.txt done </code></pre> <p>total_random118_chromHmm_set2.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_set2 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_set2.out #SBATCH --error=total_random118_chromHmm_set2.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {251..500}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/randomRes_Total_118_${i}.txt done </code></pre> <p>total_random118_chromHmm_set4.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_set4 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_set4.out #SBATCH --error=total_random118_chromHmm_set4.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {751..1000}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/randomRes_Total_118_${i}.txt done </code></pre> <p>I want to turn each of these into snp files:</p> <p>randomLines2Snp.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=randomLines2Snp #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=randomLines2Snp.out #SBATCH --error=randomLines2Snp.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {1..1000}; do python randomRes2SNPbed.py Nuclear 880 ${i} done #make random for i in {1..1000}; do python randomRes2SNPbed.py Total 118 ${i} done </code></pre> <p>Next step is the overlap. I want this to run on each seperatly.</p> <p>sortRandomSnps.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=sortRandomSnps #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=sortRandomSnps.out #SBATCH --error=sortRandomSnps.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed/); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/snp_bed_sort/$i.sort.bed done for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed/); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/snp_bed_sort/$i.sort.bed done </code></pre> <p>Rewrite overlap with ChromHMM script to do it on each file seperatly.</p> <p>overlap_chromHMM_sepfiles.py</p> <pre class="bash"><code>def main(inFile, outFile): rand=pybedtools.BedTool(inFile) hmm=pybedtools.BedTool("/project2/gilad/briana/genome_anotation_data/GM12878.chromHMM.sort.bed") #map hmm to snps Rand_overlapHMM=rand.map(hmm, c=4) #save results Rand_overlapHMM.saveas(outFile) if __name__ == "__main__": import sys import pandas as pd import pybedtools fraction=sys.argv[1] nsamp=sys.argv[2] niter=sys.argv[3] #which itteration we are on inFile ="/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/snp_bed_sort/randomRes_%s_%s_%s.bed.sort.bed"%(fraction,fraction, nsamp, iter) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s/chromHMM_overlap/randomres_overlapChromHMM_%s_%s_%s.txt"%(fraction,fraction,nsamp, niter) main(inFile,outFile)</code></pre> <p>overlap_chromHMM_sepfiles.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=overlap_chromHMM_sepfiles #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=overlap_chromHMM_sepfiles.out #SBATCH --error=overlap_chromHMM_sepfiles.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do python overlap_chromHMM_sepfiles.py Nuclear 880 $i done for i in {1..1000}; do python overlap_chromHMM_sepfiles.py Total 118 $i done</code></pre> <p>I will next make an R script that will take in each file and perform the groupby command to get the number of snps in each group.</p> <p>groupRandomByChromHMM.R</p> <pre class="r"><code>#!/bin/rscripts # usage: groupRandomByChromHMM.R -f infile -o outfile #this file will take any of the itterations and output a file with chrom hmm number, name, numberof snps library(optparse) library(dplyr) library(tidyr) library(ggplot2) library(readr) option_list = list( make_option(c("-f", "--file"), action="store", default=NA, type='character', help="input coverage file"), make_option(c("-o", "--output"), action="store", default=NA, type='character', help="output file") ) opt_parser <- OptionParser(option_list=option_list) opt <- parse_args(opt_parser) #interrupt execution if no file is supplied if (is.null(opt$file)){ print_help(opt_parser) stop("Need input file", call.=FALSE) } if (is.null(opt$output)){ print_help(opt_parser) stop("Need output file", call.=FALSE) } randomSNPS=read.table(opt$file, col.names=c("chrom", "start", "end", "sid", "significance", "strand", "number"),stringsAsFactors = F, na.strings = "NA") hmm_names=read.table("/project2/gilad/briana/genome_anotation_data/chromHMM_regions.txt", col.names = c("number", "name"),stringsAsFactors=F) randomSNPS$number=as.integer(as.factor(randomSNPS$number)) randomSNPS_names= randomSNPS %>% left_join(hmm_names, by="number") #split the name of the file to get the iteration number fileSplit=strsplit(opt$file, "/")[[1]][10] iter.txt=strsplit(fileSplit, "_")[[1]][5] iter=substr(iter.txt, 1, nchar(iter.txt)-4) randomSNPS_names_grouped=randomSNPS_names %>% group_by(number) %>% summarise(!!iter:=n()) %>% replace_na(list(name="No_annotation")) %>% dplyr::select(number, !!iter) hmm_names$number=as.character(hmm_names$number) final=hmm_names %>% left_join(randomSNPS_names_grouped,by="number") write.table(final,opt$output,quote=FALSE, col.names = T, row.names = F)</code></pre> <p>groupRandomChromHMM.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=groupRandomChromHMM #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=groupRandomChromHMM.out #SBATCH --error=groupRandomChromHMM.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do Rscript groupRandomByChromHMM.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/chromHMM_overlap/randomres_overlapChromHMM_Nuclear_880_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/chromHMM_overlap_group/randomres_overlapChromHMM_Nuclear_880_${i}_grouped.txt done for i in {1..1000}; do Rscript groupRandomByChromHMM.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/chromHMM_overlap/randomres_overlapChromHMM_Total_118_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/chromHMM_overlap_group/randomres_overlapChromHMM_Total_118_${i}_grouped.txt done</code></pre> <p>Once I have the results I will paste the third column of each file together</p> <pre class="bash"><code>cut -d$' ' -f 1,2 randomres_overlapChromHMM_Nuclear_880_1_grouped.txt > Nuc_chromOverlap.txt for i in {1..1000}; do paste -d" " Nuc_chromOverlap.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Nuclear_880_${i}_grouped.txt) > tmp mv tmp Nuc_chromOverlap.txt done cut -d$' ' -f 1,2 randomres_overlapChromHMM_Total_118_99_grouped.txt> Tot_chromOverlap.txt for i in {1..1000}; do paste -d" " Tot_chromOverlap.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Total_118_${i}_grouped.txt) > tmp mv tmp Tot_chromOverlap.txt done </code></pre> <p>There will be NAs in this file. I will turn them into 0s when I bring it in R.</p> <p>Pull files onto computer:</p> <p>/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear/chromHMM_overlap_group/Nuc_chromOverlap.txt /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total/chromHMM_overlap_group/Tot_chromOverlap.txt</p> <pre class="r"><code>regions=c('Txn_Elongation','Weak_Txn','Repressed','Heterochrom/lo','Repetitive/CNV1','Repetitive/CNV2','Active_Promoter','Weak_Promoter','Poised_Promoter','Strong_Enhancer1','Strong_Enhancer2','Weak_Enhancer1','Weak_Enhancer2','Insulator','Txn_Transition') permutationResTotal=read.table("../data/ChromHmmOverlap/Tot_chromOverlap.txt", header=T, stringsAsFactors = F) permutationResTotal[is.na(permutationResTotal)] <- 0 permutationResTotal= as_data_frame(permutationResTotal) permutationResTotal_noName=permutationResTotal[,3:ncol(permutationResTotal)] totRand_mean=rowMeans(permutationResTotal_noName)/1000 permutationResNuclear=read.table("../data/ChromHmmOverlap/Nuc_chromOverlap.txt",header = T,stringsAsFactors = F) permutationResNuclear[is.na(permutationResNuclear)] <- 0 permutationResNuclear_noName=permutationResNuclear[,3:ncol(permutationResNuclear)] nucRand_mean=rowMeans(permutationResNuclear_noName)/1000</code></pre> <pre class="r"><code>allRand_mean_df= data.frame(cbind(regions,totRand_mean, nucRand_mean)) allRand_mean_df_melt=melt(allRand_mean_df, id.vars="regions")</code></pre> <pre><code>Warning: attributes are not identical across measure variables; they will be dropped</code></pre> <pre class="r"><code>allRand_mean_df_melt$value= as.numeric(allRand_mean_df_melt$value) ggplot(allRand_mean_df_melt, aes(y=value, x=regions, by=variable, fill=variable))+ geom_histogram(stat="identity", position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1))</code></pre> <pre><code>Warning: Ignoring unknown parameters: binwidth, bins, pad</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-38-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-38-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-38-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>I want to look at specific distributions:</p> <pre class="r"><code>permutationResTotal_melt= melt(permutationResTotal, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResTotal_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Total")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-40-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-40-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-40-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>For nuclear:</p> <pre class="r"><code>permutationResNuclear_melt= melt(permutationResNuclear, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResNuclear_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Nuclear")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-42-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-42-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-42-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>Try log scale:</p> <p>I want to add horizontal line where the actual QTL number is.</p> <pre class="r"><code>ggplot(permutationResTotal_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + scale_y_log10() + labs(x="random Snps in category", title="Random permutation Total")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 448 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-43-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-43-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-43-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-43-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(permutationResNuclear_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + scale_y_log10()+ labs(x="random Snps in category", title="Random permutation Nuclear")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 631 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-44-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-44-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-44-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-44-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>Try removing 0s:</p> <pre class="r"><code>permutationResTotal_melt_no0= permutationResTotal_melt %>% filter(value>0) ggplot(permutationResTotal_melt_no0, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number)+ scale_y_log10()+ labs(x="random Snps in category", title="Random permutation Total")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 407 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-45-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-1.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>permutationResNuclear_melt_no0= permutationResNuclear_melt %>% filter(value>0) ggplot(permutationResNuclear_melt_no0, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number)+ scale_y_log10()+ labs(x="random Snps in category", title="Random permutation Nuclear")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 630 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-2.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-45-2.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-2.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/19b98b3e20dcdad8fa0039ae754c9bef22447cdd/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-45-2.png" target="_blank">19b98b3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-07 </td> </tr> </tbody> </table> <p></details></p> <p>Look at enrichment by using the average</p> <pre class="r"><code>TotalPermMean=permutationResTotal_melt %>% group_by(number) %>% summarise(TotRandPerm=mean(value)) TotalPermMean$number=as.character(TotalPermMean$number) NuclearPermMean=permutationResNuclear_melt %>% group_by(number) %>% summarise(NucRandPerm=mean(value)) NuclearPermMean$number=as.character(NuclearPermMean$number)</code></pre> <p>Melt SNP values by name and number to get data in same format:</p> <pre class="r"><code>TotalOverlapHMM_names_melt=melt(TotalOverlapHMM_names, id.vars=c("number", "name"))%>% filter(variable=="sid") %>% group_by(number) %>% summarise(TotalQTL=n())</code></pre> <pre><code>Warning: attributes are not identical across measure variables; they will be dropped</code></pre> <pre class="r"><code>TotalOverlapHMM_names_melt$number=as.character(TotalOverlapHMM_names_melt$number) NuclearOverlapHMM_names_melt=melt(NuclearOverlapHMM_names, id.vars=c("number", "name")) %>% filter(variable=="sid") %>% group_by(number) %>% summarise(NucQTL=n())</code></pre> <pre><code>Warning: attributes are not identical across measure variables; they will be dropped</code></pre> <pre class="r"><code>NuclearOverlapHMM_names_melt$number=as.character(NuclearOverlapHMM_names_melt$number)</code></pre> <pre class="r"><code>chromHmm$number=as.character(chromHmm$number) TotalOverlapHMM_enrichment= TotalOverlapHMM_names_melt %>% full_join(TotalPermMean, by="number") %>% replace_na(list(TotalQTL=.00001)) %>% full_join(chromHmm, by="number") TotalOverlapHMM_enrichment$TotalQTL=as.double(TotalOverlapHMM_enrichment$TotalQTL) TotalOverlapHMM_enrichment = TotalOverlapHMM_enrichment %>% mutate(TotEnrich=(TotalQTL-TotRandPerm)/TotRandPerm) NuclearOverlapHMM_enrichment=NuclearOverlapHMM_names_melt %>% full_join(NuclearPermMean, by="number")%>% full_join(chromHmm, by="number") NuclearOverlapHMM_enrichment$NucQTL=as.double(NuclearOverlapHMM_enrichment$NucQTL) NuclearOverlapHMM_enrichment=NuclearOverlapHMM_enrichment %>%mutate(NucEnrich=(NucQTL-NucRandPerm)/NucRandPerm)</code></pre> <pre class="r"><code>ggplot(NuclearOverlapHMM_enrichment, aes(y=NucEnrich, x=number, fill=name)) + geom_bar(stat="identity")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-49-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-49-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-49-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(TotalOverlapHMM_enrichment, aes(y=TotEnrich, x=number, fill=name)) + geom_bar(stat="identity")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-49-2.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-49-2.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-49-2.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> </tbody> </table> <p></details></p> <p>Join together:</p> <pre class="r"><code>bothEnrich=NuclearOverlapHMM_enrichment %>% full_join(TotalOverlapHMM_enrichment, by=c("name", "number")) %>% select(number, name, NucEnrich,TotEnrich) bothEnrich_melt=melt(bothEnrich, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(bothEnrich_melt, aes(x=number, by=variable, fill=name, y=value,col=variable)) + geom_bar(position = "dodge", stat = "identity",alpha=.5) + scale_color_manual(values=c("darkviolet", "deepskyblue3")) + labs(y="Enrichment from 1000 permutations", title="ChromHMM enrichment for \nTotal and Nuclear ApaQTLs",x="Region")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-51-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-51-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-51-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> </tbody> </table> <p></details></p> <p>Look only at the interesting ones by subsetting:</p> <pre class="r"><code>bothEnrich_melt_filt=bothEnrich_melt %>% filter(str_detect(name,"Active_Promoter|Txn_Elongation|Weak_Txn|Heterochrom/lo|Weak_Promoter|Poised_Promoter|Txn_Transition")) ggplot(bothEnrich_melt_filt, aes(x=name, by=variable, fill=variable, y=value))+ geom_bar(position = "dodge", stat = "identity") + scale_fill_manual(values=c("deepskyblue3","darkviolet")) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(y="Enrichment", x="Category", title="ChromHMM categroies \n with oppositte Enrichemtn patterns")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-52-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-52-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/83f1b1419eb993f27c047d5d3a2355c4c84e24e1/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-52-1.png" target="_blank">83f1b14</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-08 </td> </tr> </tbody> </table> <p></details></p> <p>The bimodal distributions may come from including both the significant and non significant genes in the test set. I need to remove all of the lines that come from a gene with a significant peak.</p> <div id="remove-significant-genes" class="section level3"> <h3>Remove significant genes</h3> <pre class="r"><code>NucQTL_genes=read.table("../data/perm_QTL_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_transcript_permResBH.txt", stringsAsFactors = F, header=T) %>% mutate(sig=ifelse(-log10(bh)>=1, 1,0 )) %>% separate(pid, sep = ":", into=c("chr", "start", "end", "id")) %>% separate(id, sep = "_", into=c("gene", "strand", "peak")) %>% filter(sig==1) %>% select(gene) %>% distinct(gene) #715 genes #write this out as NucAPAGenes write.table(NucQTL_genes, "../data/perm_QTL_trans/NucApaGenes.txt", row.names = F, col.names = F, quote=F) TotQTL_genes=read.table("../data/perm_QTL_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_transcript_permResBH.txt", stringsAsFactors = F, header=T) %>% mutate(sig=ifelse(-log10(bh)>=1, 1,0 )) %>% separate(pid, sep = ":", into=c("chr", "start", "end", "id")) %>% separate(id, sep = "_", into=c("gene", "strand", "peak")) %>% filter(sig==1) %>% select(gene) %>% distinct(gene) #106 genes #write out as TotAPAGenes write.table(TotQTL_genes, "../data/perm_QTL_trans/TotApaGenes.txt", row.names = F, col.names = F, quote=F)</code></pre> <p>I need to find a way to get rid of these from the files I cam pulling from.</p> <pre class="bash"><code>/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt</code></pre> <p>I can create an python script to do this. I will need to seperate the first column and and only write the line out if the gene is in the apaGenes files I just created.</p> <p>filterSigGenes.py</p> <pre class="bash"><code>#python #genes with sig ApaQTL TotGenes=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/sig_genes/TotApaGenes.txt", "r") NucGenes=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/sig_genes/NucApaGenes.txt", "r") #nom res (with all snps tested) NucRes=open("/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Nuclear_NomRes.txt", "r") TotRes=open("/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/filtered_APApeaks_merged_allchrom_refseqGenes_pheno_Total_NomRes.txt", "r") #output files: Nuc_nonSig=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt", "w") Tot_nonSig=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt", "w") #convert genes to list def file_to_list(file): gene_list=[] for ln in file: gene=ln.strip() gene_list.append(gene) return(gene_list) Tot_gene_list=file_to_list(TotGenes) Nuc_gene_list=file_to_list(NucGenes) #function that will take in the input, the list, and the output. I want to be able to run this function for total and nuclear def filter(fin,fout, sigGenes): for ln in fin: gene=ln.split()[0].split(":")[3].split("_")[0] if gene not in sigGenes: fout.write(ln) fout.close() filter(NucRes,Nuc_nonSig,Nuc_gene_list) filter(TotRes, Tot_nonSig, Tot_gene_list) </code></pre> <p>Call this in a bash script:<br /> run_filterSigGenes.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=run_filterSigGenes #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=run_filterSigGenes.out #SBATCH --error=run_filterSigGenes.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env python filterSigGenes.py </code></pre> <p>nuc_random880_chromHmm_noSig_set1.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_noSig_set1 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_noSig_set1.out #SBATCH --error=nuc_random880_chromHmm_noSig_set1.err #SBATCH --partition=bigmem2 #SBATCH --mem=100G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {1..250}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/randomRes_Nuclear_880_noSig_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_noSig_set2.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_noSig_set2 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_noSig_set2.out #SBATCH --error=nuc_random880_chromHmm_noSig_set2.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {251..500}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/randomRes_Nuclear_880_noSig_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_noSig_set3.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_noSig_set3 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_noSig_set3.out #SBATCH --error=nuc_random880_chromHmm_noSig_set3.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {501..750}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/randomRes_Nuclear_880_noSig_${i}.txt done </code></pre> <p>nuc_random880_chromHmm_noSig_set4.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=nuc_random880_chromHmm_noSig_set4 #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=nuc_random880_chromHmm_noSig_set4.out #SBATCH --error=nuc_random880_chromHmm_noSig_set4.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {751..1000}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/randomRes_Nuclear_880_noSig_${i}.txt done </code></pre> <p>Same for total:</p> <p>total_random118_chromHmm_noSig_set1.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_noSig_set1 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_noSig_set1.out #SBATCH --error=total_random118_chromHmm_noSig_set1.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {1..250}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/randomRes_Total_118_noSig_${i}.txt done </code></pre> <p>total_random118_chromHmm_noSig_set2.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_noSig_set2 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_noSig_set2.out #SBATCH --error=total_random118_chromHmm_noSig_set2.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {251..500}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/randomRes_Total_118_noSig_${i}.txt done </code></pre> <p>total_random118_chromHmm_noSig_set3.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_noSig_set3 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_noSig_set3.out #SBATCH --error=total_random118_chromHmm_noSig_set3.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {501..750}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/randomRes_Total_118_noSig_${i}.txt done </code></pre> <p>total_random118_chromHmm_noSig_set4.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=total_random118_chromHmm_noSig_set4 #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=total_random118_chromHmm_noSig_set4.out #SBATCH --error=total_random118_chromHmm_noSig_set4.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #test with 2 permutations then make it 1000 #choose random res for i in {751..1000}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/randomRes_Total_118_noSig_${i}.txt done </code></pre> <p>This may not be enough. I may need to change this so it only has uniq snp. I could be sampling the same snps over an over.</p> <p>Make these files into snp bed files:</p> <p>randomRes2SNPbed_noSig.py</p> <pre class="bash"><code>def main(inFile, outFile): fout=open(outFile, "w") fin=open(inFile, "r") for ln in fin: pid, sid, dist, pval, slope = ln.split() chrom, pos= sid.split(":") name=sid start= int(pos)-1 end=int(pos) strand=pid.split(":")[3].split("_")[1] pval=float(pval) fout.write("%s\t%s\t%s\t%s\t%s\t%s\n"%(chrom, start, end, name, pval, strand)) fout.close() if __name__ == "__main__": import sys fraction=sys.argv[1] nsamp=sys.argv[2] nsamp=int(nsamp) iter=sys.argv[3] inFile = "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_noSig/randomRes_%s_%d_noSig_%s.txt"%(fraction,fraction, nsamp, iter) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_noSig/snp_bed_noSig/randomRes_%s_%d_noSig_%s.bed"%(fraction,fraction, nsamp, iter) main(inFile,outFile) </code></pre> <p>randomLines2Snp_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=randomLines2Snp_noSig #SBATCH --account=pi-gilad #SBATCH --time=36:00:00 #SBATCH --output=randomLines2Snp_noSig.out #SBATCH --error=randomLines2Snp_noSig.err #SBATCH --partition=gilad #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env #make random for i in {1..1000}; do python randomRes2SNPbed_noSig.py Nuclear 880 ${i} done #make random for i in {1..1000}; do python randomRes2SNPbed_noSig.py Total 118 ${i} done </code></pre> <p>sortRandomSnps_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=sortRandomSnps_noSig #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=sortRandomSnps_noSig.out #SBATCH --error=sortRandomSnps_noSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/snp_bed_noSig/); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/snp_bed_noSig/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/snp_bed_sort_noSig/$i.sort.bed done for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/snp_bed_noSig/); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/snp_bed_noSig/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/snp_bed_sort_noSig/$i.sort.bed done </code></pre> <p>overlap_chromHMM_sepfiles_noSig.py</p> <pre class="bash"><code>def main(inFile, outFile): rand=pybedtools.BedTool(inFile) hmm=pybedtools.BedTool("/project2/gilad/briana/genome_anotation_data/GM12878.chromHMM.sort.bed") #map hmm to snps Rand_overlapHMM=rand.map(hmm, c=4) #save results Rand_overlapHMM.saveas(outFile) if __name__ == "__main__": import sys import pandas as pd import pybedtools fraction=sys.argv[1] nsamp=sys.argv[2] niter=sys.argv[3] #which itteration we are on inFile ="/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_noSig/snp_bed_sort_noSig/randomRes_%s_%s_noSig_%s.bed.sort.bed"%(fraction,fraction, nsamp, niter) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_noSig/chromHMM_overlap/randomres_overlapChromHMM_%s_%s_noSig_%s.txt"%(fraction,fraction,nsamp, niter) main(inFile,outFile)</code></pre> <p>overlap_chromHMM_sepfiles_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=overlap_chromHMM_sepfiles_noSig #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=overlap_chromHMM_sepfiles_noSig.out #SBATCH --error=overlap_chromHMM_sepfiles_noSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do python overlap_chromHMM_sepfiles_noSig.py Nuclear 880 $i done for i in {1..1000}; do python overlap_chromHMM_sepfiles_noSig.py Total 118 $i done</code></pre> <p>groupRandomChromHMM_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=groupRandomChromHMM_noSig #SBATCH --account=pi-yangili1 #SBATCH --time=5:00:00 #SBATCH --output=groupRandomChromHMM.out #SBATCH --error=groupRandomChromHMM.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do Rscript groupRandomByChromHMM.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/chromHMM_overlap/randomres_overlapChromHMM_Nuclear_880_noSig_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_noSig/chromHMM_overlap_group/randomres_overlapChromHMM_Nuclear_880_noSig_${i}_grouped.txt done for i in {1..1000}; do Rscript groupRandomByChromHMM.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/chromHMM_overlap/randomres_overlapChromHMM_Total_118_noSig_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_noSig/chromHMM_overlap_group/randomres_overlapChromHMM_Total_118_noSig_${i}_grouped.txt done </code></pre> <pre class="bash"><code>cut -d$' ' -f 1,2 randomres_overlapChromHMM_Nuclear_880_noSig_549_grouped.txt > Nuc_chromOverlap_noSig.txt for i in {1..1000}; do paste -d" " Nuc_chromOverlap_noSig.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Nuclear_880_noSig_${i}_grouped.txt) > tmp mv tmp Nuc_chromOverlap_noSig.txt done cut -d$' ' -f 1,2 randomres_overlapChromHMM_Total_118_noSig_53_grouped.txt > Tot_chromOverlap_noSig.txt for i in {1..1000}; do paste -d" " Tot_chromOverlap_noSig.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Total_118_noSig_${i}_grouped.txt) > tmp mv tmp Tot_chromOverlap_noSig.txt done</code></pre> <p>These files are not on my computer so I can work with them.</p> <pre class="r"><code>permutationResTotal_noSig=read.table("../data/ChromHmmOverlap/Tot_chromOverlap_noSig.txt", header=T, stringsAsFactors = F) permutationResTotal_noSig[is.na(permutationResTotal_noSig)] <- 0 permutationResTotal_noSig= as_data_frame(permutationResTotal_noSig) permutationResTotal_noSig_noName=permutationResTotal_noSig[,3:ncol(permutationResTotal_noSig)] totRand_mean_noSig=rowMeans(permutationResTotal_noSig_noName)/1000 permutationResNuclear_noSig=read.table("../data/ChromHmmOverlap/Nuc_chromOverlap_noSig.txt",header = T,stringsAsFactors = F) permutationResNuclear_noSig[is.na(permutationResNuclear_noSig)] <- 0 permutationResNuclear_noSig_noName=permutationResNuclear_noSig[,3:ncol(permutationResNuclear_noSig)] nucRand_mean_noSig=rowMeans(permutationResNuclear_noSig_noName)/1000</code></pre> <pre class="r"><code>permutationResTotal_noSig_melt= melt(permutationResTotal_noSig, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResTotal_noSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Total")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-74-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-74-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/086fbcc926b578d6d8b5a54523f7dbb7fad7e78c/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-74-1.png" target="_blank">086fbcc</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> </tr> </tbody> </table> <p></details></p> <p>For nuclear:</p> <pre class="r"><code>permutationResNuclear_noSig_melt= melt(permutationResNuclear_noSig, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResNuclear_noSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Nuclear")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-76-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-76-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/086fbcc926b578d6d8b5a54523f7dbb7fad7e78c/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-76-1.png" target="_blank">086fbcc</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(permutationResNuclear_noSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + scale_y_log10()+ labs(x="random Snps in category", title="Random permutation Nuclear")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 630 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-77-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-77-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/086fbcc926b578d6d8b5a54523f7dbb7fad7e78c/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-77-1.png" target="_blank">086fbcc</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(permutationResTotal_noSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + scale_y_log10()+ labs(x="random Snps in category", title="Random permutation Total")</code></pre> <pre><code>Warning: Transformation introduced infinite values in continuous y-axis</code></pre> <pre><code>Warning: Removed 445 rows containing missing values (geom_bar).</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-77-2.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-77-2.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/086fbcc926b578d6d8b5a54523f7dbb7fad7e78c/docs/figure/chromHmm_enrichment.Rmd/unnamed-chunk-77-2.png" target="_blank">086fbcc</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-11-13 </td> </tr> </tbody> </table> <p></details></p> </div> <div id="permute-only-from-unique-snps-tested" class="section level3"> <h3>Permute only from unique snps tested</h3> <p>This didnt help. I need to choose from the unique snps rather than counting if they were tested multiple times. I will look for the snps tested and get rid of those called as QTLs.</p> <p>Steps:</p> <ul> <li>get total and nuclear snps tested</li> </ul> <p>I need to do this from the nominal results in /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/. The snp ID is in the second column</p> <p>run_testedSnps_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=run_testedSnps_noSig. #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=run_testedSnps_noSig.out #SBATCH --error=run_testedSnps_noSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env cut -f2 -d" " /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/NucTestedSnps_nonSigGenes.txt | sort | uniq > /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/uniq_tested_Nuclear.txt cut -f2 -d" " /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/nomRes_nonsig/TotTestedSnps_nonSigGenes.txt | sort | uniq > /project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/uniq_tested_Total.txt</code></pre> <ul> <li>remove QTL snps</li> </ul> <p>The significant QTL snps are:</p> <p>/project2/gilad/briana/threeprimeseq/data/GWAS_overlap/ApaQTLsigSNPpos_Nuclear.txt /project2/gilad/briana/threeprimeseq/data/GWAS_overlap/ApaQTLsigSNPpos_Total.txt</p> <p>I can use python to remove the snps from the full lists:</p> <p>I can write it as a bed file to make the step easier.</p> <p>testedSnps_noSig.py</p> <pre class="bash"><code> total_qtl=open('/project2/gilad/briana/threeprimeseq/data/GWAS_overlap/ApaQTLsigSNPpos_Total.txt', "r") nuclear_qtl=open('/project2/gilad/briana/threeprimeseq/data/GWAS_overlap/ApaQTLsigSNPpos_Total.txt', "r") def file_to_list(file): snp_list=[] for ln in file: snp=ln.strip() snp_list.append(snp) return(snp_list) total_qtl_list= file_to_list(total_qtl) nuclear_qtl_list= file_to_list(nuclear_qtl) total_out=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/uniqNonSigSnps/TotalUniqTestedSnp.bed","w") for ln in open("/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/uniq_tested_Total.txt"): snp=ln.strip() if snp not in total_qtl_list: chrom, pos =snp.split(":") start=int(pos)-1 end=int(pos) total_out.write("%s\t%d\t%d\t%s\n"%(chrom, start, end, snp)) total_out.close() nuclear_out=open("/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/uniqNonSigSnps/NuclearUniqTestedSnp.bed","w") for ln in open("/project2/gilad/briana/threeprimeseq/data/nominal_APAqtl_trans/uniq_tested_Nuclear.txt"): snp=ln.strip() if snp not in total_qtl_list: chrom, pos =snp.split(":") start=int(pos)-1 end=int(pos) nuclear_out.write("%s\t%d\t%d\t%s\n"%(chrom, start, end, snp)) nuclear_out.close() </code></pre> <p>I will probablly need to run this in a bash script.</p> <p>run_testedSnps2bed.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=run_testedSnps2bed.sh #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=run_testedSnps2bed.out #SBATCH --error=run_testedSnps2bed.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env python testedSnps_noSig.py </code></pre> <ul> <li>select correct number of snps 1000 times (permutation) This file is not as big. I may be able to run all of the permutations at once:<br /> random_UniqNoSig.sh</li> </ul> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=random_UniqNoSig #SBATCH --account=pi-yangili1 #SBATCH --time=36:00:00 #SBATCH --output=random_UniqNoSig.out #SBATCH --error=random_UniqNoSig.err #SBATCH --partition=bigmem2 #SBATCH --mem=200G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do shuf -n 118 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/uniqNonSigSnps/TotalUniqTestedSnp.bed > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/snp_bed/randomRes_Total_118_UniqnoSig_${i}.bed done for i in {1..1000}; do shuf -n 880 /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/uniqNonSigSnps/NuclearUniqTestedSnp.bed > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/snp_bed/randomRes_Nuclear_880_UniqnoSig_${i}.bed done</code></pre> <ul> <li>sort bed files</li> </ul> <p>sortRandomUniqSnps_noSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=sortRandomUniqSnps_noSig #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=sortRandomUniqSnps_noSig.out #SBATCH --error=sortRandomUniqSnps_noSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/snp_bed); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/snp_bed/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/snp_bed_sort_noSig/$i.sort.bed done for i in $(ls /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/snp_bed); do sort -k1,1 -k2,2n /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/snp_bed/$i > /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/snp_bed_sort_noSig/$i.sort.bed done </code></pre> <ul> <li>overlap each with chrom HMM</li> </ul> <p>overlap_chromHMM_sepfiles_UniqnoSig.py</p> <pre class="bash"><code>def main(inFile, outFile): rand=pybedtools.BedTool(inFile) hmm=pybedtools.BedTool("/project2/gilad/briana/genome_anotation_data/GM12878.chromHMM.sort.bed") #map hmm to snps Rand_overlapHMM=rand.map(hmm, c=4) #save results Rand_overlapHMM.saveas(outFile) if __name__ == "__main__": import sys import pandas as pd import pybedtools fraction=sys.argv[1] nsamp=sys.argv[2] niter=sys.argv[3] #which itteration we are on inFile ="/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_Uniq_noSig/snp_bed_sort_noSig/randomRes_%s_%s_UniqnoSig_%s.bed.sort.bed"%(fraction,fraction, nsamp, niter) outFile= "/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/%s_Uniq_noSig/chromHMM_overlap/randomres_overlapChromHMM_%s_%s_UniqnoSig_%s.txt"%(fraction,fraction,nsamp, niter) main(inFile,outFile)</code></pre> <p>overlap_chromHMM_sepfiles_UniqnoSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=overlap_chromHMM_sepfiles_UniqnoSig #SBATCH --account=pi-yangili1 #SBATCH --time=10:00:00 #SBATCH --output=overlap_chromHMM_sepfiles_UniqnoSig.out #SBATCH --error=overlap_chromHMM_sepfiles_UniqnoSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do python overlap_chromHMM_sepfiles_UniqnoSig.py Nuclear 880 $i done for i in {1..1000}; do python overlap_chromHMM_sepfiles_UniqnoSig.py Total 118 $i done</code></pre> <ul> <li>group the chrom HMM calls</li> </ul> <p>I need to modify groupRandomByChromHMM_uniq.R</p> <pre class="r"><code>#!/bin/rscripts # usage: groupRandomByChromHMM_uniq.R -f infile -o outfile #this file will take any of the itterations and output a file with chrom hmm number, name, numberof snps library(optparse) library(dplyr) library(tidyr) library(ggplot2) library(readr) option_list = list( make_option(c("-f", "--file"), action="store", default=NA, type='character', help="input coverage file"), make_option(c("-o", "--output"), action="store", default=NA, type='character', help="output file") ) opt_parser <- OptionParser(option_list=option_list) opt <- parse_args(opt_parser) #interrupt execution if no file is supplied if (is.null(opt$file)){ print_help(opt_parser) stop("Need input file", call.=FALSE) } if (is.null(opt$output)){ print_help(opt_parser) stop("Need output file", call.=FALSE) } randomSNPS=read.table(opt$file, col.names=c("chrom", "start", "end", "sid", "number"),stringsAsFactors = F, na.strings = "NA") hmm_names=read.table("/project2/gilad/briana/genome_anotation_data/chromHMM_regions.txt", col.names = c("number", "name"),stringsAsFactors=F) randomSNPS$number=as.character(randomSNPS$number) hmm_names$number=as.character(hmm_names$number) randomSNPS_names= randomSNPS %>% left_join(hmm_names, by="number") #split the name of the file to get the iteration number fileSplit=strsplit(opt$file, "/")[[1]][10] iter.txt=strsplit(fileSplit, "_")[[1]][5] iter=substr(iter.txt, 1, nchar(iter.txt)-4) randomSNPS_names_grouped=randomSNPS_names %>% group_by(number) %>% summarise(!!iter:=n()) %>% replace_na(list(name="No_annotation")) %>% dplyr::select(number, !!iter) hmm_names$number=as.character(hmm_names$number) final=hmm_names %>% left_join(randomSNPS_names_grouped,by="number") write.table(final,opt$output,quote=FALSE, col.names = T, row.names = F)</code></pre> <p>groupRandomChromHMM_UniqnoSig.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=groupRandomChromHMM_UniqnoSig #SBATCH --account=pi-yangili1 #SBATCH --time=5:00:00 #SBATCH --output=groupRandomChromHMM_UniqnoSig.out #SBATCH --error=groupRandomChromHMM_UniqnoSig.err #SBATCH --partition=broadwl #SBATCH --mem=50G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env for i in {1..1000}; do Rscript groupRandomByChromHMM_uniq.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/chromHMM_overlap/randomres_overlapChromHMM_Nuclear_880_UniqnoSig_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/chromHMM_overlap_group/randomres_overlapChromHMM_Nuclear_880_UniqnoSig_${i}_grouped.txt done for i in {1..1000}; do Rscript groupRandomByChromHMM_uniq.R -f /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/chromHMM_overlap/randomres_overlapChromHMM_Total_118_UniqnoSig_${i}.txt -o /project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/chromHMM_overlap_group/randomres_overlapChromHMM_Total_118_UniqnoSig_${i}_grouped.txt done </code></pre> <ul> <li>make the final matrix</li> </ul> <pre class="bash"><code>#/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Nuclear_Uniq_noSig/chromHMM_overlap_group/ cut -d$' ' -f 1,2 randomres_overlapChromHMM_Nuclear_880_UniqnoSig_549_grouped.txt > Nuc_chromOverlap_UniqnoSig.txt for i in {1..1000}; do paste -d" " Nuc_chromOverlap_UniqnoSig.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Nuclear_880_UniqnoSig_${i}_grouped.txt) > tmp mv tmp Nuc_chromOverlap_UniqnoSig.txt done #/project2/gilad/briana/threeprimeseq/data/random_QTLsnps/Total_Uniq_noSig/chromHMM_overlap_group/ cut -d$' ' -f 1,2 randomres_overlapChromHMM_Total_118_UniqnoSig_53_grouped.txt > Tot_chromOverlap_UniqnoSig.txt for i in {1..1000}; do paste -d" " Tot_chromOverlap_UniqnoSig.txt <(cut -d" " -f 3 randomres_overlapChromHMM_Total_118_UniqnoSig_${i}_grouped.txt) > tmp mv tmp Tot_chromOverlap_UniqnoSig.txt done</code></pre> <pre class="r"><code>permutationResTotal_UniqnoSig=read.table("../data/ChromHmmOverlap/Tot_chromOverlap_UniqnoSig.txt", header=T, stringsAsFactors = F) permutationResTotal_UniqnoSig[is.na(permutationResTotal_UniqnoSig)] <- 0 permutationResTotal_UniqnoSig= as_data_frame(permutationResTotal_UniqnoSig) permutationResTotal_UniqnoSig_noName=permutationResTotal_UniqnoSig[,3:ncol(permutationResTotal_UniqnoSig)] totRand_mean_UniqnoSig=rowMeans(permutationResTotal_UniqnoSig_noName)/1000 permutationResNuclear_UniqnoSig=read.table("../data/ChromHmmOverlap/Nuc_chromOverlap_UniqnoSig.txt",header = T,stringsAsFactors = F) permutationResNuclear_UniqnoSig[is.na(permutationResNuclear_UniqnoSig)] <- 0 permutationResNuclear_UniqnoSig_noName=permutationResNuclear_UniqnoSig[,3:ncol(permutationResNuclear_UniqnoSig)] nucRand_mean_UniqnoSig=rowMeans(permutationResNuclear_UniqnoSig_noName)/1000</code></pre> <pre class="r"><code>permutationResTotal_UniqnoSig_melt= melt(permutationResTotal_UniqnoSig, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResTotal_UniqnoSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Total")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-90-1.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>permutationResNuclear_UniqnoSig_melt= melt(permutationResNuclear_UniqnoSig, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(permutationResNuclear_UniqnoSig_melt, aes(x=value,fill=name)) + geom_histogram(bins=50) + facet_wrap(~number) + labs(x="N random Snps in category", title="Random permutation Total")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-92-1.png" width="672" style="display: block; margin: auto;" /></p> <p>This is the model I will move forward with.</p> <p>Plot the enrichment:</p> <p>Look at enrichment by using the average</p> <pre class="r"><code>TotalPermMean_uniq=permutationResTotal_UniqnoSig_melt %>% group_by(number) %>% summarise(TotRandPerm=mean(value), TotRandPermSD=sd(value)) TotalPermMean_uniq$number=as.character(TotalPermMean_uniq$number) NuclearPermMean_uniq=permutationResNuclear_UniqnoSig_melt %>% group_by(number) %>% summarise(NucRandPerm=mean(value),NucRandPermSD=sd(value)) NuclearPermMean_uniq$number=as.character(NuclearPermMean_uniq$number)</code></pre> <p>Melt SNP values by name and number to get data in same format. I already did this above.</p> <pre class="r"><code>TotalOverlapHMM_names_melt=melt(TotalOverlapHMM_names, id.vars=c("number", "name"))%>% filter(variable=="sid") %>% group_by(number) %>% summarise(TotalQTL=n()) TotalOverlapHMM_names_melt$number=as.character(TotalOverlapHMM_names_melt$number) NuclearOverlapHMM_names_melt=melt(NuclearOverlapHMM_names, id.vars=c("number", "name")) %>% filter(variable=="sid") %>% group_by(number) %>% summarise(NucQTL=n()) NuclearOverlapHMM_names_melt$number=as.character(NuclearOverlapHMM_names_melt$number)</code></pre> <pre class="r"><code>chromHmm$number=as.character(chromHmm$number) TotalOverlapHMM_Uniq_enrichment= TotalOverlapHMM_names_melt %>% full_join(TotalPermMean_uniq, by="number") %>% replace_na(list(TotalQTL=.00001)) %>% full_join(chromHmm, by="number") TotalOverlapHMM_Uniq_enrichment$TotalQTL=as.double(TotalOverlapHMM_Uniq_enrichment$TotalQTL) TotalOverlapHMM_Uniq_enrichment = TotalOverlapHMM_Uniq_enrichment %>% mutate(TotEnrich=(TotalQTL-TotRandPerm)/TotRandPermSD) NuclearOverlapHMM_Uniq_enrichment=NuclearOverlapHMM_names_melt %>% full_join(NuclearPermMean_uniq, by="number")%>% full_join(chromHmm, by="number") NuclearOverlapHMM_Uniq_enrichment$NucQTL=as.double(NuclearOverlapHMM_Uniq_enrichment$NucQTL) NuclearOverlapHMM_Uniq_enrichment=NuclearOverlapHMM_Uniq_enrichment %>%mutate(NucEnrich=(NucQTL-NucRandPerm)/NucRandPermSD)</code></pre> <pre class="r"><code>ggplot(NuclearOverlapHMM_Uniq_enrichment, aes(y=NucEnrich, x=number, fill=name)) + geom_bar(stat="identity") + labs(title="Nuclear ApaQTL enrichment Z score", y="Enrichment Z score")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-96-1.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>ggplot(TotalOverlapHMM_Uniq_enrichment, aes(y=TotEnrich, x=number, fill=name)) + geom_bar(stat="identity")+ labs(title="Total ApaQTL enrichment Z score", y="Enrichment Z score")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-96-2.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>bothEnrich_uniq=NuclearOverlapHMM_Uniq_enrichment %>% full_join(TotalOverlapHMM_Uniq_enrichment, by=c("name", "number")) %>% select(number, name, NucEnrich,TotEnrich) bothEnrich_uniqmelt=melt(bothEnrich_uniq, id.vars=c("number", "name"))</code></pre> <pre class="r"><code>ggplot(bothEnrich_uniqmelt, aes(x=number, by=variable, fill=variable, y=value,col=name)) + geom_bar(position = "dodge", stat = "identity",alpha=.5) + scale_fill_manual(values=c("darkviolet", "deepskyblue3")) + labs(y="Enrichment from 1000 permutations", title="ChromHMM enrichment for \nTotal and Nuclear ApaQTLs",x="Region")</code></pre> <p><img src="figure/chromHmm_enrichment.Rmd/unnamed-chunk-98-1.png" width="672" style="display: block; margin: auto;" /></p> </div> </div> <div id="session-information" class="section level2"> <h2>Session information</h2> <pre class="r"><code>sessionInfo()</code></pre> <pre><code>R version 3.5.1 (2018-07-02) Platform: x86_64-apple-darwin15.6.0 (64-bit) Running under: macOS Sierra 10.12.6 Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 attached base packages: [1] grid stats graphics grDevices utils datasets methods [8] base other attached packages: [1] bindrcpp_0.2.2 cowplot_0.9.3 ggpubr_0.1.8 [4] magrittr_1.5 data.table_1.11.8 VennDiagram_1.6.20 [7] futile.logger_1.4.3 forcats_0.3.0 stringr_1.3.1 [10] dplyr_0.7.6 purrr_0.2.5 readr_1.1.1 [13] tidyr_0.8.1 tibble_1.4.2 ggplot2_3.0.0 [16] tidyverse_1.2.1 reshape2_1.4.3 workflowr_1.1.1 loaded via a namespace (and not attached): [1] tidyselect_0.2.4 haven_1.1.2 lattice_0.20-35 [4] colorspace_1.3-2 htmltools_0.3.6 yaml_2.2.0 [7] rlang_0.2.2 R.oo_1.22.0 pillar_1.3.0 [10] glue_1.3.0 withr_2.1.2 R.utils_2.7.0 [13] RColorBrewer_1.1-2 lambda.r_1.2.3 modelr_0.1.2 [16] readxl_1.1.0 bindr_0.1.1 plyr_1.8.4 [19] munsell_0.5.0 gtable_0.2.0 cellranger_1.1.0 [22] rvest_0.3.2 R.methodsS3_1.7.1 evaluate_0.11 [25] labeling_0.3 knitr_1.20 broom_0.5.0 [28] Rcpp_0.12.19 formatR_1.5 backports_1.1.2 [31] scales_1.0.0 jsonlite_1.5 hms_0.4.2 [34] digest_0.6.17 stringi_1.2.4 rprojroot_1.3-2 [37] cli_1.0.1 tools_3.5.1 lazyeval_0.2.1 [40] futile.options_1.0.1 crayon_1.3.4 whisker_0.3-2 [43] pkgconfig_2.0.2 xml2_1.2.0 lubridate_1.7.4 [46] assertthat_0.2.0 rmarkdown_1.10 httr_1.3.1 [49] rstudioapi_0.8 R6_2.3.0 nlme_3.1-137 [52] git2r_0.23.0 compiler_3.5.1 </code></pre> </div> <hr> <p> </p> <hr> <!-- To enable disqus, uncomment the section below and provide your disqus_shortname --> <!-- disqus <div id="disqus_thread"></div> <script type="text/javascript"> /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */ var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname /* * * DON'T EDIT BELOW THIS LINE * * */ (function() { var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); })(); </script> <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript> <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a> --> <!-- Adjust MathJax settings so that all math formulae are shown using TeX fonts only; see http://docs.mathjax.org/en/latest/configuration.html. This will make the presentation more consistent at the cost of the webpage sometimes taking slightly longer to load. Note that this only works because the footer is added to webpages before the MathJax javascript. --> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <hr> <p> This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a> analysis was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1 </p> <hr> </div> </div> </div> <script> // add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); }); </script> <!-- dynamically load mathjax for compatibility with self-contained --> <script> (function () { var script = document.createElement("script"); script.type = "text/javascript"; script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })(); </script> </body> </html>