<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta charset="utf-8" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta name="generator" content="pandoc" /> <meta name="author" content="Briana Mittleman" /> <meta name="date" content="2018-12-11" /> <title>Investigate Peak to Gene Assignment</title> <script src="site_libs/jquery-1.11.3/jquery.min.js"></script> <meta name="viewport" content="width=device-width, initial-scale=1" /> <link href="site_libs/bootstrap-3.3.5/css/journal.min.css" rel="stylesheet" /> <script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script> <script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script> <script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script> <link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" /> <script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script> <script src="site_libs/navigation-1.1/tabsets.js"></script> <link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" /> <script src="site_libs/highlightjs-9.12.0/highlight.js"></script> <link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" /> <script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script> <script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script> <style type="text/css">code{white-space: pre;}</style> <style type="text/css"> pre:not([class]) { background-color: white; } </style> <script type="text/javascript"> if (window.hljs) { hljs.configure({languages: []}); hljs.initHighlightingOnLoad(); if (document.readyState && document.readyState === "complete") { window.setTimeout(function() { hljs.initHighlighting(); }, 0); } } </script> <style type="text/css"> h1 { font-size: 34px; } h1.title { font-size: 38px; } h2 { font-size: 30px; } h3 { font-size: 24px; } h4 { font-size: 18px; } h5 { font-size: 16px; } h6 { font-size: 12px; } .table th:not([align]) { text-align: left; } </style> </head> <body> <style type = "text/css"> .main-container { max-width: 940px; margin-left: auto; margin-right: auto; } code { color: inherit; background-color: rgba(0, 0, 0, 0.04); } img { max-width:100%; height: auto; } .tabbed-pane { padding-top: 12px; } .html-widget { margin-bottom: 20px; } button.code-folding-btn:focus { outline: none; } </style> <style type="text/css"> /* padding for bootstrap navbar */ body { padding-top: 51px; padding-bottom: 40px; } /* offset scroll position for anchor links (for fixed navbar) */ .section h1 { padding-top: 56px; margin-top: -56px; } .section h2 { padding-top: 56px; margin-top: -56px; } .section h3 { padding-top: 56px; margin-top: -56px; } .section h4 { padding-top: 56px; margin-top: -56px; } .section h5 { padding-top: 56px; margin-top: -56px; } .section h6 { padding-top: 56px; margin-top: -56px; } </style> <script> // manage active state of menu based on current page $(document).ready(function () { // active menu anchor href = window.location.pathname href = href.substr(href.lastIndexOf('/') + 1) if (href === "") href = "index.html"; var menuAnchor = $('a[href="' + href + '"]'); // mark it active menuAnchor.parent().addClass('active'); // if it's got a parent navbar menu mark it active as well menuAnchor.closest('li.dropdown').addClass('active'); }); </script> <div class="container-fluid main-container"> <!-- tabsets --> <script> $(document).ready(function () { window.buildTabsets("TOC"); }); </script> <!-- code folding --> <script> $(document).ready(function () { // move toc-ignore selectors from section div to header $('div.section.toc-ignore') .removeClass('toc-ignore') .children('h1,h2,h3,h4,h5').addClass('toc-ignore'); // establish options var options = { selectors: "h1,h2,h3", theme: "bootstrap3", context: '.toc-content', hashGenerator: function (text) { return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase(); }, ignoreSelector: ".toc-ignore", scrollTo: 0 }; options.showAndHide = true; options.smoothScroll = true; // tocify var toc = $("#TOC").tocify(options).data("toc-tocify"); }); </script> <style type="text/css"> #TOC { margin: 25px 0px 20px 0px; } @media (max-width: 768px) { #TOC { position: relative; width: 100%; } } .toc-content { padding-left: 30px; padding-right: 40px; } div.main-container { max-width: 1200px; } div.tocify { width: 20%; max-width: 260px; max-height: 85%; } @media (min-width: 768px) and (max-width: 991px) { div.tocify { width: 25%; } } @media (max-width: 767px) { div.tocify { width: 100%; max-width: none; } } .tocify ul, .tocify li { line-height: 20px; } .tocify-subheader .tocify-item { font-size: 0.90em; padding-left: 25px; text-indent: 0; } .tocify .list-group-item { border-radius: 0px; } </style> <!-- setup 3col/9col grid for toc_float and main content --> <div class="row-fluid"> <div class="col-xs-12 col-sm-4 col-md-3"> <div id="TOC" class="tocify"> </div> </div> <div class="toc-content col-xs-12 col-sm-8 col-md-9"> <div class="navbar navbar-default navbar-fixed-top" role="navigation"> <div class="container"> <div class="navbar-header"> <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar"> <span class="icon-bar"></span> <span class="icon-bar"></span> <span class="icon-bar"></span> </button> <a class="navbar-brand" href="index.html">Three Prime Sequencing in Human LCLs</a> </div> <div id="navbar" class="navbar-collapse collapse"> <ul class="nav navbar-nav"> <li> <a href="index.html">Home</a> </li> <li> <a href="about.html">About</a> </li> <li> <a href="license.html">License</a> </li> </ul> <ul class="nav navbar-nav navbar-right"> <li> <a href="https://github.com/brimittleman/threeprimeseq"> <span class="fa fa-github"></span> </a> </li> </ul> </div><!--/.nav-collapse --> </div><!--/.container --> </div><!--/.navbar --> <!-- Add a small amount of space between sections. --> <style type="text/css"> div.section { padding-top: 12px; } </style> <div class="fluid-row" id="header"> <h1 class="title toc-ignore">Investigate Peak to Gene Assignment</h1> <h4 class="author"><em>Briana Mittleman</em></h4> <h4 class="date"><em>12/11/2018</em></h4> </div> <p><strong>Last updated:</strong> 2019-01-15</p> <strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small> <ul> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p> <p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p> <p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(12345)</code> </summary></p> <p>The command <code>set.seed(12345)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p> <p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p> </details> </li> <li> <p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/brimittleman/threeprimeseq/tree/d26e5485872212953667d4928890c4b6b766ea83" target="_blank">d26e548</a> </summary></p> Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated: <pre><code> Ignored files: Ignored: .DS_Store Ignored: .Rhistory Ignored: .Rproj.user/ Ignored: data/.DS_Store Ignored: output/.DS_Store Untracked files: Untracked: KalistoAbundance18486.txt Untracked: analysis/DirectionapaQTL.Rmd Untracked: analysis/PreAshExplore.Rmd Untracked: analysis/YL_QTL_test.Rmd Untracked: analysis/ncbiRefSeq_sm.sort.mRNA.bed Untracked: analysis/snake.config.notes.Rmd Untracked: analysis/verifyBAM.Rmd Untracked: code/PeaksToCoverPerReads.py Untracked: code/strober_pc_pve_heatmap_func.R Untracked: data/18486.genecov.txt Untracked: data/APApeaksYL.total.inbrain.bed Untracked: data/ChromHmmOverlap/ Untracked: data/GM12878.chromHMM.bed Untracked: data/GM12878.chromHMM.txt Untracked: data/LianoglouLCL/ Untracked: data/LocusZoom/ Untracked: data/NuclearApaQTLs.txt Untracked: data/PeakCounts/ Untracked: data/PeaksUsed/ Untracked: data/RNAkalisto/ Untracked: data/TotalApaQTLs.txt Untracked: data/Totalpeaks_filtered_clean.bed Untracked: data/UnderstandPeaksQC/ Untracked: data/YL-SP-18486-T-combined-genecov.txt Untracked: data/YL-SP-18486-T_S9_R1_001-genecov.txt Untracked: data/YL_QTL_test/ Untracked: data/apaExamp/ Untracked: data/bedgraph_peaks/ Untracked: data/bin200.5.T.nuccov.bed Untracked: data/bin200.Anuccov.bed Untracked: data/bin200.nuccov.bed Untracked: data/clean_peaks/ Untracked: data/comb_map_stats.csv Untracked: data/comb_map_stats.xlsx Untracked: data/comb_map_stats_39ind.csv Untracked: data/combined_reads_mapped_three_prime_seq.csv Untracked: data/diff_iso_trans/ Untracked: data/ensemble_to_genename.txt Untracked: data/example_gene_peakQuant/ Untracked: data/explainProtVar/ Untracked: data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.bed Untracked: data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.noties.bed Untracked: data/first50lines_closest.txt Untracked: data/gencov.test.csv Untracked: data/gencov.test.txt Untracked: data/gencov_zero.test.csv Untracked: data/gencov_zero.test.txt Untracked: data/gene_cov/ Untracked: data/joined Untracked: data/leafcutter/ Untracked: data/merged_combined_YL-SP-threeprimeseq.bg Untracked: data/mol_overlap/ Untracked: data/mol_pheno/ Untracked: data/nom_QTL/ Untracked: data/nom_QTL_opp/ Untracked: data/nom_QTL_trans/ Untracked: data/nuc6up/ Untracked: data/nuc_10up/ Untracked: data/other_qtls/ Untracked: data/pQTL_otherphen/ Untracked: data/peakPerRefSeqGene/ Untracked: data/perm_QTL/ Untracked: data/perm_QTL_opp/ Untracked: data/perm_QTL_trans/ Untracked: data/perm_QTL_trans_filt/ Untracked: data/reads_mapped_three_prime_seq.csv Untracked: data/smash.cov.results.bed Untracked: data/smash.cov.results.csv Untracked: data/smash.cov.results.txt Untracked: data/smash_testregion/ Untracked: data/ssFC200.cov.bed Untracked: data/temp.file1 Untracked: data/temp.file2 Untracked: data/temp.gencov.test.txt Untracked: data/temp.gencov_zero.test.txt Untracked: data/threePrimeSeqMetaData.csv Untracked: output/picard/ Untracked: output/plots/ Untracked: output/qual.fig2.pdf Unstaged changes: Modified: analysis/28ind.peak.explore.Rmd Modified: analysis/CompareLianoglouData.Rmd Modified: analysis/NewPeakPostMP.Rmd Modified: analysis/apaQTLoverlapGWAS.Rmd Modified: analysis/cleanupdtseq.internalpriming.Rmd Modified: analysis/coloc_apaQTLs_protQTLs.Rmd Modified: analysis/dif.iso.usage.leafcutter.Rmd Modified: analysis/diff_iso_pipeline.Rmd Modified: analysis/explainpQTLs.Rmd Modified: analysis/explore.filters.Rmd Modified: analysis/flash2mash.Rmd Modified: analysis/overlapMolQTL.Rmd Modified: analysis/overlap_qtls.Rmd Modified: analysis/peakOverlap_oppstrand.Rmd Modified: analysis/pheno.leaf.comb.Rmd Modified: analysis/swarmPlots_QTLs.Rmd Modified: analysis/test.max2.Rmd Modified: analysis/understandPeaks.Rmd Modified: code/Snakefile </code></pre> Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details> </li> </ul> <details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary> <ul> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> File </th> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> <th style="text-align:left;"> Message </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/d26e5485872212953667d4928890c4b6b766ea83/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">d26e548</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-15 </td> <td style="text-align:left;"> add correlation plots </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/b5a37f3ed662b0f28780079601d389535b2c529c/docs/InvestigatePeak2GeneAssignment.html" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/d64db58222b47c184a4253170ea8b6fdfe262cec/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">d64db58</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> <td style="text-align:left;"> add correlation plots </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/InvestigatePeak2GeneAssignment.html" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/2a6944b21270570229a60a1d83f84ccbcd6644bc/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">2a6944b</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> <td style="text-align:left;"> sum over ind </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/InvestigatePeak2GeneAssignment.html" target="_blank">c3fd2c4</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b198e3bc15dc55987b3fb83c105856b737ec89a3/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">b198e3b</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> <td style="text-align:left;"> 4kb around end </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/InvestigatePeak2GeneAssignment.html" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/23123cfc46df693415721f21f253e3604d9f4ace/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">23123cf</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> <td style="text-align:left;"> add only genes not nearby </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/InvestigatePeak2GeneAssignment.html" target="_blank">558e8e8</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> <td style="text-align:left;"> Build site. </td> </tr> <tr> <td style="text-align:left;"> Rmd </td> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/0e0840ef800bbe60330b5cabb91047974abed714/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">0e0840e</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> <td style="text-align:left;"> remove overlaping genes </td> </tr> <tr> <td style="text-align:left;"> html </td> <td style="text-align:left;"> <a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/6da90e9c6405976ca4cbeb59c6c07a30365e195f/docs/InvestigatePeak2GeneAssignment.html" target="_blank">6da90e9</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-11 </td> <td style="text-align:left;"> Build site. </td> </tr> </tbody> </table> </ul> <p></details></p> <hr /> <pre class="r"><code>library(tidyverse)</code></pre> <pre><code>── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──</code></pre> <pre><code>✔ ggplot2 3.0.0 ✔ purrr 0.2.5 ✔ tibble 1.4.2 ✔ dplyr 0.7.6 ✔ tidyr 0.8.1 ✔ stringr 1.3.1 ✔ readr 1.1.1 ✔ forcats 0.3.0</code></pre> <pre><code>── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag()</code></pre> <pre class="r"><code>library(workflowr)</code></pre> <pre><code>This is workflowr version 1.1.1 Run ?workflowr for help getting started</code></pre> <pre class="r"><code>library(cowplot)</code></pre> <pre><code> Attaching package: 'cowplot'</code></pre> <pre><code>The following object is masked from 'package:ggplot2': ggsave</code></pre> <pre class="r"><code>library(reshape2)</code></pre> <pre><code> Attaching package: 'reshape2'</code></pre> <pre><code>The following object is masked from 'package:tidyr': smiths</code></pre> <pre class="r"><code>library(tximport)</code></pre> <p>In looking at correlations and some examples, there is evidence the peak to gene assignment may be a problem. I am going to visualize the peaks in IGV. I will name them by the gene and look at them in the browser.</p> <p>The peak to gene annotations used in the feature counts to map reads back to the peaks is the following:<br /> * /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed</p> <p>I need to change this a bit to have the name be the gene rather than the score:</p> <p>NamePeaksByGene.py</p> <pre class="bash"><code>#python CovnamedPeaks=open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed", "r") GeneNamedPeaks=open("/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/PeaksNamedWithGeneAssignment.bed", "w") for ln in CovnamedPeaks: chrom, start, end, num, cov, strand, transcript = ln.split() gene=transcript.split("-")[1] GeneNamedPeaks.write("%s\t%s\t%s\t%s\n"%(chrom,start,end,gene)) GeneNamedPeaks.close() </code></pre> <p>This was made based on the transcript annotation: ncbiRefSeq.mRNA.named.bed</p> <ul> <li>/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed</li> </ul> <p>The ends of the transcripts specfically are in:</p> <ul> <li>/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_endProtCodGenes_sort.txt</li> </ul> <p>Ideas for Dilters:</p> <ul> <li><p>Cant be upstream of the gene, ex: chr2:135,558,075-135,604,343</p></li> <li><p>maybe it cant be in another gene</p></li> <li><p>we should include LINCs</p></li> <li><p>looks like we have a ton of low expressed intergenic peaks that should be filtered before we do the gene annotation</p></li> </ul> <div id="filter-out-intergenic-peaks" class="section level3"> <h3>Filter out intergenic peaks</h3> <p>As a first pass I want to filter out the peaks that are outside a gene body. While this may not be perfect it will help alot with the intergenic noise.</p> <p>I need to overlap the named peaks with /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed and only keep the matches. I can use bedtools intersect.</p> <p>Rename the peaks according to convention to run an intesect.</p> <p>RenamePeaks4Intersect.py</p> <pre class="bash"><code>#python CovnamedPeaks=open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed", "r") GeneNamedPeaks=open("/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed", "w") for ln in CovnamedPeaks: chrom, start, end, num, cov, strand, transcript = ln.split() gene=transcript.split("-")[1] start=int(start) end=int(end) GeneNamedPeaks.write("%s\t%d\t%d\t%s-%s\t%s\t%s\n"%(chrom,start,end,num,gene,cov,strand)) GeneNamedPeaks.close() </code></pre> <p>Remove CHR from the refseq annpotation:</p> <pre class="bash"><code>sed 's/^chr//' /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed > /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</code></pre> <p>Filter4GenicPeaks.sh</p> <pre class="bash"><code> #!/bin/bash #SBATCH --job-name=Filter4GenicPeaks #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=Filter4GenicPeaks.out #SBATCH --error=Filter4GenicPeaks.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed > /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies.bed</code></pre> <p>This is printing them multiple times.</p> <pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies.bed > /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.bed</code></pre> <p>Now I need to make this an SAF to run feature counts.</p> <p>bed2saf_peaksInGenicReg.py</p> <pre class="bash"><code>from misc_helper import * fout = open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF",'w') fout.write("GeneID\tChr\tStart\tEnd\tStrand\n") for ln in open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.bed"): chrom, start, end, name, score, strand = ln.split() namenum=name.split("-")[0] name_i=int(namenum) start_i=int(start) end_i=int(end) gene_only=name.split("-")[1] ID = "peak%d:%s:%d:%d:%s:%s"%(name_i, chrom, start_i, end_i, strand, gene_only) fout.write("%s\t%s\t%d\t%d\t%s\n"%(ID, chrom, start_i, end_i, strand)) fout.close()</code></pre> <p>Run Feature Counts<br /> PeaksinGenicRegion_fc_TN.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=PeaksinGenicRegion_fc_TN #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=PeaksinGenicRegion_fc_TN.out #SBATCH --error=PeaksinGenicRegion_fc_TN.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2 featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2 </code></pre> <p>Lastly I will need to fix the headers.</p> <p>fix_head_fc_genicPeak_tot.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close()</code></pre> <p>fix_head_fc_genicPeak_nuc.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close()</code></pre> <p>Pull these into R and look at the correlation between the sum of the peaks by gene and the transcripts counts from RNA seq.</p> <p>TPM counts from Kalisto</p> <pre class="r"><code>tx2gene=read.table("../data/RNAkalisto/ncbiRefSeq.txn2gene.txt" ,header= F, sep="\t", stringsAsFactors = F) txi.kallisto.tsv <- tximport("../data/RNAkalisto/abundance.tsv", type = "kallisto", tx2gene = tx2gene,countsFromAbundance="lengthScaledTPM" )</code></pre> <pre><code>Note: importing `abundance.h5` is typically faster than `abundance.tsv`</code></pre> <pre><code>reading in files with read_tsv</code></pre> <pre><code>1 removing duplicated transcript rows from tx2gene transcripts missing from tx2gene: 99 summarizing abundance summarizing counts summarizing length</code></pre> <p>In previous analysis I did not account for gene length. Here I am going to standardize by length because I am taking a sum over a gene body.</p> <p>Import gene lengths:</p> <pre class="r"><code>geneLengthNames=c("CHR", "start", "end", "gene", "score", "strand") geneLengths=read.table("../data/UnderstandPeaksQC/refseq.ProteinCoding.bed", header=F, stringsAsFactors = F, col.names = geneLengthNames) %>% mutate(length=end-start) %>% select(gene, length)</code></pre> <p>Look at the correlation with the total:</p> <p>I am using the sum of the counts in a gene divided by how many million reads mapped. I am also filtering out peaks with less than 10 reads in this individual.</p> <pre class="r"><code>total_Cov_18486=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T) %>% filter(X18486_T>0) %>% group_by(gene) %>% summarize(GeneSum=sum(X18486_T)) %>% mutate(GeneSumNorm=GeneSum/10.8) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_abund=as.data.frame(txi.kallisto.tsv$abundance) %>% rownames_to_column(var="gene") colnames(TXN_abund)=c("gene", "TPM") TXN_NormGene=TXN_abund %>% inner_join(total_Cov_18486,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene=TXN_NormGene %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Tot=ggplot(TXN_NormGene, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Total", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.48") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-15-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-15-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-15-1.png" target="_blank">558e8e8</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene) Residuals: Min 1Q Median 3Q Max -3.5894 -0.2556 0.0856 0.3676 2.3387 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.407969 0.013563 177.5 <2e-16 *** log10(GeneSumSt) 0.612175 0.005812 105.3 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.598 on 12053 degrees of freedom Multiple R-squared: 0.4793, Adjusted R-squared: 0.4793 F-statistic: 1.11e+04 on 1 and 12053 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene$TPM),log10(TXN_NormGene$GeneSumSt))</code></pre> <pre><code> Pearson's product-moment correlation data: log10(TXN_NormGene$TPM) and log10(TXN_NormGene$GeneSumSt) t = 105.34, df = 12053, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.6829262 0.7015184 sample estimates: cor 0.6923372 </code></pre> <p>Try this with nuclear</p> <pre class="r"><code>nuclear_Cov_18486=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_N) %>% filter(X18486_N>10) %>% group_by(gene) %>% summarize(GeneSum=sum(X18486_N)) %>% mutate(GeneSumNorm=GeneSum/11.4) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_nuc=TXN_abund %>% inner_join(nuclear_Cov_18486,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_nuc=TXN_NormGene_nuc %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Nuc=ggplot(TXN_NormGene_nuc, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Nuclear", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.37") + geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Nuc </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-18-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-18-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-18-1.png" target="_blank">558e8e8</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_nuc)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_nuc) Residuals: Min 1Q Median 3Q Max -3.7211 -0.2691 0.0733 0.3789 2.5253 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.451150 0.017039 143.85 <2e-16 *** log10(GeneSumSt) 0.654587 0.008008 81.74 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.6324 on 11567 degrees of freedom Multiple R-squared: 0.3661, Adjusted R-squared: 0.3661 F-statistic: 6681 on 1 and 11567 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_nuc$TPM),log10(TXN_NormGene_nuc$GeneSumSt))</code></pre> <pre><code> Pearson's product-moment correlation data: log10(TXN_NormGene_nuc$TPM) and log10(TXN_NormGene_nuc$GeneSumSt) t = 81.74, df = 11567, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.593414 0.616518 sample estimates: cor 0.6050934 </code></pre> <p>This just said it had to be in a gene body not the specific gene body. This could be a problem still. For example in the SSPO locus chr7:149,521,993-149,543,749. Here the peaks are closer to the end of the SSPO but are in the gene body of the next gene downstream.</p> <p>Histones dont have a polyA tail- the HIST1H4C peak is most likely misprimming (chr6:26,102,306-26,110,443)</p> <p>Filter out overlapping genes:</p> <p>Count overlaps in origial file:<br /> <code>bedtools merge -i IN.bed -c 1 -o count > counted</code></p> <p>countGeneOverlap.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=countGeneOverlap #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=countGeneOverlap.out #SBATCH --error=countGeneOverlap.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools merge -i /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -c 1 -o count > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.bed </code></pre> <p>Filter out these rows: <code>awk '/\t1$/{print}' counted > filtered</code></p> <pre class="bash"><code>awk '/\t1$/{print}' /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.bed > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.filtered.bed</code></pre> <p>Intersect with original input to only keep the ones in both sets.<br /> <code>bedtools intersect -a IN.bed -b filtered -wa > OUT.bed</code></p> <p>findGeneswithoutOverlap.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=findGeneswithoutOverlap.sh #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=findGeneswithoutOverlap.out #SBATCH --error=findGeneswithoutOverlap.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.filtered.bed -wa > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes.bed </code></pre> <p>Finally overlap with the mRNA file to only keep the transcripts in these genes. This may be easiest in python /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</p> <p>subsetmRNAforNonOverlapGenes.py</p> <pre class="bash"><code>geneFile=open("/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes.bed", "r") mRNAFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed", "r") outFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes.bed", "w") #make list of non overlapping genes keep=[] for ln in geneFile: keep.append(ln.split()[3]) for ln in mRNAFile: if ln.split()[4] in keep: outFile.write(ln) outFile.close() </code></pre> <p>Filter peaks on this resutls</p> <p>Filter4GenicPeaks_noOverlap.sh</p> <pre class="bash"><code> #!/bin/bash #SBATCH --job-name=Filter4GenicPeaks #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=Filter4GenicPeaks.out #SBATCH --error=Filter4GenicPeaks.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes.bed> /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap.bed</code></pre> <p>This is printing them multiple times.</p> <pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap.bed > /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.bed</code></pre> <p>Make this an SAF to run FC</p> <p>bed2saf_peaksInGenicReg_noOVERLAP.py</p> <pre class="bash"><code>from misc_helper import * fout = open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF",'w') fout.write("GeneID\tChr\tStart\tEnd\tStrand\n") for ln in open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.bed"): chrom, start, end, name, score, strand = ln.split() namenum=name.split("-")[0] name_i=int(namenum) start_i=int(start) end_i=int(end) gene_only=name.split("-")[1] ID = "peak%d:%s:%d:%d:%s:%s"%(name_i, chrom, start_i, end_i, strand, gene_only) fout.write("%s\t%s\t%d\t%d\t%s\n"%(ID, chrom, start_i, end_i, strand)) fout.close()</code></pre> <p>Run Feature Counts<br /> PeaksinGenicRegion_NoneOverlapGenes_fc_TN.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=PeaksinGenicRegion_NoneOverlapGenes_fc_TN #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=PeaksinGenicRegion_NoneOverlapGenes_fc_TN.out #SBATCH --error=PeaksinGenicRegion_NoneOverlapGenes_fc_TN.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2 featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2 </code></pre> <p>Lastly I will need to fix the headers.</p> <p>fix_head_fc_genicPeakNoOverlap_tot.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close()</code></pre> <p>fix_head_fc_genicPeakNoOverlap_nuc.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close() </code></pre> <p>Pull these onto my computer:</p> <p>Use no filter and standardization scheme:</p> <pre class="r"><code>total_Cov_18486_noOver=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T)%>% group_by(gene) %>% summarize(GeneSum=sum(X18486_T)) %>% mutate(GeneSumNorm=GeneSum/10.8) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlap=TXN_abund %>% inner_join(total_Cov_18486_noOver,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlap=TXN_NormGene_noOverlap %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Tot_noOver=ggplot(TXN_NormGene_noOverlap, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Total", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.49") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') + geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot_noOver </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-31-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-31-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-31-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap) Residuals: Min 1Q Median 3Q Max -3.3953 -0.2604 0.0768 0.3638 2.8920 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.435113 0.014849 163.99 <2e-16 *** log10(GeneSumSt) 0.616937 0.006274 98.34 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5973 on 10088 degrees of freedom Multiple R-squared: 0.4894, Adjusted R-squared: 0.4894 F-statistic: 9671 on 1 and 10088 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap$TPM),log10(TXN_NormGene_noOverlap$GeneSumSt))</code></pre> <pre><code> Pearson's product-moment correlation data: log10(TXN_NormGene_noOverlap$TPM) and log10(TXN_NormGene_noOverlap$GeneSumSt) t = 98.339, df = 10088, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.6894966 0.7094251 sample estimates: cor 0.6995969 </code></pre> <pre class="r"><code>nuclear_Cov_18486_noOver=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_N) %>% group_by(gene) %>% summarize(GeneSum=sum(X18486_N)) %>% mutate(GeneSumNorm=GeneSum/11.4) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlap_nuc=TXN_abund %>% inner_join(nuclear_Cov_18486_noOver,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlap_nuc=TXN_NormGene_noOverlap_nuc %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Nuc_noOver=ggplot(TXN_NormGene_noOverlap_nuc, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Nuclear", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.5") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Nuc_noOver </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-34-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-34-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-34-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_nuc)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_nuc) Residuals: Min 1Q Median 3Q Max -3.6947 -0.2989 0.0630 0.3825 2.8300 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.530468 0.016063 157.5 <2e-16 *** log10(GeneSumSt) 0.708816 0.006819 103.9 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.647 on 10965 degrees of freedom Multiple R-squared: 0.4963, Adjusted R-squared: 0.4963 F-statistic: 1.08e+04 on 1 and 10965 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_nuc$TPM),log10(TXN_NormGene_noOverlap_nuc$GeneSumSt),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_nuc$TPM), log10(TXN_NormGene_noOverlap_nuc$GeneSumSt), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_nuc$TPM) and log10(TXN_NormGene_noOverlap_nuc$GeneSumSt) S = 6.8486e+10, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6884764 </code></pre> <p>It looks like a more stringent RNA seq filter could help. Lets say log(TPM)>-1.5</p> <pre class="r"><code>TXN_abund_filt=TXN_abund %>% filter(log(TPM) > -1.5) TXN_NormGene_noOverlap_filt=TXN_abund_filt %>% inner_join(total_Cov_18486_noOver,by="gene") TXN_NormGene_noOverlap_filt=TXN_NormGene_noOverlap_filt %>% filter(TPM >0) %>% filter(GeneSumSt>0) corr_18486Tot_noOver_filt=ggplot(TXN_NormGene_noOverlap_filt, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Total filtered", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.49") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot_noOver_filt </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-35-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-35-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-35-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_filt)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_filt) Residuals: Min 1Q Median 3Q Max -3.05828 -0.26269 0.03857 0.31263 2.64091 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.344999 0.012950 181.08 <2e-16 *** log10(GeneSumSt) 0.544908 0.005639 96.63 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.4998 on 9677 degrees of freedom Multiple R-squared: 0.4911, Adjusted R-squared: 0.491 F-statistic: 9338 on 1 and 9677 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_filt$TPM),log10(TXN_NormGene_noOverlap_filt$GeneSumSt),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_filt$TPM), log10(TXN_NormGene_noOverlap_filt$GeneSumSt), : Cannot compute exact p- value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_filt$TPM) and log10(TXN_NormGene_noOverlap_filt$GeneSumSt) S = 4.7005e+10, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6889706 </code></pre> <p>This does not take care of genes that are near each other such as what is going on with SSPO</p> </div> <div id="filter-genes-that-are-too-close-together" class="section level2"> <h2>Filter genes that are too close together</h2> <p>This is similar to the overlap problem but I want to extend the genes.</p> <p>I can a python script to subtract 10000 bases from the start and add 10000 to the end</p> <p>*/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed</p> <p>extendGenes.py</p> <pre class="bash"><code>inFile=open("/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed", "r") outFile=open("/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR_EXT.bed", "w")\ for ln in inFile: chrom, start, end, gene, score, strand = ln.split() start_ex=int(start)-10000 if start_ex <0: start_ex=0 end_ex=int(end)+10000 outFile.write("%s\t%d\t%d\t%s\t%s\t%s\n"%(chrom, start_ex, end_ex, gene, score, strand)) outFile.close()</code></pre> <p>Now I can run the merge:</p> <p>countGeneOverlap_EXT.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=countGeneOverlap_EXT #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=countGeneOverlap_EXT.out #SBATCH --error=countGeneOverlap_EXT.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools merge -i /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR_EXT.bed -c 1 -o count > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT.bed </code></pre> <p>Filter out these rows: <code>awk '/\t1$/{print}' counted > filtered</code></p> <pre class="bash"><code>awk '/\t1$/{print}' /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT.bed > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT_filter.bed</code></pre> <p>Intersect with original input to only keep the ones in both sets.<br /> <code>bedtools intersect -a IN.bed -b filtered -wa > OUT.bed</code></p> <p>findGeneswithoutOverlap_EXT.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=findGeneswithoutOverlap_EXT.sh #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=findGeneswithoutOverlap_EXT.out #SBATCH --error=findGeneswithoutOverlap_EXT.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT_filter.bed -wa > /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes_EXT.bed </code></pre> <p>Finally overlap with the mRNA file to only keep the transcripts in these genes. This may be easiest in python /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</p> <p>subsetmRNAforNonOverlapGenes_EXT.py</p> <pre class="bash"><code>geneFile=open("/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes_EXT.bed", "r") mRNAFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed", "r") outFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes_EXT.bed", "w") #make list of non overlapping genes keep=[] for ln in geneFile: keep.append(ln.split()[3]) for ln in mRNAFile: if ln.split()[4] in keep: outFile.write(ln) outFile.close() </code></pre> <p>Filter4GenicPeaks_noOverlap_EXT.sh</p> <pre class="bash"><code> #!/bin/bash #SBATCH --job-name=Filter4GenicPeaks_EXT #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=Filter4GenicPeaks_EXT.out #SBATCH --error=Filter4GenicPeaks_EXT.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes_EXT.bed > /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT.bed</code></pre> <p>This is printing them multiple times.</p> <pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT.bed > /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.bed</code></pre> <p>Make this an SAF to run FC</p> <p>bed2saf_peaksInGenicReg_noOVERLAPEXT.py</p> <pre class="bash"><code>from misc_helper import * fout = open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF",'w') fout.write("GeneID\tChr\tStart\tEnd\tStrand\n") for ln in open("/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.bed"): chrom, start, end, name, score, strand = ln.split() namenum=name.split("-")[0] name_i=int(namenum) start_i=int(start) end_i=int(end) gene_only=name.split("-")[1] ID = "peak%d:%s:%d:%d:%s:%s"%(name_i, chrom, start_i, end_i, strand, gene_only) fout.write("%s\t%s\t%d\t%d\t%s\n"%(ID, chrom, start_i, end_i, strand)) fout.close()</code></pre> <p>Run Feature Counts<br /> PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.out #SBATCH --error=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2 featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2 </code></pre> <p>Lastly I will need to fix the headers.</p> <p>fix_head_fc_genicPeakNoOverlapEXT_tot.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close()</code></pre> <p>fix_head_fc_genicPeakNoOverlapEXT_nuc.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close() </code></pre> <p>Pull these onto my computer:</p> <p>Use no filter and standardization scheme:</p> <pre class="r"><code>total_Cov_18486_noOverEXT=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T)%>% group_by(gene) %>% summarize(GeneSum=sum(X18486_T)) %>% mutate(GeneSumNorm=GeneSum/10.8)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlapEXT=TXN_abund %>% inner_join(total_Cov_18486_noOverEXT,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlapEXT_n=TXN_NormGene_noOverlapEXT %>% filter(TPM>0) %>% filter(GeneSumNorm>0) corr_18486Tot_noOverEXT=ggplot(TXN_NormGene_noOverlapEXT_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title="Total", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = "lm") + annotate("text",x=3, y=5,label="R2=.42") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot_noOverEXT </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-49-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">c3fd2c4</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlapEXT_n)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlapEXT_n) Residuals: Min 1Q Median 3Q Max -3.6242 -0.2842 0.0307 0.3445 2.9033 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.18874 0.02195 8.598 <2e-16 *** log10(GeneSumNorm) 0.66519 0.01540 43.200 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.6428 on 2542 degrees of freedom Multiple R-squared: 0.4234, Adjusted R-squared: 0.4231 F-statistic: 1866 on 1 and 2542 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlapEXT_n$TPM),log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlapEXT_n$TPM), log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm), : Cannot compute exact p- value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlapEXT_n$TPM) and log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm) S = 880650000, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6790751 </code></pre> <p>In this test, I was looking at 2600 genes.</p> <pre class="r"><code>nuclear_Cov_18486_noOverEXT=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_N) %>% group_by(gene) %>% summarize(GeneSum=sum(X18486_N)) %>% mutate(GeneSumNorm=GeneSum/11.4) </code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlap_nucEXT=TXN_abund %>% inner_join(nuclear_Cov_18486_noOverEXT,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlap_nucEXT_n=TXN_NormGene_noOverlap_nucEXT %>% filter(TPM>0) %>% filter(GeneSumNorm>0) corr_18486Nuc_noOverEXT=ggplot(TXN_NormGene_noOverlap_nucEXT_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title="Nuclear", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = "lm") + annotate("text",x=3, y=5,label="R2=.43") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Nuc_noOverEXT </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-52-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">c3fd2c4</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlap_nucEXT_n)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlap_nucEXT_n) Residuals: Min 1Q Median 3Q Max -3.4278 -0.3689 0.0112 0.3873 3.2887 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.02478 0.02179 1.137 0.256 log10(GeneSumNorm) 0.68406 0.01470 46.527 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.7076 on 2836 degrees of freedom Multiple R-squared: 0.4329, Adjusted R-squared: 0.4327 F-statistic: 2165 on 1 and 2836 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_nucEXT_n$TPM),log10(TXN_NormGene_noOverlap_nucEXT_n$GeneSumNorm),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_nucEXT_n$TPM), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_nucEXT_n$TPM) and log10(TXN_NormGene_noOverlap_nucEXT_n$GeneSumNorm) S = 1351600000, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6452286 </code></pre> <p>In this test, I was looking at 2900 genes.</p> <p>I can further ssubset for genes with only 1 peak.</p> <pre class="r"><code>OnePeakGenesTotalnoOver=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T)%>% group_by(gene) %>% tally() %>% filter(n>=3) total_Cov_18486_noOverEXT_1Peak=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T)%>% group_by(gene) %>% summarize(GeneSum=sum(X18486_T)) %>% mutate(GeneSumNorm=GeneSum/10.8) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length) %>% semi_join(OnePeakGenesTotalnoOver, by="gene")</code></pre> <pre class="r"><code>TXN_NormGene_noOverlapEXT_1peak=TXN_abund %>% inner_join(total_Cov_18486_noOverEXT_1Peak,by="gene")</code></pre> <pre class="r"><code>TXN_NormGene_noOverlapEXT_1peak_n=TXN_NormGene_noOverlapEXT_1peak %>% filter(TPM>0) %>% filter(GeneSumNorm>0) corr_18486Tot_noOverEXT=ggplot(TXN_NormGene_noOverlapEXT_1peak_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title="Total-nonOverlapping Genes with 3 peaks", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = "lm") + annotate("text",x=3, y=5,label="R2=.4") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot_noOverEXT </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-55-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" target="_blank">cbf986c</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-12 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlapEXT_1peak_n)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlapEXT_1peak_n) Residuals: Min 1Q Median 3Q Max -3.2185 -0.2785 0.0258 0.3296 2.7657 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.27643 0.02297 12.03 <2e-16 *** log10(GeneSumNorm) 0.61688 0.01570 39.30 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5918 on 2366 degrees of freedom Multiple R-squared: 0.395, Adjusted R-squared: 0.3947 F-statistic: 1544 on 1 and 2366 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM),log10(TXN_NormGene_noOverlapEXT_1peak_n$GeneSumNorm),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM) and log10(TXN_NormGene_noOverlapEXT_1peak_n$GeneSumNorm) S = 752880000, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6598005 </code></pre> </div> <div id="around-gene-end" class="section level2"> <h2>Around gene end</h2> <p>There are still problems with peaks in the downstream gene. I need to have a filter that a peak needs to be within a certain distance from the end of a gene. I can change the end of gene file to have 2kb around each gene end and then work with peaks in this area. I want it to be 2kb into the gene.</p> <p>I can filter the peaks to only those in these regions.</p> <p>EndOfGenes_4kbaround.py</p> <pre class="bash"><code>#python inFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_endProtCodGenes_sort_withscore.bed", "r") outFile=open("/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_4KBaroundEnd_sort_withscore.bed", "w")\ for ln in inFile: chrom, start, end, gene, score, strand = ln.split() start_ex=int(start)-2000 if start_ex <0: start_ex=0 end_ex=int(end)+2000 outFile.write("%s\t%d\t%d\t%s\t%s\t%s\n"%(chrom, start_ex, end_ex, gene, score, strand)) outFile.close()</code></pre> <p>Now I intersect this with the peaks file.</p> <p>FilterPeaks4KBaroundend.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=FilterPeaks4KBaroundend #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=FilterPeaks4KBaroundend.out #SBATCH --error=FilterPeaks4KBaroundend.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env bedtools intersect -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq_4KBaroundEnd_sort_withscore.bed -wa -s > /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.bed</code></pre> <p>This is printing them multiple times.</p> <pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.bed > /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.bed</code></pre> <p>Make this an SAF to run FC</p> <p>bed2saf_peaksInGenicReg_4kbaround.py</p> <pre class="bash"><code>from misc_helper import * fout = open("/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF",'w') fout.write("GeneID\tChr\tStart\tEnd\tStrand\n") for ln in open("/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.bed"): chrom, start, end, name, score, strand = ln.split() namenum=name.split("-")[0] name_i=int(namenum) start_i=int(start) end_i=int(end) gene_only=name.split("-")[1] ID = "peak%d:%s:%d:%d:%s:%s"%(name_i, chrom, start_i, end_i, strand, gene_only) fout.write("%s\t%s\t%d\t%d\t%s\n"%(ID, chrom, start_i, end_i, strand)) fout.close()</code></pre> <p>Run feature counts with this.<br /> Peaks4kbAround_fc_TN.sh</p> <pre class="bash"><code>#!/bin/bash #SBATCH --job-name=Peaks4kbAround_fc_TN #SBATCH --account=pi-yangili1 #SBATCH --time=24:00:00 #SBATCH --output=Peaks4kbAround_fc_TN.out #SBATCH --error=Peaks4kbAround_fc_TN.err #SBATCH --partition=broadwl #SBATCH --mem=12G #SBATCH --mail-type=END module load Anaconda3 source activate three-prime-env featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2 featureCounts -O -a //project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2 </code></pre> <p>fix_head_fc_Peaks4kbAround_tot.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close()</code></pre> <p>fix_head_fc_gPeaks4kbAround_nuc.py</p> <pre class="bash"><code>infile= open("/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear.fc", "r") fout = open("/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear_fixed.fc",'w') for line, i in enumerate(infile): if line == 1: i_list=i.split() libraries=i_list[:6] for sample in i_list[6:]: full = sample.split("/")[7] samp= full.split("-")[2:4] lim="_" samp_st=lim.join(samp) libraries.append(samp_st) first_line= "\t".join(libraries) fout.write(first_line + '\n') else : fout.write(i) fout.close() </code></pre> <p>Use no filter and standardization scheme:</p> <pre class="r"><code>total_Cov_18486_4kb=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_T)%>% group_by(gene) %>% filter(X18486_T>10) %>% summarize(GeneSum=sum(X18486_T)) %>% mutate(GeneSumNorm=GeneSum/10.8) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_4kb=TXN_abund %>% inner_join(total_Cov_18486_4kb,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_4kb_n=TXN_NormGene_4kb %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Tot_noOver4kb=ggplot(TXN_NormGene_4kb_n, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Total", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.34") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') +geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Tot_noOver4kb </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-65-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">c3fd2c4</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_4kb_n)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_4kb_n) Residuals: Min 1Q Median 3Q Max -3.07128 -0.26438 0.05285 0.31624 2.23108 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.200525 0.013284 165.65 <2e-16 *** log10(GeneSumSt) 0.409498 0.005733 71.43 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5116 on 9738 degrees of freedom Multiple R-squared: 0.3438, Adjusted R-squared: 0.3437 F-statistic: 5102 on 1 and 9738 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_4kb_n$TPM),log10(TXN_NormGene_4kb_n$GeneSumSt),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_4kb_n$TPM), log10(TXN_NormGene_4kb_n$GeneSumSt), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_4kb_n$TPM) and log10(TXN_NormGene_4kb_n$GeneSumSt) S = 6.5566e+10, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.5742507 </code></pre> <pre class="r"><code>nuclear_Cov_18486_4kb=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear_fixed.fc", header=T, stringsAsFactors = F)[,1:7] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18486_N) %>% group_by(gene) %>%filter(X18486_N>10) %>% summarize(GeneSum=sum(X18486_N)) %>% mutate(GeneSumNorm=GeneSum/11.4) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlap_4kb=TXN_abund %>% inner_join(nuclear_Cov_18486_4kb,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlap_4kb_n=TXN_NormGene_noOverlap_4kb %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18486Nuc_4kb=ggplot(TXN_NormGene_noOverlap_4kb_n, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Nuclear", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.26") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') + geom_text(aes(label=gene),hjust=0, vjust=0) corr_18486Nuc_4kb </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-68-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">c3fd2c4</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_4kb_n)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_4kb_n) Residuals: Min 1Q Median 3Q Max -3.3104 -0.3004 0.0465 0.3451 2.2798 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.203466 0.016119 136.70 <2e-16 *** log10(GeneSumSt) 0.402256 0.006733 59.74 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5572 on 10000 degrees of freedom Multiple R-squared: 0.263, Adjusted R-squared: 0.2629 F-statistic: 3569 on 1 and 10000 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_4kb_n$TPM),log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_4kb_n$TPM), log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt), : Cannot compute exact p- value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_4kb_n$TPM) and log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt) S = 8.3048e+10, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.502012 </code></pre> <p>From here I can subset down to just the genes in the nonoverlapping set.</p> <pre class="r"><code>TXN_NormGene_noOverlap_4kb_noOver=TXN_NormGene_noOverlap_4kb_n %>% semi_join(TXN_NormGene_noOverlapEXT, by="gene") corr_18486Nuc_4kb_noOVer=ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Nuclear", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.27") #+geom_text(aes(label=gene),hjust=0, vjust=0)+geom_density2d(na.rm = TRUE, size = 1, colour = 'red') corr_18486Nuc_4kb_noOVer </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-69-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-69-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-69-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_4kb_noOver))</code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_4kb_noOver) Residuals: Min 1Q Median 3Q Max -2.61075 -0.31776 0.04514 0.34370 2.19168 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.27850 0.04047 56.30 <2e-16 *** log10(GeneSumSt) 0.41540 0.01509 27.53 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.5504 on 2006 degrees of freedom Multiple R-squared: 0.2742, Adjusted R-squared: 0.2739 F-statistic: 757.9 on 1 and 2006 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_4kb_noOver$TPM),log10(TXN_NormGene_noOverlap_4kb_noOver$GeneSumSt) ,method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_4kb_noOver$TPM), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_4kb_noOver$TPM) and log10(TXN_NormGene_noOverlap_4kb_noOver$GeneSumSt) S = 664430000, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.5076096 </code></pre> <p>Look at curves for the TPM and standards.</p> <pre class="r"><code> ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(GeneSumSt))) + geom_density()</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-70-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(TPM))) + geom_density()</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-2.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-70-2.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-2.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> </div> <div id="separate-by-percentile-gene-expression" class="section level2"> <h2>Separate by percentile gene expression</h2> </div> <div id="check-another-line" class="section level2"> <h2>Check another line</h2> <pre class="r"><code>total_Cov_18497_noOver=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc", header=T, stringsAsFactors = F)[,1:8] %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") %>% select(gene, X18497_T)%>% group_by(gene) %>% summarize(GeneSum=sum(X18497_T)) %>% mutate(GeneSumNorm=GeneSum/10.8) %>% inner_join(geneLengths, by="gene") %>% mutate(GeneSumSt=GeneSum/length)</code></pre> <p>Join the data frames.</p> <pre class="r"><code>TXN_NormGene_noOverlap_497=TXN_abund %>% inner_join(total_Cov_18497_noOver,by="gene")</code></pre> <p>Remove rows with 0 counts and Plot:</p> <pre class="r"><code>TXN_NormGene_noOverlap_497=TXN_NormGene_noOverlap_497 %>% filter(TPM>0) %>% filter(GeneSumSt>0) corr_18497Tot_noOver=ggplot(TXN_NormGene_noOverlap_497, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title="Total", x="log10 RNA seq TPM", y="log10 Peak count sum per gene")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = "lm") + annotate("text",x=3, y=5,label="R2=.49") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_18497Tot_noOver </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-73-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-73-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-73-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_497)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_497) Residuals: Min 1Q Median 3Q Max -3.5597 -0.2929 0.0903 0.3993 2.8953 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.413566 0.015014 160.8 <2e-16 *** log10(GeneSumSt) 0.650761 0.006389 101.9 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.6346 on 10739 degrees of freedom Multiple R-squared: 0.4914, Adjusted R-squared: 0.4913 F-statistic: 1.037e+04 on 1 and 10739 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_497$TPM),log10(TXN_NormGene_noOverlap_497$GeneSumSt),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_497$TPM), log10(TXN_NormGene_noOverlap_497$GeneSumSt), : Cannot compute exact p-value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_NormGene_noOverlap_497$TPM) and log10(TXN_NormGene_noOverlap_497$GeneSumSt) S = 6.3783e+10, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6911702 </code></pre> <p>This shows me the effect is not line specific because the correlation is the same when i use the RNA from one line and 3’ from another.</p> </div> <div id="sum-over-all-individals" class="section level2"> <h2>Sum over all individals</h2> <p>First I will get the total counts for the non overlapping set:</p> <pre class="r"><code>TotCounts_nonOverlapEX=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F)[,7:45] SumCounts_Tot=rowSums(TotCounts_nonOverlapEX) Alllib_Tot_nonOverlapEX=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F) %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") Alllib_Tot_nonOverlapEX$SumCounts=SumCounts_Tot Alllib_Tot_nonOverlapEX_bygene=Alllib_Tot_nonOverlapEX %>% select(gene, SumCounts) %>% group_by(gene) %>% summarize(GeneSum=sum(SumCounts)) %>% mutate(GeneSumNorm=GeneSum/11.4) TXN_abund_combLibs_tot=TXN_abund %>% inner_join(Alllib_Tot_nonOverlapEX_bygene,by="gene") TXN_abund_combLibs_tot_n0=TXN_abund_combLibs_tot %>% filter(TPM>0) %>% filter(GeneSumNorm>0) corr_AllLibTot_noOver=ggplot(TXN_abund_combLibs_tot_n0, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title="Total All ind", x="log10 RNA seq TPM", y="log10 Peak count sum per gene All Ind.")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = "lm") + annotate("text",x=-2, y=5,label="R2=.55") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_AllLibTot_noOver </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-74-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_abund_combLibs_tot_n0)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_abund_combLibs_tot_n0) Residuals: Min 1Q Median 3Q Max -3.2653 -0.3200 0.0183 0.3435 3.4494 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.34124 0.03578 -37.48 <2e-16 *** log10(GeneSumNorm) 0.78488 0.01279 61.38 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.6682 on 3079 degrees of freedom Multiple R-squared: 0.5503, Adjusted R-squared: 0.5501 F-statistic: 3767 on 1 and 3079 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_abund_combLibs_tot_n0$TPM),log10(TXN_abund_combLibs_tot_n0$GeneSumNorm),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_abund_combLibs_tot_n0$TPM), log10(TXN_abund_combLibs_tot_n0$GeneSumNorm), : Cannot compute exact p- value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_abund_combLibs_tot_n0$TPM) and log10(TXN_abund_combLibs_tot_n0$GeneSumNorm) S = 1.211e+09, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.7515536 </code></pre> <p>Same for nuclear</p> <pre class="r"><code>NucCounts_nonOverlapEX=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc", header=T, stringsAsFactors = F)[,7:45] SumCounts_Nuc=rowSums(NucCounts_nonOverlapEX) Alllib_Nuc_nonOverlapEX=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc", header=T, stringsAsFactors = F) %>% separate(Geneid, into=c("peak", "chr", "start", "end", "strand", "gene"), sep=":") Alllib_Nuc_nonOverlapEX$SumCounts=SumCounts_Nuc Alllib_Nuc_nonOverlapEX_bygene=Alllib_Nuc_nonOverlapEX %>% select(gene, SumCounts) %>% group_by(gene) %>% summarize(GeneSum=sum(SumCounts)) %>% mutate(GeneSumNorm=GeneSum/10.8) TXN_abund_combLibs_Nuc=TXN_abund %>% inner_join(Alllib_Nuc_nonOverlapEX_bygene,by="gene") TXN_abund_combLibs_Nuc_n0=TXN_abund_combLibs_Nuc%>% filter(TPM>0) %>% filter(GeneSumNorm>0) corr_AllLibNuc_noOver=ggplot(TXN_abund_combLibs_Nuc_n0, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title="Nuclear All ind", x="log10 RNA seq TPM", y="log10 Peak count sum per gene All Ind.")+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = "lm") + annotate("text",x=-2, y=5,label="R2=.48") +geom_density2d(na.rm = TRUE, size = 1, colour = 'red') #+ geom_text(aes(label=gene),hjust=0, vjust=0) corr_AllLibNuc_noOver </code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-75-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-75-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-75-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_abund_combLibs_Nuc_n0)) </code></pre> <pre><code> Call: lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_abund_combLibs_Nuc_n0) Residuals: Min 1Q Median 3Q Max -3.3980 -0.3912 0.0130 0.4109 3.3698 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.40221 0.04238 -33.08 <2e-16 *** log10(GeneSumNorm) 0.75348 0.01427 52.78 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.7219 on 3079 degrees of freedom Multiple R-squared: 0.475, Adjusted R-squared: 0.4749 F-statistic: 2786 on 1 and 3079 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>cor.test(log10(TXN_abund_combLibs_Nuc_n0$TPM),log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm),method="spearman")</code></pre> <pre><code>Warning in cor.test.default(log10(TXN_abund_combLibs_Nuc_n0$TPM), log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm), : Cannot compute exact p- value with ties</code></pre> <pre><code> Spearman's rank correlation rho data: log10(TXN_abund_combLibs_Nuc_n0$TPM) and log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm) S = 1587500000, p-value < 2.2e-16 alternative hypothesis: true rho is not equal to 0 sample estimates: rho 0.6743196 </code></pre> </div> <div id="compare-2-3-seq." class="section level2"> <h2>Compare 2 3’ Seq.</h2> <pre class="r"><code>TotCounts_nonOverlapEX=read.table("../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc", header=T, stringsAsFactors = F) summary(lm(TotCounts_nonOverlapEX$X18486_T~TotCounts_nonOverlapEX$X19238_T))</code></pre> <pre><code> Call: lm(formula = TotCounts_nonOverlapEX$X18486_T ~ TotCounts_nonOverlapEX$X19238_T) Residuals: Min 1Q Median 3Q Max -10543.0 -2.6 1.3 1.3 11904.2 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.2603937 0.3532729 -3.568 0.00036 *** TotCounts_nonOverlapEX$X19238_T 0.7807420 0.0009045 863.135 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 101.2 on 82478 degrees of freedom Multiple R-squared: 0.9003, Adjusted R-squared: 0.9003 F-statistic: 7.45e+05 on 1 and 82478 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>summary(lm(TotCounts_nonOverlapEX$X18486_T~TotCounts_nonOverlapEX$X19238_T))</code></pre> <pre><code> Call: lm(formula = TotCounts_nonOverlapEX$X18486_T ~ TotCounts_nonOverlapEX$X19238_T) Residuals: Min 1Q Median 3Q Max -10543.0 -2.6 1.3 1.3 11904.2 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.2603937 0.3532729 -3.568 0.00036 *** TotCounts_nonOverlapEX$X19238_T 0.7807420 0.0009045 863.135 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 101.2 on 82478 degrees of freedom Multiple R-squared: 0.9003, Adjusted R-squared: 0.9003 F-statistic: 7.45e+05 on 1 and 82478 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>summary(lm(TotCounts_nonOverlapEX$X19238_T~TotCounts_nonOverlapEX$X19225_T))</code></pre> <pre><code> Call: lm(formula = TotCounts_nonOverlapEX$X19238_T ~ TotCounts_nonOverlapEX$X19225_T) Residuals: Min 1Q Median 3Q Max -13986.1 -7.7 -6.1 -2.1 12207.9 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.146796 0.454280 13.53 <2e-16 *** TotCounts_nonOverlapEX$X19225_T 0.837322 0.001034 809.62 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 130.3 on 82478 degrees of freedom Multiple R-squared: 0.8882, Adjusted R-squared: 0.8882 F-statistic: 6.555e+05 on 1 and 82478 DF, p-value: < 2.2e-16</code></pre> <pre class="r"><code>plot(log10(TotCounts_nonOverlapEX$X19238_T)~log10(TotCounts_nonOverlapEX$X19225_T))</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-76-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-76-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-76-1.png" target="_blank">69b5162</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2018-12-13 </td> </tr> </tbody> </table> <p></details></p> <p>Do this withough filter:</p> <pre class="r"><code>total_Cov=read.table("../data/PeakCounts/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant.Total_fixed.fc", header=T, stringsAsFactors = F)[7:45] plot(log10(total_Cov$X19238_T)~log10(total_Cov$X19225_T))</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-77-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-77-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-77-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> </tbody> </table> <p></details></p> <pre class="r"><code>summary(lm(total_Cov$X19238_T~total_Cov$X19225_T))</code></pre> <pre><code> Call: lm(formula = total_Cov$X19238_T ~ total_Cov$X19225_T) Residuals: Min 1Q Median 3Q Max -28832.1 -8.0 -6.3 -2.6 25408.6 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.3159764 0.2977179 21.21 <2e-16 *** total_Cov$X19225_T 0.8266119 0.0004396 1880.39 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 172.9 on 338139 degrees of freedom Multiple R-squared: 0.9127, Adjusted R-squared: 0.9127 F-statistic: 3.536e+06 on 1 and 338139 DF, p-value: < 2.2e-16</code></pre> <p>Do this with ggplot</p> <pre class="r"><code>ggplot(total_Cov, aes(x = log10(X19238_T) , y=log10(X19225_T))) + geom_point() + labs(title="Relationship between 2 Total 3' Seq Peak Counts", x="log10(19238)",y="log10(19225)")</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-78-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-78-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-78-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> </tbody> </table> <p></details></p> <p>I want to do this for all and make a heat plot with the correlations:</p> <pre class="r"><code>cor_function=function(data){ corr_matrix= matrix(0,dim(data)[[2]],dim(data)[[2]]) for (i in seq(1,dim(data)[[2]])){ for (j in seq(1,dim(data)[[2]])){ x=cor.test(data[,i], data[,j], method='pearson') cor_ij=as.numeric(x$estimate) corr_matrix[i,j]=cor_ij } } return(corr_matrix) }</code></pre> <pre class="r"><code>total_corr=cor_function(as.matrix(total_Cov))</code></pre> <pre class="r"><code>total_corr_melt=melt(total_corr) ggheatmap=ggplot(data = total_corr_melt, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + labs(title="Library Peak Count Correlation Heatplot: Total", x="", y="")+ scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = "Correlation", low = "#FFFFFF", high = "darkviolet") ggheatmap</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-81-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-81-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-81-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> </tbody> </table> <p></details></p> <p>mean correlation .89</p> <p>DO this for nuclear:</p> <pre class="r"><code>nuclear_Cov=read.table("../data/PeakCounts/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant.Nuclear_fixed.fc", header=T, stringsAsFactors = F)[7:45] </code></pre> <pre class="r"><code>nuclear_corr=cor_function(as.matrix(nuclear_Cov)) nuclear_corr_melt=melt(nuclear_corr) ggheatmap_nuc=ggplot(data = nuclear_corr_melt, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + labs(title="Library Peak Count Correlation Heatplot: Nuclear", x="", y="")+ scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = "Correlation", low = "#FFFFFF", high = "deepskyblue3") ggheatmap_nuc</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-83-1.png" width="672" style="display: block; margin: auto;" /></p> <details> <summary><em>Expand here to see past versions of unnamed-chunk-83-1.png:</em></summary> <table style="border-collapse:separate; border-spacing:5px;"> <thead> <tr> <th style="text-align:left;"> Version </th> <th style="text-align:left;"> Author </th> <th style="text-align:left;"> Date </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> <a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-83-1.png" target="_blank">b5a37f3</a> </td> <td style="text-align:left;"> Briana Mittleman </td> <td style="text-align:left;"> 2019-01-11 </td> </tr> </tbody> </table> <p></details></p> <p>Mean correlation is .74</p> <p>Make a full one with both:</p> <pre class="r"><code>bothFrac=cbind(nuclear_Cov,total_Cov) bothFrac_corr=cor_function(bothFrac) bothFrac_corr_melt=melt(bothFrac_corr) ggheatmap_both=ggplot(data = bothFrac_corr_melt, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + labs(title="Library Peak Count Correlation Heatplot", x="", y="")+ scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = "Correlation", low = "#FFFFFF", high = "coral1") ggheatmap_both</code></pre> <p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-84-1.png" width="672" style="display: block; margin: auto;" /></p> <p>Make a boxplot with the correlations:</p> <pre class="r"><code>Fraction=c("Total", "Nuclear", "Both") CorrelationMean=c(mean(total_corr),mean(nuclear_corr), mean(bothFrac_corr)) CorrelationSD=c(sd(total_corr), sd(nuclear_corr), sd(bothFrac_corr)) CorrelationDF=as.data.frame(cbind(Fraction,CorrelationMean,CorrelationSD)) CorrelationDF$CorrelationMean=as.numeric(as.character(CorrelationDF$CorrelationMean)) CorrelationDF$CorrelationSD=as.numeric(as.character(CorrelationDF$CorrelationSD)) corrBarPlot=ggplot(CorrelationDF, aes(x=Fraction, y=CorrelationMean, fill=Fraction)) + geom_bar(stat="Identity", position="dodge", width=.5) + geom_errorbar(aes(ymin=CorrelationMean-CorrelationSD, ymax=CorrelationMean+CorrelationSD), width=.2) + labs(y="Mean Correlation between Libraries", title="Correlation between Peak Counts across libraries") + scale_fill_manual(values=c("grey","deepskyblue3","darkviolet")) +theme(axis.text.y = element_text(size=12),axis.title.y=element_text(size=15,face="bold"), axis.title.x=element_text(size=12,face="bold")) ggsave(corrBarPlot, file="../output/plots/QC_plots/BarPlot4PeakCountCorrelation.png")</code></pre> <pre><code>Saving 7 x 5 in image</code></pre> </div> <div id="session-information" class="section level2"> <h2>Session information</h2> <pre class="r"><code>sessionInfo()</code></pre> <pre><code>R version 3.5.1 (2018-07-02) Platform: x86_64-apple-darwin15.6.0 (64-bit) Running under: macOS 10.14.1 Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] bindrcpp_0.2.2 tximport_1.8.0 reshape2_1.4.3 cowplot_0.9.3 [5] workflowr_1.1.1 forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6 [9] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1 tibble_1.4.2 [13] ggplot2_3.0.0 tidyverse_1.2.1 loaded via a namespace (and not attached): [1] tidyselect_0.2.4 haven_1.1.2 lattice_0.20-35 [4] colorspace_1.3-2 htmltools_0.3.6 yaml_2.2.0 [7] rlang_0.2.2 R.oo_1.22.0 pillar_1.3.0 [10] glue_1.3.0 withr_2.1.2 R.utils_2.7.0 [13] modelr_0.1.2 readxl_1.1.0 bindr_0.1.1 [16] plyr_1.8.4 munsell_0.5.0 gtable_0.2.0 [19] cellranger_1.1.0 rvest_0.3.2 R.methodsS3_1.7.1 [22] evaluate_0.11 labeling_0.3 knitr_1.20 [25] broom_0.5.0 Rcpp_0.12.19 scales_1.0.0 [28] backports_1.1.2 jsonlite_1.5 hms_0.4.2 [31] digest_0.6.17 stringi_1.2.4 grid_3.5.1 [34] rprojroot_1.3-2 cli_1.0.1 tools_3.5.1 [37] magrittr_1.5 lazyeval_0.2.1 crayon_1.3.4 [40] whisker_0.3-2 pkgconfig_2.0.2 MASS_7.3-50 [43] xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.0 [46] rmarkdown_1.10 httr_1.3.1 rstudioapi_0.8 [49] R6_2.3.0 nlme_3.1-137 git2r_0.23.0 [52] compiler_3.5.1 </code></pre> </div> <hr> <p> </p> <hr> <!-- To enable disqus, uncomment the section below and provide your disqus_shortname --> <!-- disqus <div id="disqus_thread"></div> <script type="text/javascript"> /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */ var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname /* * * DON'T EDIT BELOW THIS LINE * * */ (function() { var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); })(); </script> <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript> <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a> --> <!-- Adjust MathJax settings so that all math formulae are shown using TeX fonts only; see http://docs.mathjax.org/en/latest/configuration.html. This will make the presentation more consistent at the cost of the webpage sometimes taking slightly longer to load. Note that this only works because the footer is added to webpages before the MathJax javascript. --> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <hr> <p> This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a> analysis was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1 </p> <hr> </div> </div> </div> <script> // add bootstrap table styles to pandoc tables function bootstrapStylePandocTables() { $('tr.header').parent('thead').parent('table').addClass('table table-condensed'); } $(document).ready(function () { bootstrapStylePandocTables(); }); </script> <!-- dynamically load mathjax for compatibility with self-contained --> <script> (function () { var script = document.createElement("script"); script.type = "text/javascript"; script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })(); </script> </body> </html>