<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />

<meta name="date" content="2018-12-11" />

<title>Investigate Peak to Gene Assignment</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/journal.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" />
<script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script>
<script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script>

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Three Prime Sequencing in Human LCLs</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
<li>
  <a href="license.html">License</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/brimittleman/threeprimeseq">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<!-- Add a small amount of space between sections. -->
<style type="text/css">
div.section {
  padding-top: 12px;
}
</style>

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Investigate Peak to Gene Assignment</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>12/11/2018</em></h4>

</div>


<p><strong>Last updated:</strong> 2019-01-15</p>
<strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small>
<ul>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p>
<p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p>
<p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(12345)</code> </summary></p>
<p>The command <code>set.seed(12345)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p>
<p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/brimittleman/threeprimeseq/tree/d26e5485872212953667d4928890c4b6b766ea83" target="_blank">d26e548</a> </summary></p>
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
<pre><code>
Ignored files:
    Ignored:    .DS_Store
    Ignored:    .Rhistory
    Ignored:    .Rproj.user/
    Ignored:    data/.DS_Store
    Ignored:    output/.DS_Store

Untracked files:
    Untracked:  KalistoAbundance18486.txt
    Untracked:  analysis/DirectionapaQTL.Rmd
    Untracked:  analysis/PreAshExplore.Rmd
    Untracked:  analysis/YL_QTL_test.Rmd
    Untracked:  analysis/ncbiRefSeq_sm.sort.mRNA.bed
    Untracked:  analysis/snake.config.notes.Rmd
    Untracked:  analysis/verifyBAM.Rmd
    Untracked:  code/PeaksToCoverPerReads.py
    Untracked:  code/strober_pc_pve_heatmap_func.R
    Untracked:  data/18486.genecov.txt
    Untracked:  data/APApeaksYL.total.inbrain.bed
    Untracked:  data/ChromHmmOverlap/
    Untracked:  data/GM12878.chromHMM.bed
    Untracked:  data/GM12878.chromHMM.txt
    Untracked:  data/LianoglouLCL/
    Untracked:  data/LocusZoom/
    Untracked:  data/NuclearApaQTLs.txt
    Untracked:  data/PeakCounts/
    Untracked:  data/PeaksUsed/
    Untracked:  data/RNAkalisto/
    Untracked:  data/TotalApaQTLs.txt
    Untracked:  data/Totalpeaks_filtered_clean.bed
    Untracked:  data/UnderstandPeaksQC/
    Untracked:  data/YL-SP-18486-T-combined-genecov.txt
    Untracked:  data/YL-SP-18486-T_S9_R1_001-genecov.txt
    Untracked:  data/YL_QTL_test/
    Untracked:  data/apaExamp/
    Untracked:  data/bedgraph_peaks/
    Untracked:  data/bin200.5.T.nuccov.bed
    Untracked:  data/bin200.Anuccov.bed
    Untracked:  data/bin200.nuccov.bed
    Untracked:  data/clean_peaks/
    Untracked:  data/comb_map_stats.csv
    Untracked:  data/comb_map_stats.xlsx
    Untracked:  data/comb_map_stats_39ind.csv
    Untracked:  data/combined_reads_mapped_three_prime_seq.csv
    Untracked:  data/diff_iso_trans/
    Untracked:  data/ensemble_to_genename.txt
    Untracked:  data/example_gene_peakQuant/
    Untracked:  data/explainProtVar/
    Untracked:  data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.bed
    Untracked:  data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.noties.bed
    Untracked:  data/first50lines_closest.txt
    Untracked:  data/gencov.test.csv
    Untracked:  data/gencov.test.txt
    Untracked:  data/gencov_zero.test.csv
    Untracked:  data/gencov_zero.test.txt
    Untracked:  data/gene_cov/
    Untracked:  data/joined
    Untracked:  data/leafcutter/
    Untracked:  data/merged_combined_YL-SP-threeprimeseq.bg
    Untracked:  data/mol_overlap/
    Untracked:  data/mol_pheno/
    Untracked:  data/nom_QTL/
    Untracked:  data/nom_QTL_opp/
    Untracked:  data/nom_QTL_trans/
    Untracked:  data/nuc6up/
    Untracked:  data/nuc_10up/
    Untracked:  data/other_qtls/
    Untracked:  data/pQTL_otherphen/
    Untracked:  data/peakPerRefSeqGene/
    Untracked:  data/perm_QTL/
    Untracked:  data/perm_QTL_opp/
    Untracked:  data/perm_QTL_trans/
    Untracked:  data/perm_QTL_trans_filt/
    Untracked:  data/reads_mapped_three_prime_seq.csv
    Untracked:  data/smash.cov.results.bed
    Untracked:  data/smash.cov.results.csv
    Untracked:  data/smash.cov.results.txt
    Untracked:  data/smash_testregion/
    Untracked:  data/ssFC200.cov.bed
    Untracked:  data/temp.file1
    Untracked:  data/temp.file2
    Untracked:  data/temp.gencov.test.txt
    Untracked:  data/temp.gencov_zero.test.txt
    Untracked:  data/threePrimeSeqMetaData.csv
    Untracked:  output/picard/
    Untracked:  output/plots/
    Untracked:  output/qual.fig2.pdf

Unstaged changes:
    Modified:   analysis/28ind.peak.explore.Rmd
    Modified:   analysis/CompareLianoglouData.Rmd
    Modified:   analysis/NewPeakPostMP.Rmd
    Modified:   analysis/apaQTLoverlapGWAS.Rmd
    Modified:   analysis/cleanupdtseq.internalpriming.Rmd
    Modified:   analysis/coloc_apaQTLs_protQTLs.Rmd
    Modified:   analysis/dif.iso.usage.leafcutter.Rmd
    Modified:   analysis/diff_iso_pipeline.Rmd
    Modified:   analysis/explainpQTLs.Rmd
    Modified:   analysis/explore.filters.Rmd
    Modified:   analysis/flash2mash.Rmd
    Modified:   analysis/overlapMolQTL.Rmd
    Modified:   analysis/overlap_qtls.Rmd
    Modified:   analysis/peakOverlap_oppstrand.Rmd
    Modified:   analysis/pheno.leaf.comb.Rmd
    Modified:   analysis/swarmPlots_QTLs.Rmd
    Modified:   analysis/test.max2.Rmd
    Modified:   analysis/understandPeaks.Rmd
    Modified:   code/Snakefile

</code></pre>
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details>
</li>
</ul>
<details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary>
<ul>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
File
</th>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
<th style="text-align:left;">
Message
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/d26e5485872212953667d4928890c4b6b766ea83/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">d26e548</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-15
</td>
<td style="text-align:left;">
add correlation plots
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/b5a37f3ed662b0f28780079601d389535b2c529c/docs/InvestigatePeak2GeneAssignment.html" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/d64db58222b47c184a4253170ea8b6fdfe262cec/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">d64db58</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
<td style="text-align:left;">
add correlation plots
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/InvestigatePeak2GeneAssignment.html" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/2a6944b21270570229a60a1d83f84ccbcd6644bc/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">2a6944b</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
<td style="text-align:left;">
sum over ind
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/InvestigatePeak2GeneAssignment.html" target="_blank">c3fd2c4</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b198e3bc15dc55987b3fb83c105856b737ec89a3/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">b198e3b</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
<td style="text-align:left;">
4kb around end
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/InvestigatePeak2GeneAssignment.html" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/23123cfc46df693415721f21f253e3604d9f4ace/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">23123cf</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
<td style="text-align:left;">
add only genes not nearby
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/InvestigatePeak2GeneAssignment.html" target="_blank">558e8e8</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/0e0840ef800bbe60330b5cabb91047974abed714/analysis/InvestigatePeak2GeneAssignment.Rmd" target="_blank">0e0840e</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
<td style="text-align:left;">
remove overlaping genes
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/6da90e9c6405976ca4cbeb59c6c07a30365e195f/docs/InvestigatePeak2GeneAssignment.html" target="_blank">6da90e9</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-11
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
</tbody>
</table>
</ul>
<p></details></p>
<hr />
<pre class="r"><code>library(tidyverse)</code></pre>
<pre><code>── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──</code></pre>
<pre><code>✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0</code></pre>
<pre><code>── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()</code></pre>
<pre class="r"><code>library(workflowr)</code></pre>
<pre><code>This is workflowr version 1.1.1
Run ?workflowr for help getting started</code></pre>
<pre class="r"><code>library(cowplot)</code></pre>
<pre><code>
Attaching package: &#39;cowplot&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:ggplot2&#39;:

    ggsave</code></pre>
<pre class="r"><code>library(reshape2)</code></pre>
<pre><code>
Attaching package: &#39;reshape2&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:tidyr&#39;:

    smiths</code></pre>
<pre class="r"><code>library(tximport)</code></pre>
<p>In looking at correlations and some examples, there is evidence the peak to gene assignment may be a problem. I am going to visualize the peaks in IGV. I will name them by the gene and look at them in the browser.</p>
<p>The peak to gene annotations used in the feature counts to map reads back to the peaks is the following:<br />
* /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed</p>
<p>I need to change this a bit to have the name be the gene rather than the score:</p>
<p>NamePeaksByGene.py</p>
<pre class="bash"><code>#python  

CovnamedPeaks=open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed&quot;, &quot;r&quot;)
GeneNamedPeaks=open(&quot;/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/PeaksNamedWithGeneAssignment.bed&quot;, &quot;w&quot;)

for ln in CovnamedPeaks:
  chrom, start, end, num, cov, strand, transcript = ln.split()
  gene=transcript.split(&quot;-&quot;)[1]
  GeneNamedPeaks.write(&quot;%s\t%s\t%s\t%s\n&quot;%(chrom,start,end,gene))

GeneNamedPeaks.close()
</code></pre>
<p>This was made based on the transcript annotation: ncbiRefSeq.mRNA.named.bed</p>
<ul>
<li>/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed</li>
</ul>
<p>The ends of the transcripts specfically are in:</p>
<ul>
<li>/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_endProtCodGenes_sort.txt</li>
</ul>
<p>Ideas for Dilters:</p>
<ul>
<li><p>Cant be upstream of the gene, ex: chr2:135,558,075-135,604,343</p></li>
<li><p>maybe it cant be in another gene</p></li>
<li><p>we should include LINCs</p></li>
<li><p>looks like we have a ton of low expressed intergenic peaks that should be filtered before we do the gene annotation</p></li>
</ul>
<div id="filter-out-intergenic-peaks" class="section level3">
<h3>Filter out intergenic peaks</h3>
<p>As a first pass I want to filter out the peaks that are outside a gene body. While this may not be perfect it will help alot with the intergenic noise.</p>
<p>I need to overlap the named peaks with /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed and only keep the matches. I can use bedtools intersect.</p>
<p>Rename the peaks according to convention to run an intesect.</p>
<p>RenamePeaks4Intersect.py</p>
<pre class="bash"><code>#python  
CovnamedPeaks=open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed.bed&quot;, &quot;r&quot;)
GeneNamedPeaks=open(&quot;/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed&quot;, &quot;w&quot;)  

for ln in CovnamedPeaks:
  chrom, start, end, num, cov, strand, transcript = ln.split()
  gene=transcript.split(&quot;-&quot;)[1]
  start=int(start)
  end=int(end)
  GeneNamedPeaks.write(&quot;%s\t%d\t%d\t%s-%s\t%s\t%s\n&quot;%(chrom,start,end,num,gene,cov,strand))

GeneNamedPeaks.close()
</code></pre>
<p>Remove CHR from the refseq annpotation:</p>
<pre class="bash"><code>sed &#39;s/^chr//&#39; /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named.bed &gt; /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</code></pre>
<p>Filter4GenicPeaks.sh</p>
<pre class="bash"><code>
#!/bin/bash


#SBATCH --job-name=Filter4GenicPeaks
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=Filter4GenicPeaks.out
#SBATCH --error=Filter4GenicPeaks.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed &gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies.bed</code></pre>
<p>This is printing them multiple times.</p>
<pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies.bed &gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.bed</code></pre>
<p>Now I need to make this an SAF to run feature counts.</p>
<p>bed2saf_peaksInGenicReg.py</p>
<pre class="bash"><code>from misc_helper import *

fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.bed&quot;):
    chrom, start, end, name, score, strand = ln.split()
    namenum=name.split(&quot;-&quot;)[0]
    name_i=int(namenum)
    start_i=int(start)
    end_i=int(end)
    gene_only=name.split(&quot;-&quot;)[1]
    ID = &quot;peak%d:%s:%d:%d:%s:%s&quot;%(name_i, chrom, start_i, end_i, strand, gene_only)
    fout.write(&quot;%s\t%s\t%d\t%d\t%s\n&quot;%(ID, chrom, start_i, end_i, strand))
fout.close()</code></pre>
<p>Run Feature Counts<br />
PeaksinGenicRegion_fc_TN.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=PeaksinGenicRegion_fc_TN
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=PeaksinGenicRegion_fc_TN.out
#SBATCH --error=PeaksinGenicRegion_fc_TN.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodiesUNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2
</code></pre>
<p>Lastly I will need to fix the headers.</p>
<p>fix_head_fc_genicPeak_tot.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>fix_head_fc_genicPeak_nuc.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegion_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>Pull these into R and look at the correlation between the sum of the peaks by gene and the transcripts counts from RNA seq.</p>
<p>TPM counts from Kalisto</p>
<pre class="r"><code>tx2gene=read.table(&quot;../data/RNAkalisto/ncbiRefSeq.txn2gene.txt&quot; ,header= F, sep=&quot;\t&quot;, stringsAsFactors = F)

txi.kallisto.tsv &lt;- tximport(&quot;../data/RNAkalisto/abundance.tsv&quot;, type = &quot;kallisto&quot;, tx2gene = tx2gene,countsFromAbundance=&quot;lengthScaledTPM&quot; )</code></pre>
<pre><code>Note: importing `abundance.h5` is typically faster than `abundance.tsv`</code></pre>
<pre><code>reading in files with read_tsv</code></pre>
<pre><code>1 
removing duplicated transcript rows from tx2gene
transcripts missing from tx2gene: 99
summarizing abundance
summarizing counts
summarizing length</code></pre>
<p>In previous analysis I did not account for gene length. Here I am going to standardize by length because I am taking a sum over a gene body.</p>
<p>Import gene lengths:</p>
<pre class="r"><code>geneLengthNames=c(&quot;CHR&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;)
geneLengths=read.table(&quot;../data/UnderstandPeaksQC/refseq.ProteinCoding.bed&quot;, header=F, stringsAsFactors = F, col.names = geneLengthNames) %&gt;% mutate(length=end-start) %&gt;% select(gene, length)</code></pre>
<p>Look at the correlation with the total:</p>
<p>I am using the sum of the counts in a gene divided by how many million reads mapped. I am also filtering out peaks with less than 10 reads in this individual.</p>
<pre class="r"><code>total_Cov_18486=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T) %&gt;% filter(X18486_T&gt;0) %&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_abund=as.data.frame(txi.kallisto.tsv$abundance) %&gt;% rownames_to_column(var=&quot;gene&quot;)
colnames(TXN_abund)=c(&quot;gene&quot;, &quot;TPM&quot;)

TXN_NormGene=TXN_abund %&gt;% inner_join(total_Cov_18486,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene=TXN_NormGene %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Tot=ggplot(TXN_NormGene, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Total&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.48&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-15-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-15-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-15-1.png" target="_blank">558e8e8</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5894 -0.2556  0.0856  0.3676  2.3387 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.407969   0.013563   177.5   &lt;2e-16 ***
log10(GeneSumSt) 0.612175   0.005812   105.3   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.598 on 12053 degrees of freedom
Multiple R-squared:  0.4793,    Adjusted R-squared:  0.4793 
F-statistic: 1.11e+04 on 1 and 12053 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene$TPM),log10(TXN_NormGene$GeneSumSt))</code></pre>
<pre><code>
    Pearson&#39;s product-moment correlation

data:  log10(TXN_NormGene$TPM) and log10(TXN_NormGene$GeneSumSt)
t = 105.34, df = 12053, p-value &lt; 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6829262 0.7015184
sample estimates:
      cor 
0.6923372 </code></pre>
<p>Try this with nuclear</p>
<pre class="r"><code>nuclear_Cov_18486=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Genic.Nuclear_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_N) %&gt;% filter(X18486_N&gt;10) %&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_N)) %&gt;% mutate(GeneSumNorm=GeneSum/11.4) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_nuc=TXN_abund %&gt;% inner_join(nuclear_Cov_18486,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_nuc=TXN_NormGene_nuc %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Nuc=ggplot(TXN_NormGene_nuc, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Nuclear&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.37&quot;) + geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Nuc    </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-18-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-18-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/558e8e88cd84b2803b254b0f60f9c72b2fc1a211/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-18-1.png" target="_blank">558e8e8</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_nuc)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_nuc)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7211 -0.2691  0.0733  0.3789  2.5253 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.451150   0.017039  143.85   &lt;2e-16 ***
log10(GeneSumSt) 0.654587   0.008008   81.74   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.6324 on 11567 degrees of freedom
Multiple R-squared:  0.3661,    Adjusted R-squared:  0.3661 
F-statistic:  6681 on 1 and 11567 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_nuc$TPM),log10(TXN_NormGene_nuc$GeneSumSt))</code></pre>
<pre><code>
    Pearson&#39;s product-moment correlation

data:  log10(TXN_NormGene_nuc$TPM) and log10(TXN_NormGene_nuc$GeneSumSt)
t = 81.74, df = 11567, p-value &lt; 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.593414 0.616518
sample estimates:
      cor 
0.6050934 </code></pre>
<p>This just said it had to be in a gene body not the specific gene body. This could be a problem still. For example in the SSPO locus chr7:149,521,993-149,543,749. Here the peaks are closer to the end of the SSPO but are in the gene body of the next gene downstream.</p>
<p>Histones dont have a polyA tail- the HIST1H4C peak is most likely misprimming (chr6:26,102,306-26,110,443)</p>
<p>Filter out overlapping genes:</p>
<p>Count overlaps in origial file:<br />
<code>bedtools merge -i IN.bed -c 1 -o count &gt; counted</code></p>
<p>countGeneOverlap.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=countGeneOverlap
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=countGeneOverlap.out
#SBATCH --error=countGeneOverlap.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools merge -i /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -c 1 -o count &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.bed  </code></pre>
<p>Filter out these rows: <code>awk '/\t1$/{print}' counted &gt; filtered</code></p>
<pre class="bash"><code>awk &#39;/\t1$/{print}&#39; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.bed &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.filtered.bed</code></pre>
<p>Intersect with original input to only keep the ones in both sets.<br />
<code>bedtools intersect -a IN.bed -b filtered -wa &gt; OUT.bed</code></p>
<p>findGeneswithoutOverlap.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=findGeneswithoutOverlap.sh
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=findGeneswithoutOverlap.out
#SBATCH --error=findGeneswithoutOverlap.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

bedtools intersect -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap.filtered.bed -wa &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes.bed 
</code></pre>
<p>Finally overlap with the mRNA file to only keep the transcripts in these genes. This may be easiest in python /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</p>
<p>subsetmRNAforNonOverlapGenes.py</p>
<pre class="bash"><code>geneFile=open(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes.bed&quot;, &quot;r&quot;) 

mRNAFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed&quot;, &quot;r&quot;)

outFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes.bed&quot;, &quot;w&quot;)  

#make list of non overlapping genes  

keep=[]
for ln in geneFile:
  keep.append(ln.split()[3])

for ln in mRNAFile: 
  if ln.split()[4] in keep:
    outFile.write(ln)
outFile.close()
</code></pre>
<p>Filter peaks on this resutls</p>
<p>Filter4GenicPeaks_noOverlap.sh</p>
<pre class="bash"><code>
#!/bin/bash


#SBATCH --job-name=Filter4GenicPeaks
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=Filter4GenicPeaks.out
#SBATCH --error=Filter4GenicPeaks.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes.bed&gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap.bed</code></pre>
<p>This is printing them multiple times.</p>
<pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap.bed &gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.bed</code></pre>
<p>Make this an SAF to run FC</p>
<p>bed2saf_peaksInGenicReg_noOVERLAP.py</p>
<pre class="bash"><code>from misc_helper import *

fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.bed&quot;):
    chrom, start, end, name, score, strand = ln.split()
    namenum=name.split(&quot;-&quot;)[0]
    name_i=int(namenum)
    start_i=int(start)
    end_i=int(end)
    gene_only=name.split(&quot;-&quot;)[1]
    ID = &quot;peak%d:%s:%d:%d:%s:%s&quot;%(name_i, chrom, start_i, end_i, strand, gene_only)
    fout.write(&quot;%s\t%s\t%d\t%d\t%s\n&quot;%(ID, chrom, start_i, end_i, strand))
fout.close()</code></pre>
<p>Run Feature Counts<br />
PeaksinGenicRegion_NoneOverlapGenes_fc_TN.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=PeaksinGenicRegion_NoneOverlapGenes_fc_TN
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=PeaksinGenicRegion_NoneOverlapGenes_fc_TN.out
#SBATCH --error=PeaksinGenicRegion_NoneOverlapGenes_fc_TN.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlap_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2
</code></pre>
<p>Lastly I will need to fix the headers.</p>
<p>fix_head_fc_genicPeakNoOverlap_tot.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>fix_head_fc_genicPeakNoOverlap_nuc.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlap_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()
</code></pre>
<p>Pull these onto my computer:</p>
<p>Use no filter and standardization scheme:</p>
<pre class="r"><code>total_Cov_18486_noOver=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T)%&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlap=TXN_abund %&gt;% inner_join(total_Cov_18486_noOver,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlap=TXN_NormGene_noOverlap %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Tot_noOver=ggplot(TXN_NormGene_noOverlap, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Total&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.49&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) + geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot_noOver       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-31-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-31-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-31-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.3953 -0.2604  0.0768  0.3638  2.8920 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.435113   0.014849  163.99   &lt;2e-16 ***
log10(GeneSumSt) 0.616937   0.006274   98.34   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.5973 on 10088 degrees of freedom
Multiple R-squared:  0.4894,    Adjusted R-squared:  0.4894 
F-statistic:  9671 on 1 and 10088 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap$TPM),log10(TXN_NormGene_noOverlap$GeneSumSt))</code></pre>
<pre><code>
    Pearson&#39;s product-moment correlation

data:  log10(TXN_NormGene_noOverlap$TPM) and log10(TXN_NormGene_noOverlap$GeneSumSt)
t = 98.339, df = 10088, p-value &lt; 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6894966 0.7094251
sample estimates:
      cor 
0.6995969 </code></pre>
<pre class="r"><code>nuclear_Cov_18486_noOver=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Nuclear_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_N) %&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_N)) %&gt;% mutate(GeneSumNorm=GeneSum/11.4) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlap_nuc=TXN_abund %&gt;% inner_join(nuclear_Cov_18486_noOver,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlap_nuc=TXN_NormGene_noOverlap_nuc %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Nuc_noOver=ggplot(TXN_NormGene_noOverlap_nuc, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Nuclear&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.5&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Nuc_noOver       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-34-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-34-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-34-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_nuc)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_nuc)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.6947 -0.2989  0.0630  0.3825  2.8300 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.530468   0.016063   157.5   &lt;2e-16 ***
log10(GeneSumSt) 0.708816   0.006819   103.9   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.647 on 10965 degrees of freedom
Multiple R-squared:  0.4963,    Adjusted R-squared:  0.4963 
F-statistic: 1.08e+04 on 1 and 10965 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_nuc$TPM),log10(TXN_NormGene_noOverlap_nuc$GeneSumSt),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_nuc$TPM),
log10(TXN_NormGene_noOverlap_nuc$GeneSumSt), : Cannot compute exact p-value
with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_nuc$TPM) and log10(TXN_NormGene_noOverlap_nuc$GeneSumSt)
S = 6.8486e+10, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6884764 </code></pre>
<p>It looks like a more stringent RNA seq filter could help. Lets say log(TPM)&gt;-1.5</p>
<pre class="r"><code>TXN_abund_filt=TXN_abund %&gt;% filter(log(TPM) &gt; -1.5)
TXN_NormGene_noOverlap_filt=TXN_abund_filt %&gt;% inner_join(total_Cov_18486_noOver,by=&quot;gene&quot;)
TXN_NormGene_noOverlap_filt=TXN_NormGene_noOverlap_filt %&gt;% filter(TPM &gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Tot_noOver_filt=ggplot(TXN_NormGene_noOverlap_filt, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Total filtered&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.49&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot_noOver_filt      </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-35-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-35-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-35-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_filt)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_filt)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.05828 -0.26269  0.03857  0.31263  2.64091 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.344999   0.012950  181.08   &lt;2e-16 ***
log10(GeneSumSt) 0.544908   0.005639   96.63   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.4998 on 9677 degrees of freedom
Multiple R-squared:  0.4911,    Adjusted R-squared:  0.491 
F-statistic:  9338 on 1 and 9677 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_filt$TPM),log10(TXN_NormGene_noOverlap_filt$GeneSumSt),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_filt$TPM),
log10(TXN_NormGene_noOverlap_filt$GeneSumSt), : Cannot compute exact p-
value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_filt$TPM) and log10(TXN_NormGene_noOverlap_filt$GeneSumSt)
S = 4.7005e+10, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6889706 </code></pre>
<p>This does not take care of genes that are near each other such as what is going on with SSPO</p>
</div>
<div id="filter-genes-that-are-too-close-together" class="section level2">
<h2>Filter genes that are too close together</h2>
<p>This is similar to the overlap problem but I want to extend the genes.</p>
<p>I can a python script to subtract 10000 bases from the start and add 10000 to the end</p>
<p>*/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed</p>
<p>extendGenes.py</p>
<pre class="bash"><code>inFile=open(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed&quot;, &quot;r&quot;)

outFile=open(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR_EXT.bed&quot;, &quot;w&quot;)\


for ln in inFile:
  chrom, start, end, gene, score, strand = ln.split()
  start_ex=int(start)-10000
  if start_ex &lt;0: 
    start_ex=0
  end_ex=int(end)+10000
  outFile.write(&quot;%s\t%d\t%d\t%s\t%s\t%s\n&quot;%(chrom, start_ex, end_ex, gene, score, strand))
  
outFile.close()</code></pre>
<p>Now I can run the merge:</p>
<p>countGeneOverlap_EXT.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=countGeneOverlap_EXT
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=countGeneOverlap_EXT.out
#SBATCH --error=countGeneOverlap_EXT.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools merge -i /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR_EXT.bed -c 1 -o count &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT.bed  </code></pre>
<p>Filter out these rows: <code>awk '/\t1$/{print}' counted &gt; filtered</code></p>
<pre class="bash"><code>awk &#39;/\t1$/{print}&#39; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT.bed &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT_filter.bed</code></pre>
<p>Intersect with original input to only keep the ones in both sets.<br />
<code>bedtools intersect -a IN.bed -b filtered -wa &gt; OUT.bed</code></p>
<p>findGeneswithoutOverlap_EXT.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=findGeneswithoutOverlap_EXT.sh
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=findGeneswithoutOverlap_EXT.out
#SBATCH --error=findGeneswithoutOverlap_EXT.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

bedtools intersect -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.countGeneOverlap_EXT_filter.bed -wa &gt; /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes_EXT.bed 
</code></pre>
<p>Finally overlap with the mRNA file to only keep the transcripts in these genes. This may be easiest in python /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed</p>
<p>subsetmRNAforNonOverlapGenes_EXT.py</p>
<pre class="bash"><code>geneFile=open(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.NonOverlapGenes_EXT.bed&quot;, &quot;r&quot;) 

mRNAFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR.bed&quot;, &quot;r&quot;)

outFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes_EXT.bed&quot;, &quot;w&quot;)  

#make list of non overlapping genes  

keep=[]
for ln in geneFile:
  keep.append(ln.split()[3])

for ln in mRNAFile: 
  if ln.split()[4] in keep:
    outFile.write(ln)
outFile.close()
</code></pre>
<p>Filter4GenicPeaks_noOverlap_EXT.sh</p>
<pre class="bash"><code>
#!/bin/bash


#SBATCH --job-name=Filter4GenicPeaks_EXT
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=Filter4GenicPeaks_EXT.out
#SBATCH --error=Filter4GenicPeaks_EXT.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools intersect -wa -s -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq.mRNA.named_noCHR_NoneOverlapingGenes_EXT.bed &gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT.bed</code></pre>
<p>This is printing them multiple times.</p>
<pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT.bed &gt; /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.bed</code></pre>
<p>Make this an SAF to run FC</p>
<p>bed2saf_peaksInGenicReg_noOVERLAPEXT.py</p>
<pre class="bash"><code>from misc_helper import *

fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.bed&quot;):
    chrom, start, end, name, score, strand = ln.split()
    namenum=name.split(&quot;-&quot;)[0]
    name_i=int(namenum)
    start_i=int(start)
    end_i=int(end)
    gene_only=name.split(&quot;-&quot;)[1]
    ID = &quot;peak%d:%s:%d:%d:%s:%s&quot;%(name_i, chrom, start_i, end_i, strand, gene_only)
    fout.write(&quot;%s\t%s\t%d\t%d\t%s\n&quot;%(ID, chrom, start_i, end_i, strand))
fout.close()</code></pre>
<p>Run Feature Counts<br />
PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.out
#SBATCH --error=PeaksinGenicRegion_NoneOverlapGenesEXT_fc_TN.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb_inGeneBody/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_inGeneBodies_noGeneOverlapEXT_UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2
</code></pre>
<p>Lastly I will need to fix the headers.</p>
<p>fix_head_fc_genicPeakNoOverlapEXT_tot.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>fix_head_fc_genicPeakNoOverlapEXT_nuc.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/PeakInGenecRegionNoOverlapEXT_cov/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()
</code></pre>
<p>Pull these onto my computer:</p>
<p>Use no filter and standardization scheme:</p>
<pre class="r"><code>total_Cov_18486_noOverEXT=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T)%&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlapEXT=TXN_abund %&gt;% inner_join(total_Cov_18486_noOverEXT,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlapEXT_n=TXN_NormGene_noOverlapEXT %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumNorm&gt;0)
corr_18486Tot_noOverEXT=ggplot(TXN_NormGene_noOverlapEXT_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title=&quot;Total&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.42&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;)

#+geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot_noOverEXT       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-49-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">c3fd2c4</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-49-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlapEXT_n)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlapEXT_n)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.6242 -0.2842  0.0307  0.3445  2.9033 

Coefficients:
                   Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)         0.18874    0.02195   8.598   &lt;2e-16 ***
log10(GeneSumNorm)  0.66519    0.01540  43.200   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.6428 on 2542 degrees of freedom
Multiple R-squared:  0.4234,    Adjusted R-squared:  0.4231 
F-statistic:  1866 on 1 and 2542 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlapEXT_n$TPM),log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlapEXT_n$TPM),
log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm), : Cannot compute exact p-
value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlapEXT_n$TPM) and log10(TXN_NormGene_noOverlapEXT_n$GeneSumNorm)
S = 880650000, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6790751 </code></pre>
<p>In this test, I was looking at 2600 genes.</p>
<pre class="r"><code>nuclear_Cov_18486_noOverEXT=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_N) %&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_N)) %&gt;% mutate(GeneSumNorm=GeneSum/11.4) </code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlap_nucEXT=TXN_abund %&gt;% inner_join(nuclear_Cov_18486_noOverEXT,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlap_nucEXT_n=TXN_NormGene_noOverlap_nucEXT %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumNorm&gt;0)
corr_18486Nuc_noOverEXT=ggplot(TXN_NormGene_noOverlap_nucEXT_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title=&quot;Nuclear&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.43&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Nuc_noOverEXT       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-52-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">c3fd2c4</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-52-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlap_nucEXT_n)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlap_nucEXT_n)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.4278 -0.3689  0.0112  0.3873  3.2887 

Coefficients:
                   Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)         0.02478    0.02179   1.137    0.256    
log10(GeneSumNorm)  0.68406    0.01470  46.527   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.7076 on 2836 degrees of freedom
Multiple R-squared:  0.4329,    Adjusted R-squared:  0.4327 
F-statistic:  2165 on 1 and 2836 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_nucEXT_n$TPM),log10(TXN_NormGene_noOverlap_nucEXT_n$GeneSumNorm),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_nucEXT_n$TPM), :
Cannot compute exact p-value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_nucEXT_n$TPM) and log10(TXN_NormGene_noOverlap_nucEXT_n$GeneSumNorm)
S = 1351600000, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6452286 </code></pre>
<p>In this test, I was looking at 2900 genes.</p>
<p>I can further ssubset for genes with only 1 peak.</p>
<pre class="r"><code>OnePeakGenesTotalnoOver=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T)%&gt;%  group_by(gene) %&gt;% tally() %&gt;% filter(n&gt;=3)

total_Cov_18486_noOverEXT_1Peak=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T)%&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18486_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length) %&gt;% semi_join(OnePeakGenesTotalnoOver, by=&quot;gene&quot;)</code></pre>
<pre class="r"><code>TXN_NormGene_noOverlapEXT_1peak=TXN_abund %&gt;% inner_join(total_Cov_18486_noOverEXT_1Peak,by=&quot;gene&quot;)</code></pre>
<pre class="r"><code>TXN_NormGene_noOverlapEXT_1peak_n=TXN_NormGene_noOverlapEXT_1peak %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumNorm&gt;0)
corr_18486Tot_noOverEXT=ggplot(TXN_NormGene_noOverlapEXT_1peak_n, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title=&quot;Total-nonOverlapping Genes with 3 peaks&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.4&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot_noOverEXT       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-55-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbf986cbdc4f24e92c4c8be50f550dcf9257c7b5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-55-1.png" target="_blank">cbf986c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-12
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_NormGene_noOverlapEXT_1peak_n)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_NormGene_noOverlapEXT_1peak_n)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.2185 -0.2785  0.0258  0.3296  2.7657 

Coefficients:
                   Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)         0.27643    0.02297   12.03   &lt;2e-16 ***
log10(GeneSumNorm)  0.61688    0.01570   39.30   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.5918 on 2366 degrees of freedom
Multiple R-squared:  0.395, Adjusted R-squared:  0.3947 
F-statistic:  1544 on 1 and 2366 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM),log10(TXN_NormGene_noOverlapEXT_1peak_n$GeneSumNorm),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM), :
Cannot compute exact p-value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlapEXT_1peak_n$TPM) and log10(TXN_NormGene_noOverlapEXT_1peak_n$GeneSumNorm)
S = 752880000, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6598005 </code></pre>
</div>
<div id="around-gene-end" class="section level2">
<h2>Around gene end</h2>
<p>There are still problems with peaks in the downstream gene. I need to have a filter that a peak needs to be within a certain distance from the end of a gene. I can change the end of gene file to have 2kb around each gene end and then work with peaks in this area. I want it to be 2kb into the gene.</p>
<p>I can filter the peaks to only those in these regions.</p>
<p>EndOfGenes_4kbaround.py</p>
<pre class="bash"><code>#python  

inFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_endProtCodGenes_sort_withscore.bed&quot;, &quot;r&quot;)

outFile=open(&quot;/project2/gilad/briana/genome_anotation_data/ncbiRefSeq_4KBaroundEnd_sort_withscore.bed&quot;, &quot;w&quot;)\


for ln in inFile:
  chrom, start, end, gene, score, strand = ln.split()
  start_ex=int(start)-2000
  if start_ex &lt;0: 
    start_ex=0
  end_ex=int(end)+2000
  outFile.write(&quot;%s\t%d\t%d\t%s\t%s\t%s\n&quot;%(chrom, start_ex, end_ex, gene, score, strand))
  
outFile.close()</code></pre>
<p>Now I intersect this with the peaks file.</p>
<p>FilterPeaks4KBaroundend.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=FilterPeaks4KBaroundend
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=FilterPeaks4KBaroundend.out
#SBATCH --error=FilterPeaks4KBaroundend.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


bedtools intersect -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED.bed -b /project2/gilad/briana/genome_anotation_data/ncbiRefSeq_4KBaroundEnd_sort_withscore.bed -wa -s  &gt; /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.bed</code></pre>
<p>This is printing them multiple times.</p>
<pre class="bash"><code>uniq /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.bed &gt; /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.bed</code></pre>
<p>Make this an SAF to run FC</p>
<p>bed2saf_peaksInGenicReg_4kbaround.py</p>
<pre class="bash"><code>from misc_helper import *

fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.bed&quot;):
    chrom, start, end, name, score, strand = ln.split()
    namenum=name.split(&quot;-&quot;)[0]
    name_i=int(namenum)
    start_i=int(start)
    end_i=int(end)
    gene_only=name.split(&quot;-&quot;)[1]
    ID = &quot;peak%d:%s:%d:%d:%s:%s&quot;%(name_i, chrom, start_i, end_i, strand, gene_only)
    fout.write(&quot;%s\t%s\t%d\t%d\t%s\n&quot;%(ID, chrom, start_i, end_i, strand))
fout.close()</code></pre>
<p>Run feature counts with this.<br />
Peaks4kbAround_fc_TN.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=Peaks4kbAround_fc_TN
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=Peaks4kbAround_fc_TN.out
#SBATCH --error=Peaks4kbAround_fc_TN.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total.fc /project2/gilad/briana/threeprimeseq/data/sort/*-T-*-sort.bam -s 2

featureCounts -O -a //project2/gilad/briana/threeprimeseq/data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqTrans.noties_sm.fixed_RENAMED_4kbaroundEnd.UNIQ.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear.fc /project2/gilad/briana/threeprimeseq/data/sort/*-N-*-sort.bam -s 2
</code></pre>
<p>fix_head_fc_Peaks4kbAround_tot.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>fix_head_fc_gPeaks4kbAround_nuc.py</p>
<pre class="bash"><code>infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/Peaks4kbAroundGeneEnd/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=i_list[:6]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()
</code></pre>
<p>Use no filter and standardization scheme:</p>
<pre class="r"><code>total_Cov_18486_4kb=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_T)%&gt;%  group_by(gene) %&gt;% filter(X18486_T&gt;10) %&gt;% summarize(GeneSum=sum(X18486_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_4kb=TXN_abund %&gt;% inner_join(total_Cov_18486_4kb,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_4kb_n=TXN_NormGene_4kb %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Tot_noOver4kb=ggplot(TXN_NormGene_4kb_n, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Total&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.34&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) +geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Tot_noOver4kb       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-65-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-65-1.png" target="_blank">c3fd2c4</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_4kb_n)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_4kb_n)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.07128 -0.26438  0.05285  0.31624  2.23108 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.200525   0.013284  165.65   &lt;2e-16 ***
log10(GeneSumSt) 0.409498   0.005733   71.43   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.5116 on 9738 degrees of freedom
Multiple R-squared:  0.3438,    Adjusted R-squared:  0.3437 
F-statistic:  5102 on 1 and 9738 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_4kb_n$TPM),log10(TXN_NormGene_4kb_n$GeneSumSt),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_4kb_n$TPM),
log10(TXN_NormGene_4kb_n$GeneSumSt), : Cannot compute exact p-value with
ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_4kb_n$TPM) and log10(TXN_NormGene_4kb_n$GeneSumSt)
S = 6.5566e+10, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5742507 </code></pre>
<pre class="r"><code>nuclear_Cov_18486_4kb=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_Peaks4kbAround.Nuclear_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:7] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18486_N) %&gt;%  group_by(gene) %&gt;%filter(X18486_N&gt;10) %&gt;%  summarize(GeneSum=sum(X18486_N)) %&gt;% mutate(GeneSumNorm=GeneSum/11.4) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlap_4kb=TXN_abund %&gt;% inner_join(nuclear_Cov_18486_4kb,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlap_4kb_n=TXN_NormGene_noOverlap_4kb %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18486Nuc_4kb=ggplot(TXN_NormGene_noOverlap_4kb_n, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Nuclear&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.26&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) + geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18486Nuc_4kb       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-68-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/c3fd2c47561749e66251dd7c462d1fd3c9a8a8a5/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-68-1.png" target="_blank">c3fd2c4</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_4kb_n)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_4kb_n)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.3104 -0.3004  0.0465  0.3451  2.2798 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.203466   0.016119  136.70   &lt;2e-16 ***
log10(GeneSumSt) 0.402256   0.006733   59.74   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.5572 on 10000 degrees of freedom
Multiple R-squared:  0.263, Adjusted R-squared:  0.2629 
F-statistic:  3569 on 1 and 10000 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_4kb_n$TPM),log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_4kb_n$TPM),
log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt), : Cannot compute exact p-
value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_4kb_n$TPM) and log10(TXN_NormGene_noOverlap_4kb_n$GeneSumSt)
S = 8.3048e+10, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
     rho 
0.502012 </code></pre>
<p>From here I can subset down to just the genes in the nonoverlapping set.</p>
<pre class="r"><code>TXN_NormGene_noOverlap_4kb_noOver=TXN_NormGene_noOverlap_4kb_n %&gt;% semi_join(TXN_NormGene_noOverlapEXT, by=&quot;gene&quot;)


corr_18486Nuc_4kb_noOVer=ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Nuclear&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.27&quot;)


#+geom_text(aes(label=gene),hjust=0, vjust=0)+geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;)
       
corr_18486Nuc_4kb_noOVer </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-69-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-69-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-69-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_4kb_noOver))</code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_4kb_noOver)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.61075 -0.31776  0.04514  0.34370  2.19168 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)       2.27850    0.04047   56.30   &lt;2e-16 ***
log10(GeneSumSt)  0.41540    0.01509   27.53   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.5504 on 2006 degrees of freedom
Multiple R-squared:  0.2742,    Adjusted R-squared:  0.2739 
F-statistic: 757.9 on 1 and 2006 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_4kb_noOver$TPM),log10(TXN_NormGene_noOverlap_4kb_noOver$GeneSumSt) ,method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_4kb_noOver$TPM), :
Cannot compute exact p-value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_4kb_noOver$TPM) and log10(TXN_NormGene_noOverlap_4kb_noOver$GeneSumSt)
S = 664430000, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5076096 </code></pre>
<p>Look at curves for the TPM and standards.</p>
<pre class="r"><code> ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(GeneSumSt))) + geom_density()</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-70-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>ggplot(TXN_NormGene_noOverlap_4kb_noOver, aes(x=log10(TPM))) + geom_density()</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-2.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-70-2.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-70-2.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
</div>
<div id="separate-by-percentile-gene-expression" class="section level2">
<h2>Separate by percentile gene expression</h2>
</div>
<div id="check-another-line" class="section level2">
<h2>Check another line</h2>
<pre class="r"><code>total_Cov_18497_noOver=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlap.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,1:8] %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) %&gt;% select(gene, X18497_T)%&gt;%  group_by(gene) %&gt;% summarize(GeneSum=sum(X18497_T)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) %&gt;% inner_join(geneLengths, by=&quot;gene&quot;) %&gt;% mutate(GeneSumSt=GeneSum/length)</code></pre>
<p>Join the data frames.</p>
<pre class="r"><code>TXN_NormGene_noOverlap_497=TXN_abund %&gt;% inner_join(total_Cov_18497_noOver,by=&quot;gene&quot;)</code></pre>
<p>Remove rows with 0 counts and Plot:</p>
<pre class="r"><code>TXN_NormGene_noOverlap_497=TXN_NormGene_noOverlap_497 %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumSt&gt;0)
corr_18497Tot_noOver=ggplot(TXN_NormGene_noOverlap_497, aes(x=log10(TPM), y= log10(GeneSumSt))) + geom_point() + labs(title=&quot;Total&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumSt)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=3, y=5,label=&quot;R2=.49&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_18497Tot_noOver       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-73-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-73-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-73-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumSt),TXN_NormGene_noOverlap_497)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumSt), data = TXN_NormGene_noOverlap_497)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5597 -0.2929  0.0903  0.3993  2.8953 

Coefficients:
                 Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)      2.413566   0.015014   160.8   &lt;2e-16 ***
log10(GeneSumSt) 0.650761   0.006389   101.9   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.6346 on 10739 degrees of freedom
Multiple R-squared:  0.4914,    Adjusted R-squared:  0.4913 
F-statistic: 1.037e+04 on 1 and 10739 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_NormGene_noOverlap_497$TPM),log10(TXN_NormGene_noOverlap_497$GeneSumSt),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_NormGene_noOverlap_497$TPM),
log10(TXN_NormGene_noOverlap_497$GeneSumSt), : Cannot compute exact p-value
with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_NormGene_noOverlap_497$TPM) and log10(TXN_NormGene_noOverlap_497$GeneSumSt)
S = 6.3783e+10, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6911702 </code></pre>
<p>This shows me the effect is not line specific because the correlation is the same when i use the RNA from one line and 3’ from another.</p>
</div>
<div id="sum-over-all-individals" class="section level2">
<h2>Sum over all individals</h2>
<p>First I will get the total counts for the non overlapping set:</p>
<pre class="r"><code>TotCounts_nonOverlapEX=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[,7:45]


SumCounts_Tot=rowSums(TotCounts_nonOverlapEX)

Alllib_Tot_nonOverlapEX=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F) %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) 
 
Alllib_Tot_nonOverlapEX$SumCounts=SumCounts_Tot

Alllib_Tot_nonOverlapEX_bygene=Alllib_Tot_nonOverlapEX %&gt;% select(gene, SumCounts) %&gt;%  group_by(gene)  %&gt;%  summarize(GeneSum=sum(SumCounts)) %&gt;% mutate(GeneSumNorm=GeneSum/11.4) 


TXN_abund_combLibs_tot=TXN_abund %&gt;% inner_join(Alllib_Tot_nonOverlapEX_bygene,by=&quot;gene&quot;)


TXN_abund_combLibs_tot_n0=TXN_abund_combLibs_tot %&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumNorm&gt;0)
corr_AllLibTot_noOver=ggplot(TXN_abund_combLibs_tot_n0, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title=&quot;Total All ind&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene All Ind.&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=-2, y=5,label=&quot;R2=.55&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;)

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_AllLibTot_noOver       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-74-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-74-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_abund_combLibs_tot_n0)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_abund_combLibs_tot_n0)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.2653 -0.3200  0.0183  0.3435  3.4494 

Coefficients:
                   Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)        -1.34124    0.03578  -37.48   &lt;2e-16 ***
log10(GeneSumNorm)  0.78488    0.01279   61.38   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.6682 on 3079 degrees of freedom
Multiple R-squared:  0.5503,    Adjusted R-squared:  0.5501 
F-statistic:  3767 on 1 and 3079 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_abund_combLibs_tot_n0$TPM),log10(TXN_abund_combLibs_tot_n0$GeneSumNorm),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_abund_combLibs_tot_n0$TPM),
log10(TXN_abund_combLibs_tot_n0$GeneSumNorm), : Cannot compute exact p-
value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_abund_combLibs_tot_n0$TPM) and log10(TXN_abund_combLibs_tot_n0$GeneSumNorm)
S = 1.211e+09, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.7515536 </code></pre>
<p>Same for nuclear</p>
<pre class="r"><code>NucCounts_nonOverlapEX=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc&quot;, header=T, stringsAsFactors = F)[,7:45]


SumCounts_Nuc=rowSums(NucCounts_nonOverlapEX)

Alllib_Nuc_nonOverlapEX=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Nuclear_Fixed.fc&quot;, header=T, stringsAsFactors = F) %&gt;% separate(Geneid, into=c(&quot;peak&quot;, &quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;strand&quot;, &quot;gene&quot;), sep=&quot;:&quot;) 
 
Alllib_Nuc_nonOverlapEX$SumCounts=SumCounts_Nuc

Alllib_Nuc_nonOverlapEX_bygene=Alllib_Nuc_nonOverlapEX %&gt;% select(gene, SumCounts) %&gt;%  group_by(gene)  %&gt;%  summarize(GeneSum=sum(SumCounts)) %&gt;% mutate(GeneSumNorm=GeneSum/10.8) 


TXN_abund_combLibs_Nuc=TXN_abund %&gt;% inner_join(Alllib_Nuc_nonOverlapEX_bygene,by=&quot;gene&quot;)


TXN_abund_combLibs_Nuc_n0=TXN_abund_combLibs_Nuc%&gt;% filter(TPM&gt;0) %&gt;% filter(GeneSumNorm&gt;0)
corr_AllLibNuc_noOver=ggplot(TXN_abund_combLibs_Nuc_n0, aes(x=log10(TPM), y= log10(GeneSumNorm))) + geom_point() + labs(title=&quot;Nuclear All ind&quot;, x=&quot;log10 RNA seq TPM&quot;, y=&quot;log10 Peak count sum per gene All Ind.&quot;)+ geom_smooth(aes(x=log10(TPM),y=log10(GeneSumNorm)),method = &quot;lm&quot;) + annotate(&quot;text&quot;,x=-2, y=5,label=&quot;R2=.48&quot;) +geom_density2d(na.rm = TRUE, size = 1, colour = &#39;red&#39;) 

#+ geom_text(aes(label=gene),hjust=0, vjust=0)
       
corr_AllLibNuc_noOver       </code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-75-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-75-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-75-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(log10(TPM)~log10(GeneSumNorm),TXN_abund_combLibs_Nuc_n0)) </code></pre>
<pre><code>
Call:
lm(formula = log10(TPM) ~ log10(GeneSumNorm), data = TXN_abund_combLibs_Nuc_n0)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.3980 -0.3912  0.0130  0.4109  3.3698 

Coefficients:
                   Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)        -1.40221    0.04238  -33.08   &lt;2e-16 ***
log10(GeneSumNorm)  0.75348    0.01427   52.78   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 0.7219 on 3079 degrees of freedom
Multiple R-squared:  0.475, Adjusted R-squared:  0.4749 
F-statistic:  2786 on 1 and 3079 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>cor.test(log10(TXN_abund_combLibs_Nuc_n0$TPM),log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm),method=&quot;spearman&quot;)</code></pre>
<pre><code>Warning in cor.test.default(log10(TXN_abund_combLibs_Nuc_n0$TPM),
log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm), : Cannot compute exact p-
value with ties</code></pre>
<pre><code>
    Spearman&#39;s rank correlation rho

data:  log10(TXN_abund_combLibs_Nuc_n0$TPM) and log10(TXN_abund_combLibs_Nuc_n0$GeneSumNorm)
S = 1587500000, p-value &lt; 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6743196 </code></pre>
</div>
<div id="compare-2-3-seq." class="section level2">
<h2>Compare 2 3’ Seq.</h2>
<pre class="r"><code>TotCounts_nonOverlapEX=read.table(&quot;../data/UnderstandPeaksQC/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_GenicNoOverlapEXT.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)

summary(lm(TotCounts_nonOverlapEX$X18486_T~TotCounts_nonOverlapEX$X19238_T))</code></pre>
<pre><code>
Call:
lm(formula = TotCounts_nonOverlapEX$X18486_T ~ TotCounts_nonOverlapEX$X19238_T)

Residuals:
     Min       1Q   Median       3Q      Max 
-10543.0     -2.6      1.3      1.3  11904.2 

Coefficients:
                                  Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)                     -1.2603937  0.3532729  -3.568  0.00036 ***
TotCounts_nonOverlapEX$X19238_T  0.7807420  0.0009045 863.135  &lt; 2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 101.2 on 82478 degrees of freedom
Multiple R-squared:  0.9003,    Adjusted R-squared:  0.9003 
F-statistic: 7.45e+05 on 1 and 82478 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>summary(lm(TotCounts_nonOverlapEX$X18486_T~TotCounts_nonOverlapEX$X19238_T))</code></pre>
<pre><code>
Call:
lm(formula = TotCounts_nonOverlapEX$X18486_T ~ TotCounts_nonOverlapEX$X19238_T)

Residuals:
     Min       1Q   Median       3Q      Max 
-10543.0     -2.6      1.3      1.3  11904.2 

Coefficients:
                                  Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)                     -1.2603937  0.3532729  -3.568  0.00036 ***
TotCounts_nonOverlapEX$X19238_T  0.7807420  0.0009045 863.135  &lt; 2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 101.2 on 82478 degrees of freedom
Multiple R-squared:  0.9003,    Adjusted R-squared:  0.9003 
F-statistic: 7.45e+05 on 1 and 82478 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>summary(lm(TotCounts_nonOverlapEX$X19238_T~TotCounts_nonOverlapEX$X19225_T))</code></pre>
<pre><code>
Call:
lm(formula = TotCounts_nonOverlapEX$X19238_T ~ TotCounts_nonOverlapEX$X19225_T)

Residuals:
     Min       1Q   Median       3Q      Max 
-13986.1     -7.7     -6.1     -2.1  12207.9 

Coefficients:
                                Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)                     6.146796   0.454280   13.53   &lt;2e-16 ***
TotCounts_nonOverlapEX$X19225_T 0.837322   0.001034  809.62   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 130.3 on 82478 degrees of freedom
Multiple R-squared:  0.8882,    Adjusted R-squared:  0.8882 
F-statistic: 6.555e+05 on 1 and 82478 DF,  p-value: &lt; 2.2e-16</code></pre>
<pre class="r"><code>plot(log10(TotCounts_nonOverlapEX$X19238_T)~log10(TotCounts_nonOverlapEX$X19225_T))</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-76-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-76-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/69b51620dfe0a0ce05d7085c92f5949fd26fae4a/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-76-1.png" target="_blank">69b5162</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-12-13
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>Do this withough filter:</p>
<pre class="r"><code>total_Cov=read.table(&quot;../data/PeakCounts/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant.Total_fixed.fc&quot;, header=T, stringsAsFactors = F)[7:45]

plot(log10(total_Cov$X19238_T)~log10(total_Cov$X19225_T))</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-77-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-77-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-77-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>summary(lm(total_Cov$X19238_T~total_Cov$X19225_T))</code></pre>
<pre><code>
Call:
lm(formula = total_Cov$X19238_T ~ total_Cov$X19225_T)

Residuals:
     Min       1Q   Median       3Q      Max 
-28832.1     -8.0     -6.3     -2.6  25408.6 

Coefficients:
                    Estimate Std. Error t value Pr(&gt;|t|)    
(Intercept)        6.3159764  0.2977179   21.21   &lt;2e-16 ***
total_Cov$X19225_T 0.8266119  0.0004396 1880.39   &lt;2e-16 ***
---
Signif. codes:  0 &#39;***&#39; 0.001 &#39;**&#39; 0.01 &#39;*&#39; 0.05 &#39;.&#39; 0.1 &#39; &#39; 1

Residual standard error: 172.9 on 338139 degrees of freedom
Multiple R-squared:  0.9127,    Adjusted R-squared:  0.9127 
F-statistic: 3.536e+06 on 1 and 338139 DF,  p-value: &lt; 2.2e-16</code></pre>
<p>Do this with ggplot</p>
<pre class="r"><code>ggplot(total_Cov, aes(x = log10(X19238_T) , y=log10(X19225_T))) + geom_point() + labs(title=&quot;Relationship between 2 Total 3&#39; Seq Peak Counts&quot;, x=&quot;log10(19238)&quot;,y=&quot;log10(19225)&quot;)</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-78-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-78-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-78-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>I want to do this for all and make a heat plot with the correlations:</p>
<pre class="r"><code>cor_function=function(data){
  corr_matrix= matrix(0,dim(data)[[2]],dim(data)[[2]])
  for (i in seq(1,dim(data)[[2]])){
    for (j in seq(1,dim(data)[[2]])){
      x=cor.test(data[,i],  data[,j], method=&#39;pearson&#39;)
      cor_ij=as.numeric(x$estimate)
      corr_matrix[i,j]=cor_ij
    }
  }
  return(corr_matrix)
}</code></pre>
<pre class="r"><code>total_corr=cor_function(as.matrix(total_Cov))</code></pre>
<pre class="r"><code>total_corr_melt=melt(total_corr)
ggheatmap=ggplot(data = total_corr_melt, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(title=&quot;Library Peak Count Correlation Heatplot: Total&quot;, x=&quot;&quot;, y=&quot;&quot;)+ 
  scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = &quot;Correlation&quot;,
                      low = &quot;#FFFFFF&quot;,
                      high = &quot;darkviolet&quot;)

ggheatmap</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-81-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-81-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-81-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>mean correlation .89</p>
<p>DO this for nuclear:</p>
<pre class="r"><code>nuclear_Cov=read.table(&quot;../data/PeakCounts/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant.Nuclear_fixed.fc&quot;, header=T, stringsAsFactors = F)[7:45]  </code></pre>
<pre class="r"><code>nuclear_corr=cor_function(as.matrix(nuclear_Cov))

nuclear_corr_melt=melt(nuclear_corr)
ggheatmap_nuc=ggplot(data = nuclear_corr_melt, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(title=&quot;Library Peak Count Correlation Heatplot: Nuclear&quot;, x=&quot;&quot;, y=&quot;&quot;)+ 
  scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = &quot;Correlation&quot;,
                      low = &quot;#FFFFFF&quot;,
                      high = &quot;deepskyblue3&quot;)

ggheatmap_nuc</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-83-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-83-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/b5a37f3ed662b0f28780079601d389535b2c529c/docs/figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-83-1.png" target="_blank">b5a37f3</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-11
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>Mean correlation is .74</p>
<p>Make a full one with both:</p>
<pre class="r"><code>bothFrac=cbind(nuclear_Cov,total_Cov)
bothFrac_corr=cor_function(bothFrac)


bothFrac_corr_melt=melt(bothFrac_corr)
ggheatmap_both=ggplot(data = bothFrac_corr_melt, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(title=&quot;Library Peak Count Correlation Heatplot&quot;, x=&quot;&quot;, y=&quot;&quot;)+ 
  scale_fill_continuous(limits=c(0, 1), breaks=seq(0,1,by=0.15),name = &quot;Correlation&quot;,
                      low = &quot;#FFFFFF&quot;,
                      high = &quot;coral1&quot;)

ggheatmap_both</code></pre>
<p><img src="figure/InvestigatePeak2GeneAssignment.Rmd/unnamed-chunk-84-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>Make a boxplot with the correlations:</p>
<pre class="r"><code>Fraction=c(&quot;Total&quot;, &quot;Nuclear&quot;, &quot;Both&quot;)
CorrelationMean=c(mean(total_corr),mean(nuclear_corr), mean(bothFrac_corr))
CorrelationSD=c(sd(total_corr), sd(nuclear_corr), sd(bothFrac_corr))

CorrelationDF=as.data.frame(cbind(Fraction,CorrelationMean,CorrelationSD))
CorrelationDF$CorrelationMean=as.numeric(as.character(CorrelationDF$CorrelationMean))
CorrelationDF$CorrelationSD=as.numeric(as.character(CorrelationDF$CorrelationSD))

corrBarPlot=ggplot(CorrelationDF, aes(x=Fraction, y=CorrelationMean, fill=Fraction)) + geom_bar(stat=&quot;Identity&quot;, position=&quot;dodge&quot;, width=.5) + geom_errorbar(aes(ymin=CorrelationMean-CorrelationSD, ymax=CorrelationMean+CorrelationSD), width=.2) + labs(y=&quot;Mean Correlation between Libraries&quot;, title=&quot;Correlation between Peak Counts across libraries&quot;) + scale_fill_manual(values=c(&quot;grey&quot;,&quot;deepskyblue3&quot;,&quot;darkviolet&quot;)) +theme(axis.text.y = element_text(size=12),axis.title.y=element_text(size=15,face=&quot;bold&quot;), axis.title.x=element_text(size=12,face=&quot;bold&quot;))


ggsave(corrBarPlot, file=&quot;../output/plots/QC_plots/BarPlot4PeakCountCorrelation.png&quot;)</code></pre>
<pre><code>Saving 7 x 5 in image</code></pre>
</div>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS  10.14.1

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2.2  tximport_1.8.0  reshape2_1.4.3  cowplot_0.9.3  
 [5] workflowr_1.1.1 forcats_0.3.0   stringr_1.3.1   dplyr_0.7.6    
 [9] purrr_0.2.5     readr_1.1.1     tidyr_0.8.1     tibble_1.4.2   
[13] ggplot2_3.0.0   tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.4  haven_1.1.2       lattice_0.20-35  
 [4] colorspace_1.3-2  htmltools_0.3.6   yaml_2.2.0       
 [7] rlang_0.2.2       R.oo_1.22.0       pillar_1.3.0     
[10] glue_1.3.0        withr_2.1.2       R.utils_2.7.0    
[13] modelr_0.1.2      readxl_1.1.0      bindr_0.1.1      
[16] plyr_1.8.4        munsell_0.5.0     gtable_0.2.0     
[19] cellranger_1.1.0  rvest_0.3.2       R.methodsS3_1.7.1
[22] evaluate_0.11     labeling_0.3      knitr_1.20       
[25] broom_0.5.0       Rcpp_0.12.19      scales_1.0.0     
[28] backports_1.1.2   jsonlite_1.5      hms_0.4.2        
[31] digest_0.6.17     stringi_1.2.4     grid_3.5.1       
[34] rprojroot_1.3-2   cli_1.0.1         tools_3.5.1      
[37] magrittr_1.5      lazyeval_0.2.1    crayon_1.3.4     
[40] whisker_0.3-2     pkgconfig_2.0.2   MASS_7.3-50      
[43] xml2_1.2.0        lubridate_1.7.4   assertthat_0.2.0 
[46] rmarkdown_1.10    httr_1.3.1        rstudioapi_0.8   
[49] R6_2.3.0          nlme_3.1-137      git2r_0.23.0     
[52] compiler_3.5.1   </code></pre>
</div>

<hr>
<p>
    
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->
<!-- Adjust MathJax settings so that all math formulae are shown using
TeX fonts only; see
http://docs.mathjax.org/en/latest/configuration.html.  This will make
the presentation more consistent at the cost of the webpage sometimes
taking slightly longer to load. Note that this only works because the
footer is added to webpages before the MathJax javascript. -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>

<hr>
<p>
  This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a>
  analysis was created with
  <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1
</p>
<hr>


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>