<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />


<title>Data Processing Figures</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/journal.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" />
<script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script>
<script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script>

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Three Prime Sequencing in Human LCLs</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
<li>
  <a href="license.html">License</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/brimittleman/threeprimeseq">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<!-- Add a small amount of space between sections. -->
<style type="text/css">
div.section {
  padding-top: 12px;
}
</style>

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Data Processing Figures</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>8/28/2018</em></h4>

</div>


<p><strong>Last updated:</strong> 2018-09-04</p>
<strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small>
<ul>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p>
<p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p>
<p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(12345)</code> </summary></p>
<p>The command <code>set.seed(12345)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p>
<p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/brimittleman/threeprimeseq/tree/deaa5b0c0f3a0e46c7c187c7fe4ee62e042bca30" target="_blank">deaa5b0</a> </summary></p>
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
<pre><code>
Ignored files:
    Ignored:    .DS_Store
    Ignored:    .Rhistory
    Ignored:    .Rproj.user/
    Ignored:    analysis/figure/
    Ignored:    output/.DS_Store

Untracked files:
    Untracked:  analysis/ncbiRefSeq_sm.sort.mRNA.bed
    Untracked:  analysis/snake.config.notes.Rmd
    Untracked:  data/18486.genecov.txt
    Untracked:  data/APApeaksYL.total.inbrain.bed
    Untracked:  data/RNAkalisto/
    Untracked:  data/Totalpeaks_filtered_clean.bed
    Untracked:  data/YL-SP-18486-T-combined-genecov.txt
    Untracked:  data/YL-SP-18486-T_S9_R1_001-genecov.txt
    Untracked:  data/bedgraph_peaks/
    Untracked:  data/bin200.5.T.nuccov.bed
    Untracked:  data/bin200.Anuccov.bed
    Untracked:  data/bin200.nuccov.bed
    Untracked:  data/clean_peaks/
    Untracked:  data/comb_map_stats.csv
    Untracked:  data/comb_map_stats.xlsx
    Untracked:  data/combined_reads_mapped_three_prime_seq.csv
    Untracked:  data/gencov.test.csv
    Untracked:  data/gencov.test.txt
    Untracked:  data/gencov_zero.test.csv
    Untracked:  data/gencov_zero.test.txt
    Untracked:  data/gene_cov/
    Untracked:  data/joined
    Untracked:  data/leafcutter/
    Untracked:  data/merged_combined_YL-SP-threeprimeseq.bg
    Untracked:  data/nom_QTL/
    Untracked:  data/nom_QTL_opp/
    Untracked:  data/nuc6up/
    Untracked:  data/peakPerRefSeqGene/
    Untracked:  data/perm_QTL/
    Untracked:  data/perm_QTL_opp/
    Untracked:  data/reads_mapped_three_prime_seq.csv
    Untracked:  data/smash.cov.results.bed
    Untracked:  data/smash.cov.results.csv
    Untracked:  data/smash.cov.results.txt
    Untracked:  data/smash_testregion/
    Untracked:  data/ssFC200.cov.bed
    Untracked:  data/temp.file1
    Untracked:  data/temp.file2
    Untracked:  data/temp.gencov.test.txt
    Untracked:  data/temp.gencov_zero.test.txt
    Untracked:  output/picard/
    Untracked:  output/plots/
    Untracked:  output/qual.fig2.pdf

Unstaged changes:
    Modified:   analysis/28ind.peak.explore.Rmd
    Modified:   analysis/cleanupdtseq.internalpriming.Rmd
    Modified:   analysis/dif.iso.usage.leafcutter.Rmd
    Modified:   analysis/diff_iso_pipeline.Rmd
    Modified:   analysis/explore.filters.Rmd
    Modified:   analysis/peak.cov.pipeline.Rmd
    Modified:   analysis/peakOverlap_oppstrand.Rmd
    Modified:   analysis/pheno.leaf.comb.Rmd
    Modified:   analysis/test.max2.Rmd
    Modified:   code/Snakefile

</code></pre>
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details>
</li>
</ul>
<details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary>
<ul>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
File
</th>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
<th style="text-align:left;">
Message
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/deaa5b0c0f3a0e46c7c187c7fe4ee62e042bca30/analysis/dataprocfigures.Rmd" target="_blank">deaa5b0</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-09-04
</td>
<td style="text-align:left;">
compare TPM for genes with no peaks
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/2e39f7a21ec47a1d8efaf410a87616f2bda9283f/docs/dataprocfigures.html" target="_blank">2e39f7a</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-30
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/a2a7cd9bb204a1c4f22f57125a72d05f36a44697/analysis/dataprocfigures.Rmd" target="_blank">a2a7cd9</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-30
</td>
<td style="text-align:left;">
add kalisto code
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/cbec2f6eb084a5f22386c61b8e6d872d238c3c05/docs/dataprocfigures.html" target="_blank">cbec2f6</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-29
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/6b818cbdefc857eb57e9a522483fa63b60b7fb60/analysis/dataprocfigures.Rmd" target="_blank">6b818cb</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-29
</td>
<td style="text-align:left;">
try gencode anno
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/c6dc97bf8c0353ab9179b44a1f9ed8210f4c0adb/docs/dataprocfigures.html" target="_blank">c6dc97b</a>
</td>
<td style="text-align:left;">
brimittleman
</td>
<td style="text-align:left;">
2018-08-28
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/fa818a13d8591ac31fd9f6c63a86af3af565def7/analysis/dataprocfigures.Rmd" target="_blank">fa818a1</a>
</td>
<td style="text-align:left;">
brimittleman
</td>
<td style="text-align:left;">
2018-08-28
</td>
<td style="text-align:left;">
first processing figure
</td>
</tr>
</tbody>
</table>
</ul>
<p></details></p>
<hr />
<p>I will use this analysis to work on vizualising some of the processing steps of this analysis.</p>
<div id="peaks-per-gene" class="section level2">
<h2>Peaks per gene</h2>
<p>I want to create a figure similar to the one I created in <a href="https://brimittleman.github.io/comparative_threeprime/characterize.ortho.peaks.html" class="uri">https://brimittleman.github.io/comparative_threeprime/characterize.ortho.peaks.html</a>. I will use the count distinct function from bedtools map. For this I am using the RefSeq mRNA annotations.</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=refseq_countdistinct
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=refseq_countdistinct.out
#SBATCH --error=refseq_countdistinct.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END



bedtools map -c 4 -s -o count_distinct -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom.named.fixed.bed  &gt; /project2/gilad/briana/threeprimeseq/data/peakPerRefseqGene/filtered_APApeaks_perRefseqGene.txt 


#try opp strand 
bedtools map -c 4 -S -o count_distinct -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed -b /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom.named.fixed.bed  &gt; /project2/gilad/briana/threeprimeseq/data/peakPerRefseqGene/filtered_APApeaks_perRefseqGene_oppStrand.txt 
</code></pre>
<pre class="r"><code>library(tidyverse)</code></pre>
<pre><code>── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──</code></pre>
<pre><code>✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0</code></pre>
<pre><code>── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()</code></pre>
<pre class="r"><code>library(workflowr)</code></pre>
<pre><code>This is workflowr version 1.1.1
Run ?workflowr for help getting started</code></pre>
<pre class="r"><code>library(reshape2)</code></pre>
<pre><code>
Attaching package: &#39;reshape2&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:tidyr&#39;:

    smiths</code></pre>
<pre class="r"><code>library(cowplot)</code></pre>
<pre><code>
Attaching package: &#39;cowplot&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:ggplot2&#39;:

    ggsave</code></pre>
<pre class="r"><code>names=c(&quot;Chr&quot;, &quot;Start&quot;, &quot;End&quot;, &quot;Name&quot;, &quot;Score&quot;, &quot;Strand&quot;, &quot;numPeaks&quot;)
peakpergene=read.table(&quot;../data/peakPerRefSeqGene/filtered_APApeaks_perRefseqGene.txt&quot;, stringsAsFactors = F, header = F, col.names = names) %&gt;% mutate(onePeak=ifelse(numPeaks==1, 1, 0 )) %&gt;%  mutate(multPeaks=ifelse(numPeaks &gt; 1, 1, 0 ))</code></pre>
<pre class="r"><code>genes1peak=sum(peakpergene$onePeak)/nrow(peakpergene) 
genesMultpeak=sum(peakpergene$multPeaks)/nrow(peakpergene)
genes0peak= 1- genes1peak - genesMultpeak

perPeak= c(round(genes0peak,digits = 3), round(genes1peak,digits = 3),round(genesMultpeak, digits = 3))
Category=c(&quot;Zero&quot;, &quot;One&quot;, &quot;Multiple&quot;)
perPeakdf=as.data.frame(cbind(Category,as.numeric(perPeak)))</code></pre>
<p>Plot these proportions:</p>
<pre class="r"><code>lab1=paste(&quot;Genes =&quot;, genes0peak*nrow(peakpergene), sep=&quot; &quot;)
lab2=paste(&quot;Genes =&quot;, sum(peakpergene$onePeak), sep=&quot; &quot;)
lab3=paste(&quot;Genes =&quot;, sum(peakpergene$multPeaks), sep=&quot; &quot;)

genepeakplot=ggplot(perPeakdf, aes(x=&quot;&quot;, y=perPeak, fill=Category)) + geom_bar(stat=&quot;identity&quot;)+ labs(title=&quot;Characterize genes by number of PAS&quot;, y=&quot;Proportion of Protein Coding gene&quot;, x=&quot;&quot;)+ scale_fill_brewer(palette=&quot;Paired&quot;) + coord_cartesian(ylim=c(0,1)) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .35, label=lab1) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .78, label=lab2) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .92, label=lab3)
genepeakplot</code></pre>
<p><img src="figure/dataprocfigures.Rmd/unnamed-chunk-5-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-5-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbec2f6eb084a5f22386c61b8e6d872d238c3c05/docs/figure/dataprocfigures.Rmd/unnamed-chunk-5-1.png" target="_blank">cbec2f6</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-29
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/c6dc97bf8c0353ab9179b44a1f9ed8210f4c0adb/docs/figure/dataprocfigures.Rmd/unnamed-chunk-5-1.png" target="_blank">c6dc97b</a>
</td>
<td style="text-align:left;">
brimittleman
</td>
<td style="text-align:left;">
2018-08-28
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>This includes for than 1 isoform for different genes. I am going to go back to the original refseq file and resegment it. Column 13 is the gene name. Column 2 needs to start with NM because that is mRNA.</p>
<pre class="bash"><code>grep  &quot;NM&quot; ncbiRefSeq.txt | awk &#39;{print $3 &quot;\t&quot; $5 &quot;\t&quot; $6 &quot;\t&quot; $2 &quot;\t&quot; $13 &quot;\t&quot; $4}&#39; &gt; ncbiRefSeq.mRNA.named.bed</code></pre>
<p>I can write a script that writes only the longest isoform for each gene.</p>
<pre class="bash"><code>
outfile=open(&quot;refseq.ProteinCoding.bed&quot;, &quot;w&quot;)




infile=open(&quot;ncbiRefSeq.mRNA.named.bed&quot;, &quot;r&quot;)

lines=infile.readlines()
lot_lines=len(lines)
for n,ln in enumerate(lines):
    chrom, start, end, mRNA, gene, strand = ln.split()
    #if first line
    if n == 0:
        #first line condition
        SE_list=[]
        cur_gene=gene
        SE_list.append(int(start))   
        SE_list.append(int(end)) 
    elif n == lot_lines-1:
        #last line condition
        if gene == cur_gene:
            SE_list.append(int(start))   
            SE_list.append(int(end))
            SE_list.sort()
            outfile.write(&quot;%s\t%d\t%d\t%s\t.\t%s\n&quot;%(chrom, SE_list[0], SE_list[-1], gene, strand))
        else: 
           outfile.write(&quot;%s\t%d\t%d\t%s\t.\t%s\n&quot;%(chrom, int(start), int(end), gene, strand))
    elif gene == cur_gene:
        SE_list.append(int(start))   
        SE_list.append(int(end))
    elif gene != cur_gene:
        #write out the last line but with the start end from the SE list
        prevline=lines[n-1]
        chrom2, start2, end2, mRNA2, gene2, strand2 = prevline.split()
        outfile.write(&quot;%s\t%d\t%d\t%s\t.\t%s\n&quot;%(chrom2, SE_list[0], SE_list[-1], gene2, strand2))
        cur_gene=gene
        SE_list=[int(start), int(end)]


outfile.close()
</code></pre>
<p>I can check this by maknig sure there is 1 line for all of the unique names in the in file.</p>
<pre class="bash"><code>awk &#39;{print $5}&#39; ncbiRefSeq.mRNA.named.bed | sort | uniq | wc -l
#19243
wc -l refseq.ProteinCoding.bed 
#20298
sed &#39;s/^chr//&#39; refseq.ProteinCoding.bed &gt; refseq.ProteinCoding.noCHR.bed</code></pre>
<p>There is still a problem with the script. The problem is when the same gene name is on extra haplotypes. I could remove all of the lines in the file that have _ in the first column. These are on contigs or specfic haplotypes. They will not map to our peaks anyway. I also need to remove the chr.</p>
<p>This still seems lower than previos APA estimates. I had used gencode estimates before. I am gonig to run this analysis again with those gene.</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=gencode_countdistinct
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=gencode_countdistinct.out
#SBATCH --error=gencode_countdistinct.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END



bedtools map -c 4 -s -o count_distinct -a /project2/gilad/briana/genome_anotation_data/gencode.v19.annotation.proteincodinggene.sort.bed   -b /project2/gilad/briana/threeprimeseq/data/mergedPeaks_comb/filtered_APApeaks_merged_allchrom.named.fixed.bed  &gt;
</code></pre>
<pre class="r"><code>Gpeakpergene=read.table(&quot;../data/peakPerRefSeqGene/filtered_APApeaks_perGencodeGene.txt&quot;, stringsAsFactors = F, header = F, col.names = names) %&gt;% mutate(onePeak=ifelse(numPeaks==1, 1, 0 )) %&gt;%  mutate(multPeaks=ifelse(numPeaks &gt; 1, 1, 0 ))</code></pre>
<pre class="r"><code>Ggenes1peak=sum(Gpeakpergene$onePeak)/nrow(Gpeakpergene) 
GgenesMultpeak=sum(Gpeakpergene$multPeaks)/nrow(Gpeakpergene)
Ggenes0peak= 1- Ggenes1peak - GgenesMultpeak

GperPeak= c(round(Ggenes0peak,digits = 3), round(Ggenes1peak,digits = 3),round(GgenesMultpeak, digits = 3))

GperPeakdf=as.data.frame(cbind(Category,as.numeric(GperPeak)))</code></pre>
<p>Plot these proportions:</p>
<pre class="r"><code>Glab1=paste(&quot;Genes =&quot;, Ggenes0peak*nrow(Gpeakpergene), sep=&quot; &quot;)
Glab2=paste(&quot;Genes =&quot;, sum(Gpeakpergene$onePeak), sep=&quot; &quot;)
Glab3=paste(&quot;Genes =&quot;, sum(Gpeakpergene$multPeaks), sep=&quot; &quot;)

Ggenepeakplot=ggplot(GperPeakdf, aes(x=&quot;&quot;, y=perPeak, fill=Category)) + geom_bar(stat=&quot;identity&quot;)+ labs(title=&quot;Characterize Gencode genes by number of PAS&quot;, y=&quot;Proportion of Protein Coding gene&quot;, x=&quot;&quot;)+ scale_fill_brewer(palette=&quot;Paired&quot;) + coord_cartesian(ylim=c(0,1)) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .35, label=Glab1) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .78, label=Glab2) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .92, label=Glab3)
Ggenepeakplot</code></pre>
<p><img src="figure/dataprocfigures.Rmd/unnamed-chunk-12-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-12-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/cbec2f6eb084a5f22386c61b8e6d872d238c3c05/docs/figure/dataprocfigures.Rmd/unnamed-chunk-12-1.png" target="_blank">cbec2f6</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-29
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>These results are still lower than expected. This is because all of my previous analysis mapped the genes to the peaks as which were closest in the upstream direction. Here I am saying the peak must overlap the gene.</p>
<p>I should again look at some of the genes with the top counts in RNA seq and the 0 peaks.</p>
<p>I am going to run feaureCounts on 18486 guevardis with the refseq annotation with the named genes. I need to make this a SAF file.</p>
<pre class="bash"><code>from misc_helper import *

fout = file(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.bed&quot;):
    chrom, start, end, gene, score, strand = ln.split()
    start_i=int(start)
    end_i=int(end)
    fout.write(&quot;%s\t%s\t%d\t%d\t%s\n&quot;%(gene, chrom, start_i, end_i, strand))
fout.close()</code></pre>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=fc_RNAseq_refseq
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=fc_RNAseq_refseq.out
#SBATCH --error=fc_RNAseq_refseq.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END


module load Anaconda3
source activate three-prime-env


# outdir: /project2/gilad/briana/comparitive_threeprime/data/PeakwGene_quant

featureCounts -a /project2/gilad/briana/genome_anotation_data/refseq.ProteinCoding.noCHR.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/peakPerRefseqGene/refseq18486exp.quant /project2/yangili1/LCL/RNAseqGeuvadisBams/RNAseqGeuvadis_STAR_18486.final.bam -s 1</code></pre>
<p>Now I can upload the results and compare them to the peak counts in these genes.</p>
<pre class="r"><code>namesRNA=c(&quot;Name&quot;, &quot;Chr&quot;, &quot;Start&quot;, &quot;End&quot;, &quot;Strand&quot;, &quot;Length&quot;, &quot;RNAseq&quot;)
RNAseqrefseq=read.table(&quot;../data/peakPerRefSeqGene/refseq18486exp.quant&quot;, header=T, stringsAsFactors = F, col.names = namesRNA)
RNAseqrefseq$Start=as.integer(RNAseqrefseq$Start)</code></pre>
<pre><code>Warning: NAs introduced by coercion</code></pre>
<pre class="r"><code>RNAseqrefseq$End=as.integer(RNAseqrefseq$End)</code></pre>
<pre><code>Warning: NAs introduced by coercion</code></pre>
<p>Join the peakpergene dataframe with this dataframe.</p>
<pre class="r"><code>refPeakandRNA=peakpergene %&gt;% inner_join(RNAseqrefseq, by=c(&quot;Name&quot;, &quot;Chr&quot;, &quot;Start&quot;, &quot;End&quot;, &quot;Strand&quot;)) 

refPeakandRNA_noPeak=refPeakandRNA %&gt;% filter(RNAseq!=0) %&gt;% filter(numPeaks==0) %&gt;% arrange(desc(RNAseq)) %&gt;% select(Name, Start, End, Chr, RNAseq, numPeaks)</code></pre>
<p>This doesnt make much sense. Seems like the peaks are on the opposite strand for the top genes. I am gonig to force opposite strandedness and see what happens.</p>
<pre class="r"><code>Opeakpergene=read.table(&quot;../data/peakPerRefSeqGene/filtered_APApeaks_perRefseqGene_oppStrand.txt&quot;, stringsAsFactors = F, header = F, col.names = names) %&gt;% mutate(onePeak=ifelse(numPeaks==1, 1, 0 )) %&gt;%  mutate(multPeaks=ifelse(numPeaks &gt; 1, 1, 0 ))</code></pre>
<pre class="r"><code>Ogenes1peak=sum(Opeakpergene$onePeak)/nrow(Opeakpergene) 
OgenesMultpeak=sum(Opeakpergene$multPeaks)/nrow(Opeakpergene)
Ogenes0peak= 1- Ogenes1peak - OgenesMultpeak


OperPeak= c(round(Ogenes0peak,digits = 3), round(Ogenes1peak,digits = 3),round(OgenesMultpeak, digits = 3))

OperPeakdf=as.data.frame(cbind(Category,OperPeak))

OperPeakdf$OperPeak=as.numeric(as.character(OperPeakdf$OperPeak))</code></pre>
<p>Plot these proportions:</p>
<pre class="r"><code>Olab1=paste(&quot;Genes =&quot;, Ogenes0peak*nrow(Opeakpergene), sep=&quot; &quot;)
Olab2=paste(&quot;Genes =&quot;, sum(Opeakpergene$onePeak), sep=&quot; &quot;)
Olab3=paste(&quot;Genes =&quot;, sum(Opeakpergene$multPeaks), sep=&quot; &quot;)

Ogenepeakplot=ggplot(OperPeakdf, aes(x=&quot;&quot;, y=OperPeak, by=Category, fill=Category)) + geom_bar(stat=&quot;identity&quot;)+ labs(title=&quot;Characterize Refseq genes by number of PAS- Oppstrand&quot;, y=&quot;Proportion of Protein Coding gene&quot;, x=&quot;&quot;)+ scale_fill_brewer(palette=&quot;Paired&quot;) + coord_cartesian(ylim=c(0,1)) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .2, label=Olab1) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .4, label=Olab2) + annotate(&quot;text&quot;, x=&quot;&quot;, y= .9, label=Olab3)
Ogenepeakplot</code></pre>
<p><img src="figure/dataprocfigures.Rmd/unnamed-chunk-19-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-19-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/2e39f7a21ec47a1d8efaf410a87616f2bda9283f/docs/figure/dataprocfigures.Rmd/unnamed-chunk-19-1.png" target="_blank">2e39f7a</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-30
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>This makes more sense now.</p>
<pre class="r"><code>refPeakandRNA_withO=Opeakpergene %&gt;% inner_join(RNAseqrefseq, by=c(&quot;Name&quot;, &quot;Chr&quot;, &quot;Start&quot;, &quot;End&quot;, &quot;Strand&quot;)) 
refPeakandRNA_noPeakw_withO=refPeakandRNA_withO %&gt;% filter(RNAseq!=0) %&gt;% filter(numPeaks==0) %&gt;% arrange(desc(RNAseq)) %&gt;% select(Name, Start, End, Chr, RNAseq, numPeaks)</code></pre>
<pre class="r"><code>plot(sort(log10(refPeakandRNA_withO$RNAseq), decreasing = T), main=&quot;Distribution of RNA expression counts 18486&quot;, ylab=&quot;log10 Gene count&quot;, xlab=&quot;Refseq Gene&quot;)
points(sort(log10(refPeakandRNA_noPeakw_withO$RNAseq), decreasing = T), col=&quot;Red&quot;)
legend(&quot;topright&quot;, legend=c(&quot;All Gene&quot;, &quot;Gene without Peak&quot;), col=c(&quot;black&quot;, &quot;red&quot;),pch=19)</code></pre>
<p><img src="figure/dataprocfigures.Rmd/unnamed-chunk-21-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-21-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/2e39f7a21ec47a1d8efaf410a87616f2bda9283f/docs/figure/dataprocfigures.Rmd/unnamed-chunk-21-1.png" target="_blank">2e39f7a</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2018-08-30
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>Run Kalisto on the this RNA seq line and look at this plot with the kalisto output expression TPM. I added Kallisto to the three-prime-env.</p>
<p>Kallisto step:</p>
<ul>
<li>make index: kallisto_index18486.sh</li>
</ul>
<p>This needs to be based on a transcriptome. I will use the protein coding transcripts from <a href="https://www.gencodegenes.org/releases/28lift37.html" class="uri">https://www.gencodegenes.org/releases/28lift37.html</a>.</p>
<pre class="bash"><code>
#!/bin/bash

#SBATCH --job-name=kallisto_index18486
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=kallisto_index18486.out
#SBATCH --error=kallisto_index18486.err
#SBATCH --partition=broadwl
#SBATCH --mem=50G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env 


kallisto index  --make-unique -i /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/RNA18486_index /project2/gilad/briana/genome_anotation_data/gencode.v28lift37.pc_transcripts.fa</code></pre>
<ul>
<li>quantify: kallisto_quant18467.sh</li>
</ul>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=kallisto_quant18486
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=kallisto_quant18486.out
#SBATCH --error=kallisto_quant18486.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env 

kallisto quant -i /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/RNA18486_index -o /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/ /project2/yangili1/LCL/RNAseq/RNA.18486_1.fastq.gz /project2/yangili1/LCL/RNAseq/RNA.18486_2.fastq.gz</code></pre>
<p>Convert to readable with TPM:</p>
<pre class="bash"><code> kallisto h5dump abundance.h5 -o /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto</code></pre>
<p>This is the gencode annotation. I want to do this with the refseq transcriptome. <a href="https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/" class="uri">https://www.ncbi.nlm.nih.gov/projects/genome/guide/human/</a></p>
<p>kallisto_refseqindex18486.sh</p>
<pre class="bash"><code>
#!/bin/bash

#SBATCH --job-name=kallisto_refseqindex18486
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=kallisto_refseqindex18486.out
#SBATCH --error=kallisto_refseqindex18486.err
#SBATCH --partition=broadwl
#SBATCH --mem=50G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env 


kallisto index  --make-unique -i /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/RNA18486_refseq_index /project2/gilad/briana/genome_anotation_data/GRCh37_latest_rna.fna
</code></pre>
<ul>
<li>quantify: kallisto_refseq_quant18467.sh</li>
</ul>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=kallisto_refseq_quant18486
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=kallisto_refseq_quant18486.out
#SBATCH --error=kallisto_refseq_quant18486.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env 

kallisto quant -i /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/RNA18486_refseq_index -o /project2/gilad/briana/threeprimeseq/data/RNAseqKallisto/refseq/project2/yangili1/LCL/RNAseq/RNA.18486_1.fastq.gz /project2/yangili1/LCL/RNAseq/RNA.18486_2.fastq.gz</code></pre>
<p>I will use tximport to convert from the transcripts that are quantified in kalisto.</p>
<pre class="r"><code>#source(&quot;https://bioconductor.org/biocLite.R&quot;)
#biocLite(&quot;tximport&quot;)
#biocLite(&quot;TxDb.Hsapiens.UCSC.hg19.knownGene&quot;)
library(tximport)
library(&quot;TxDb.Hsapiens.UCSC.hg19.knownGene&quot;)</code></pre>
<pre><code>Loading required package: GenomicFeatures</code></pre>
<pre><code>Loading required package: BiocGenerics</code></pre>
<pre><code>Loading required package: parallel</code></pre>
<pre><code>
Attaching package: &#39;BiocGenerics&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:parallel&#39;:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    combine, intersect, setdiff, union</code></pre>
<pre><code>The following objects are masked from &#39;package:stats&#39;:

    IQR, mad, sd, var, xtabs</code></pre>
<pre><code>The following objects are masked from &#39;package:base&#39;:

    anyDuplicated, append, as.data.frame, basename, cbind,
    colMeans, colnames, colSums, dirname, do.call, duplicated,
    eval, evalq, Filter, Find, get, grep, grepl, intersect,
    is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
    paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
    Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which, which.max,
    which.min</code></pre>
<pre><code>Loading required package: S4Vectors</code></pre>
<pre><code>Loading required package: stats4</code></pre>
<pre><code>
Attaching package: &#39;S4Vectors&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    first, rename</code></pre>
<pre><code>The following object is masked from &#39;package:tidyr&#39;:

    expand</code></pre>
<pre><code>The following object is masked from &#39;package:base&#39;:

    expand.grid</code></pre>
<pre><code>Loading required package: IRanges</code></pre>
<pre><code>
Attaching package: &#39;IRanges&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    collapse, desc, slice</code></pre>
<pre><code>The following object is masked from &#39;package:purrr&#39;:

    reduce</code></pre>
<pre><code>Loading required package: GenomeInfoDb</code></pre>
<pre><code>Loading required package: GenomicRanges</code></pre>
<pre><code>Loading required package: AnnotationDbi</code></pre>
<pre><code>Loading required package: Biobase</code></pre>
<pre><code>Welcome to Bioconductor

    Vignettes contain introductory material; view with
    &#39;browseVignettes()&#39;. To cite Bioconductor, see
    &#39;citation(&quot;Biobase&quot;)&#39;, and for packages &#39;citation(&quot;pkgname&quot;)&#39;.</code></pre>
<pre><code>
Attaching package: &#39;AnnotationDbi&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:dplyr&#39;:

    select</code></pre>
<p>Import Kalisto resutls:</p>
<pre class="r"><code>#I need to make a gene to transcript ID with the transcript id and gene id columns
tx2gene=read.table(&quot;../data/RNAkalisto/ncbiRefSeq.txn2gene.txt&quot; ,header= F, sep=&quot;\t&quot;, stringsAsFactors = F)

txi.kallisto.tsv &lt;- tximport(&quot;../data/RNAkalisto/abundance.tsv&quot;, type = &quot;kallisto&quot;, tx2gene = tx2gene)</code></pre>
<pre><code>Note: importing `abundance.h5` is typically faster than `abundance.tsv`</code></pre>
<pre><code>reading in files with read_tsv</code></pre>
<pre><code>1 
removing duplicated transcript rows from tx2gene
transcripts missing from tx2gene: 99
summarizing abundance
summarizing counts
summarizing length</code></pre>
<pre class="r"><code>txi.kallisto.tsv$abundance= as.data.frame(txi.kallisto.tsv$abundance) %&gt;% rownames_to_column(var=&quot;Name&quot;)
colnames(txi.kallisto.tsv$abundance)= c(&quot;Name&quot;, &quot;TPM&quot;)</code></pre>
<p>Now I want to join this with the RNA seq data so I am looking at the expression tpm rather than counts.</p>
<pre class="r"><code>refPeakandRNA_withO_TPM=refPeakandRNA_withO %&gt;% inner_join(txi.kallisto.tsv$abundance, by=&quot;Name&quot;) %&gt;% filter(TPM&gt;0)


refPeakandRNA_noPeakw_withO_TPM=refPeakandRNA_noPeakw_withO %&gt;% inner_join(txi.kallisto.tsv$abundance, by=&quot;Name&quot;) %&gt;% filter(TPM &gt;0)</code></pre>
<p>I can now replot the genes without peaks by TPM for the RNA seq rather than count.</p>
<pre class="r"><code>plot(sort(log10(refPeakandRNA_withO_TPM$TPM), decreasing = T), main=&quot;Distribution of RNA expression 18486&quot;, ylab=&quot;log10 TPM&quot;, xlab=&quot;Refseq Gene&quot;)
points(sort(log10(refPeakandRNA_noPeakw_withO_TPM$TPM), decreasing = T), col=&quot;Red&quot;)
legend(&quot;topright&quot;, legend=c(&quot;All Genes&quot;, &quot;Genes without Peak&quot;), col=c(&quot;black&quot;, &quot;red&quot;),pch=19)</code></pre>
<p><img src="figure/dataprocfigures.Rmd/unnamed-chunk-30-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>I can use this to look at some of the highest expressed genes that we do not have peaks for.</p>
<ul>
<li><p>HIST2H2AA4: no coverage at location</p></li>
<li><p>HIST1H2AC: no coverage at location</p></li>
<li><p>BOP1: Not in the protein coding gene file. Are 2 peaks.</p></li>
<li><p>GSTM1: no coverage at location</p></li>
<li><p>NPIPA1: no coverage at location</p></li>
<li><p>SLX1A: difficult to interpret due to overlapping genes in the region</p></li>
<li><p>HIST1H2BJ: no coverage at location</p></li>
<li><p>MTX1: peak in the original filtered peaks, not in the refseq gene - lost due to direction, the peak goes the same was as the gene. probably noise</p></li>
<li><p>GALE - looks like there is a peak but we are not detecting it. May be too close to the next peak at the 3’ end of LYPLA2 gene.</p></li>
<li><p>HGH1: no coverage at location</p></li>
<li><p>MSMP: difficult to interpret due to overlapping genes in the region</p></li>
</ul>
</div>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
 [2] GenomicFeatures_1.32.2                 
 [3] AnnotationDbi_1.42.1                   
 [4] Biobase_2.40.0                         
 [5] GenomicRanges_1.32.6                   
 [6] GenomeInfoDb_1.16.0                    
 [7] IRanges_2.14.11                        
 [8] S4Vectors_0.18.3                       
 [9] BiocGenerics_0.26.0                    
[10] tximport_1.8.0                         
[11] bindrcpp_0.2.2                         
[12] cowplot_0.9.3                          
[13] reshape2_1.4.3                         
[14] workflowr_1.1.1                        
[15] forcats_0.3.0                          
[16] stringr_1.3.1                          
[17] dplyr_0.7.6                            
[18] purrr_0.2.5                            
[19] readr_1.1.1                            
[20] tidyr_0.8.1                            
[21] tibble_1.4.2                           
[22] ggplot2_3.0.0                          
[23] tidyverse_1.2.1                        

loaded via a namespace (and not attached):
 [1] nlme_3.1-137                matrixStats_0.54.0         
 [3] bitops_1.0-6                lubridate_1.7.4            
 [5] bit64_0.9-7                 RColorBrewer_1.1-2         
 [7] progress_1.2.0              httr_1.3.1                 
 [9] rprojroot_1.3-2             tools_3.5.1                
[11] backports_1.1.2             R6_2.2.2                   
[13] DBI_1.0.0                   lazyeval_0.2.1             
[15] colorspace_1.3-2            withr_2.1.2                
[17] tidyselect_0.2.4            prettyunits_1.0.2          
[19] bit_1.1-14                  compiler_3.5.1             
[21] git2r_0.23.0                cli_1.0.0                  
[23] rvest_0.3.2                 xml2_1.2.0                 
[25] DelayedArray_0.6.5          rtracklayer_1.40.6         
[27] labeling_0.3                scales_1.0.0               
[29] digest_0.6.16               Rsamtools_1.32.3           
[31] rmarkdown_1.10              R.utils_2.7.0              
[33] XVector_0.20.0              pkgconfig_2.0.2            
[35] htmltools_0.3.6             rlang_0.2.2                
[37] readxl_1.1.0                rstudioapi_0.7             
[39] RSQLite_2.1.1               bindr_0.1.1                
[41] jsonlite_1.5                BiocParallel_1.14.2        
[43] R.oo_1.22.0                 RCurl_1.95-4.11            
[45] magrittr_1.5                GenomeInfoDbData_1.1.0     
[47] Matrix_1.2-14               Rcpp_0.12.18               
[49] munsell_0.5.0               R.methodsS3_1.7.1          
[51] stringi_1.2.4               whisker_0.3-2              
[53] yaml_2.2.0                  SummarizedExperiment_1.10.1
[55] zlibbioc_1.26.0             plyr_1.8.4                 
[57] grid_3.5.1                  blob_1.1.1                 
[59] crayon_1.3.4                lattice_0.20-35            
[61] Biostrings_2.48.0           haven_1.1.2                
[63] hms_0.4.2                   knitr_1.20                 
[65] pillar_1.3.0                biomaRt_2.36.1             
[67] XML_3.98-1.16               glue_1.3.0                 
[69] evaluate_0.11               modelr_0.1.2               
[71] cellranger_1.1.0            gtable_0.2.0               
[73] assertthat_0.2.0            broom_0.5.0                
[75] GenomicAlignments_1.16.0    memoise_1.1.0              </code></pre>
</div>

<hr>
<p>
    
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->
<!-- Adjust MathJax settings so that all math formulae are shown using
TeX fonts only; see
http://docs.mathjax.org/en/latest/configuration.html.  This will make
the presentation more consistent at the cost of the webpage sometimes
taking slightly longer to load. Note that this only works because the
footer is added to webpages before the MathJax javascript. -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>

<hr>
<p>
  This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a>
  analysis was created with
  <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1
</p>
<hr>


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>