<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />

<meta name="date" content="2018-03-02" />

<title>Complete Blacklist</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Net-seq</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
<li>
  <a href="license.html">License</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/brimittleman/Net-seq">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Complete Blacklist</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>2018-03-02</em></h4>

</div>


<!-- The file analysis/chunks.R contains chunks that define default settings
shared across the workflowr files. -->
<!-- Update knitr chunk options -->
<!-- Insert the date the file was last updated -->
<p><strong>Last updated:</strong> 2018-03-10</p>
<!-- Insert the code version (Git commit SHA1) if Git repository exists and R
 package git2r is installed -->
<p><strong>Code version:</strong> fdba2be</p>
<!-- Add your analysis here -->
<p>This analysis will allow me assess the places in the data that account for the read pileup. I will look for ribosomal, snoRNA, snRNA, hNRP genes. I will then look at highly expressed genes that we would not expect high expression/pileup such as insig1 (found in net3 exploration). I will remove reads at all of these locations by creating a blacklist of sequences to filter the fastq files.</p>
<div id="gene-expression-analysis" class="section level2">
<h2>Gene expression analysis</h2>
<div id="insig2" class="section level4">
<h4>Insig2</h4>
<p>First I am subsetting the genome coverage file for the 2nd chromosome. This is where insig2 is. I can use this to look at the distribution.</p>
<pre class="bash"><code>awk &#39;{ if ($1 = 2) { print } }&#39; YG-SP-NET3-18486_combined_Netpilot-sort.dedup.cov.bed  &gt; YG-SP-NET3-18486_combined_Netpilot-sort.dedup.cov.chr2.bed   


awk &#39;{ if ($1 = 2) { print } }&#39; YG-SP-NET3-18486_combined_Netpilot-sort.cov.bed  &gt; YG-SP-NET3-18486_combined_Netpilot-sort.cov.chr2.bed   

awk &#39;{ if ($2 &gt; 118846028 &amp;&amp; $2 &lt; 118868573) { print }}&#39; YG-SP-NET3-18486_combined_Netpilot-sort.cov.chr2.bed &gt;  YG-SP-NET3-18486_combined_Netpilot-sort.cov.insig2.bed</code></pre>
<p>Pull in the insig2:</p>
<pre class="r"><code>insig2=read.table(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.insig2.bed&quot;, header=FALSE)
plot(insig2$V3 ~ insig2$V2, ylab=&quot;Read count&quot;, xlab=&quot;Chrom 2 position&quot;, main=&quot;Genome Coverage at Ingsig2 gene&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-2-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>Do this for the same line but deduplicated:</p>
<pre class="bash"><code>awk &#39;{ if ($2 &gt; 118846028 &amp;&amp; $2 &lt; 118868573) { print }}&#39; YG-SP-NET3-18486_combined_Netpilot-sort.dedup.cov.chr2.bed &gt;  YG-SP-NET3-18486_combined_Netpilot-sort.dedup.cov.insig2.bed
</code></pre>
<pre class="r"><code>insig2_de=read.table(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.dedup.cov.insig2.bed&quot;, header=FALSE)
plot(insig2_de$V3 ~ insig2_de$V2, ylab=&quot;Molecule count&quot;, xlab=&quot;Chrom 2 position&quot;, main=&quot;Genome Coverage at Ingsig2 gene (deduplicated)&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-4-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>max_sort=max(insig2$V3)
max_sort_de=max(insig2_de$V3)
1- max_sort_de/max_sort</code></pre>
<pre><code>[1] 0.9864162</code></pre>
<p>This means that the deduplication removed 99 percent of the buildup but the peak is still there.</p>
<p>Try this with ggplot.</p>
<pre class="r"><code>library(ggplot2)
colnames(insig2)=c(&quot;chr&quot;, &quot;pos&quot;, &quot;count&quot;)
denstiy_plot= ggplot(data=insig2, aes(y=count, x=pos)) + geom_line(aes(x=pos,y=count))
denstiy_plot</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-6-1.png" width="672" style="display: block; margin: auto;" /></p>
</div>
<div id="expand-to-more-genes" class="section level4">
<h4>Expand to more genes</h4>
<p>How to make this more efficent:<br />
* Look at the top genes<br />
* create a script where I can enter the positions and get out the gene coverage by base (need gene name, chrom, start, end) * pull it into R and plot like this</p>
<p>To do this I need to use bedtools coverage -counts</p>
<p>Copy the gencode gene file /project2/gilad/briana/Net-seq/Net-seq3/gencode_noCHR_genes_MT_Fsort.bed to genome annotation.</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=count_cov
#SBATCH --output=count_cov_sbatch.out
#SBATCH --error=count_cov_sbatch.err
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=36G
#SBATCH --mail-type=END

module load Anaconda3  

source activate net-seq 


sample=$1

describer=$(echo ${sample} | sed -e &#39;s/.*\YG-SP-//&#39; | sed -e &quot;s/_combined_Netpilot-sort.bed$//&quot;)

bedtools coverage -counts -sorted -a /project2/gilad/briana/genome_anotation_data/gencode_noCHR_genes_MT_Fsort.bed -b $1 &gt; /project2/gilad/briana/Net-seq-pilot/data/gene_cov/${describer}.gene.coverage.bed</code></pre>
<p>Run this first for /project2/gilad/briana/Net-seq-pilot/data/gene_cov/YG-SP-NET3-18486_combined_Netpilot-sort.bed</p>
<pre class="r"><code>gene_cov_18486= read.table(&quot;../data/NET3-18486.gene.coverage.bed&quot;)
colnames(gene_cov_18486)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)</code></pre>
<pre class="r"><code>summary(gene_cov_18486$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0       14     1665       65 31156619 </code></pre>
<pre class="r"><code>gene_cov_18486_sort= gene_cov_18486[order(gene_cov_18486$count, decreasing = TRUE),]
plot(log10(gene_cov_18486_sort$count))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-9-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>gene_cov_18486_sort[1:15,]</code></pre>
<pre><code>      chr     start       end              gene score strand    count
12688   2 118846049 118867604 ENST00000245787.4     0      + 31156619
12689   2 118846049 118868573 ENST00000485520.1     0      + 31156619
12690   2 118889703 118943962 ENST00000414886.1     0      - 30589292
5265    1 148604907 148605072 ENST00000384476.1     0      -  2118514
71571  15  65597014  65597130 ENST00000363286.1     0      +  1738588
71568  15  65558914  65592956 ENST00000558873.1     0      -  1220341
71570  15  65588388  65588504 ENST00000362698.1     0      +  1219665
52960  10 103113819 103317078 ENST00000370187.3     0      +  1198878
52961  10 103113858 103317054 ENST00000393441.4     0      +  1198862
52962  10 103113863 103317078 ENST00000408038.2     0      +  1198862
52963  10 103124601 103124792 ENST00000410482.1     0      -  1198755
5276    1 149224057 149224221 ENST00000384010.1     0      -  1022753
23151   4  76781024  76823681 ENST00000286719.7     0      -   935122
56403  11  62600383  62609281 ENST00000525239.1     0      -   813933
56407  11  62609090  62609281 ENST00000410396.1     0      -   813798</code></pre>
<p>Many of these are SnRNAs. Getting rid of these should help.</p>
<ol style="list-style-type: decimal">
<li>Insig2<br />
</li>
<li>(lincRNA)- AC093901.1- CCAGGGAA(+) so - strand is GGTCCCTT</li>
<li>RNU5B (SnRNA) ENST00000363286.1</li>
<li>RNVU1 (SnRNA)<br />
</li>
<li>RNU5A (SnRNA)<br />
</li>
<li>BTRC</li>
</ol>
<p>Make the bash script</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=gene_cov
#SBATCH --time=8:00:00
#SBATCH --output=gene_cov_sbatch.out
#SBATCH --error=gene_cov_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=10G
#SBATCH --mail-type=END

#script takes in the chr, start, end, and gene name. It will output a 
chr=$1
start=$2
end=$3
name=$4


awk -v var=${chr} &#39;{ if ($1 = var) { print }}&#39; /project2/gilad/briana/Net-seq-pilot/data/cov/YG-SP-NET3-18486_combined_Netpilot-sort.cov.bed  &gt; temp

awk -v var1=${start} -v var2=${end} &#39;{ if ($2 &gt; var1 &amp;&amp; $2 &lt; var2) { print }}&#39; -v var1=${start} -v var2=${end} temp &gt;  /project2/gilad/briana/Net-seq-pilot/output/high_gene_cov/YG-SP-NET3-18486_combined_Netpilot-sort.cov.${name}.bed</code></pre>
<p>Rfunction to make plot:</p>
<pre class="r"><code>plot_gene_dis &lt;- function(file, chr, geneName){
  gene &lt;- read.table(file, header=FALSE)
  colnames(gene)=c(&quot;chr&quot;, &quot;pos&quot;, &quot;count&quot;)
  plt_gene = ggplot(data=gene) + geom_line(aes(x=pos,y=count)) + ggtitle(paste(&quot;Genome coverage at &quot;,geneName)) + xlab(paste(&quot;Chrom &quot;,chr ,&quot; postion&quot;)) + ylab(&quot;read count&quot;)
  return(plt_gene)
}</code></pre>
<p>sbatch high_ex_gene_cov.sh ‘1’ ‘148604907’ ‘148605072’ ‘AC093901’</p>
<pre class="r"><code>aco_plt=plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.AC093901.bed&quot;, &quot;1&quot;, &quot;ACO93901&quot;)
aco_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-12-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>sbatch high_ex_gene_cov.sh ‘15’ ‘65597014’ ‘65597130’ ‘RNU5B’</p>
<pre class="r"><code>RNU5B_plt=plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.RNU5B.bed&quot;, &quot;15&quot;, &quot;RNU5B&quot;)

RNU5B_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-13-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>RNU5B=read.table(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.RNU5B.bed&quot;, header=FALSE, col.names = c(&quot;chr&quot;, &quot;pos&quot;, &quot;count&quot;))</code></pre>
<p>sbatch high_ex_gene_cov.sh ‘10’ ‘103113819’ ‘103317078’ ‘BTRC’</p>
<pre class="r"><code>btrc_plt= plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.BTRC.bed&quot;, &quot;10&quot;, &quot;BTRC&quot;)
btrc_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-14-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>btrc=read.table(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.BTRC.bed&quot;, header=FALSE)
colnames(btrc)= c(&quot;chr&quot;, &quot;pos&quot;, &quot;count&quot;)
plot(btrc$count~btrc$pos)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-14-2.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>btrc_sort=btrc[order(btrc$count, decreasing=TRUE),]
btrc_sort[1:20,]</code></pre>
<pre><code>        chr       pos   count
1840110  10 103124607 1064950
1840111  10 103124608  111925
1840108  10 103124605   10272
1840112  10 103124609    4733
1840109  10 103124606    1846
1840113  10 103124610    1693
1840114  10 103124611     967
1840115  10 103124612     835
1840116  10 103124613     698
1840254  10 103124751     114
1840117  10 103124614      79
1840159  10 103124656      78
1840165  10 103124662      77
1840158  10 103124655      72
1809410  10 103297165      56
841530   10 103142317      49
1840154  10 103124651      47
1610687  10 103301700      39
1840157  10 103124654      32
1840152  10 103124649      30</code></pre>
<p>sbatch high_ex_gene_cov.sh ‘10’ ‘103124601’ ‘103124792’ ‘rnu259p’</p>
<pre class="r"><code>rnu259_plt=plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.rnu259p.bed&quot;, &quot;10&quot;, &quot;rnu259p&quot;)
rnu259_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-15-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>sbatch high_ex_gene_cov.sh ‘4’ ‘76781024’ ‘76823681’ ‘ppef2’</p>
<pre class="r"><code>ppef2_plt= plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.ppef2.bed&quot;, &quot;4&quot;, &quot;ppef2&quot;)
ppef2_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-16-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>sbatch high_ex_gene_cov.sh ‘11’ ‘62600383’ ‘62609281’ ‘WDR74’ 62600383 62609281</p>
<pre class="r"><code>WDR74_plt= plot_gene_dis(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.WDR74.bed&quot;, &quot;11&quot;, &quot;WDR74&quot;)
WDR74_plt</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-17-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>wd74=read.table(&quot;../data/YG-SP-NET3-18486_combined_Netpilot-sort.cov.WDR74.bed&quot;)
colnames(wd74)=c(&quot;chr&quot;, &quot;pos&quot;, &quot;count&quot;)
summary(wd74$V3)</code></pre>
<pre><code>Length  Class   Mode 
     0   NULL   NULL </code></pre>
</div>
<div id="summary-stat-for-buildup" class="section level4">
<h4>Summary stat for buildup</h4>
<p>Think for a summary statistic for these genes. Maybe the top position over the sum of the gene standardized by length?</p>
<pre class="r"><code>buildup_test_stat=function(df){
  length=nrow(df)
  x=df$count/length
  max=max(x)
  teststat= max/sum(x)
  return(teststat)
}


buildup_test_stat(insig2)</code></pre>
<pre><code>[1] 0.3864875</code></pre>
<pre class="r"><code>buildup_test_stat(btrc)</code></pre>
<pre><code>[1] 0.8870532</code></pre>
<pre class="r"><code>buildup_test_stat(RNU5B)</code></pre>
<pre><code>[1] 0.3864984</code></pre>
<pre class="r"><code>buildup_test_stat(wd74)</code></pre>
<pre><code>[1] 0.4801532</code></pre>
</div>
<div id="extend-to-second-sample" class="section level4">
<h4>Extend to second sample</h4>
<p>Extend analysis to 1 more line to make sure the top genes are the same:</p>
<pre class="r"><code>gene_cov_18505= read.table(&quot;../data/NET3-18505.gene.coverage.bed&quot;)
colnames(gene_cov_18505)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)</code></pre>
<pre class="r"><code>summary(gene_cov_18505$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0       17     1290       76 17872558 </code></pre>
<pre class="r"><code>gene_cov_18505_sort= gene_cov_18505[order(gene_cov_18505$count, decreasing = TRUE),]
plot(log10(gene_cov_18505_sort$count))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-20-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>gene_cov_18505_sort[1:15,]</code></pre>
<pre><code>      chr     start       end              gene score strand    count
12688   2 118846049 118867604 ENST00000245787.4     0      + 17872558
12689   2 118846049 118868573 ENST00000485520.1     0      + 17872558
12690   2 118889703 118943962 ENST00000414886.1     0      - 16850121
5265    1 148604907 148605072 ENST00000384476.1     0      -  2615409
71571  15  65597014  65597130 ENST00000363286.1     0      +  2275911
23151   4  76781024  76823681 ENST00000286719.7     0      -  1616087
71568  15  65558914  65592956 ENST00000558873.1     0      -  1415228
71570  15  65588388  65588504 ENST00000362698.1     0      +  1414560
52960  10 103113819 103317078 ENST00000370187.3     0      +  1161216
52961  10 103113858 103317054 ENST00000393441.4     0      +  1161216
52962  10 103113863 103317078 ENST00000408038.2     0      +  1161216
52963  10 103124601 103124792 ENST00000410482.1     0      -  1161056
56403  11  62600383  62609281 ENST00000525239.1     0      -   714781
56407  11  62609090  62609281 ENST00000410396.1     0      -   714614
88957  19  49990810  49995565 ENST00000391857.4     0      +   626045</code></pre>
</div>
<div id="compare-to-rna-seq-data" class="section level3">
<h3>Compare to RNA seq data:</h3>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=RNA_count_cov
#SBATCH --output=RNA_count_cov_sbatch.out
#SBATCH --error=RNA_count_cov_sbatch.err
#SBATCH --time=8:00:00
#SBATCH --partition=bigmem2
#SBATCH --mem=60G
#SBATCH --mail-type=END

module load Anaconda3  

source activate net-seq 

bedtools coverage -counts -sorted  -a gencode.Genes.sort.bed -b /project2/gilad/yangili/LCLs/bams/RNAseqGeuvadis_STAR_18486.final.bam &gt; /project2/gilad/briana/Net-seq-pilot/data/RNA_seq_cov/RNAseqGeuvadis_STAR_18486.gene.coverage.bed</code></pre>
<p>Sort the file with bedtools sort -faidx using names.txt in this code directory.</p>
<pre class="r"><code>rna_seq=read.table(&quot;../data/RNAseqGeuvadis_STAR_18486.gene.coverage.bed&quot;, header = FALSE)
rna_seq_genecounts= rna_seq[,1:7]
colnames(rna_seq_genecounts)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)

rna_seq_order=rna_seq_genecounts[order(rna_seq_genecounts$count, decreasing = TRUE),]
plot(log10(rna_seq_order$count))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-22-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>plot(log10(gene_cov_18486_sort$count))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-22-2.png" width="672" style="display: block; margin: auto;" /> Plot the rank of the genes against each other.</p>
<pre class="r"><code>plot(log10(rna_seq_order$count)~log10(gene_cov_18486_sort$count))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-23-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>top5000_rna=rna_seq_order[1:5000,]
top5000_net=gene_cov_18486_sort[1:5000,]
plot(log10(top5000_rna$count)~log10(top5000_net$count), xlab=&quot;log 10 Netseq&quot;, ylab=&quot;log10 RNA seq&quot;, main=&quot;Top 5000 Ranked expression vs Netseq counts for 18486&quot;)
abline(0,1)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-23-2.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>cor(rna_seq_order$count,gene_cov_18486_sort$count)</code></pre>
<pre><code>[1] 0.02982312</code></pre>
<pre class="r"><code>cor(top5000_rna$count, top5000_net$count)</code></pre>
<pre><code>[1] 0.07747795</code></pre>
<p>How many genes are non zero for netseq?</p>
<pre class="r"><code>gene_cov_18486_sort_non0= gene_cov_18486_sort[gene_cov_18486_sort$count != 0,]
nrow(gene_cov_18486_sort_non0)</code></pre>
<pre><code>[1] 74475</code></pre>
<p>Look at the reads directly downstream of the tss in the net seq data. Negative strand we want 500 upstream of end to end and for pos strand we want start to 500 ds of start.</p>
<pre class="bash"><code> less gencode_noCHR_genes_MT_Fsort.bed  | awk &#39;{if($6 == &quot;+&quot;) print($1 &quot;\t&quot; $2 &quot;\t&quot; $2 + 500 &quot;\t&quot; $4 &quot;\t&quot; $5 &quot;\t&quot; $6); else print($1 &quot;\t&quot; $3 - 500 &quot;\t&quot; $3 &quot;\t&quot; $4 &quot;\t&quot; $5 &quot;\t&quot; $6)}&#39; &gt; gencode_noCHR_genes_MT_Fsort_tss.bed
</code></pre>
<p>Script for getting coverage in this file is /project2/gilad/briana/Net-seq-pilot/code/gene_cov_tss.sh</p>
<pre class="r"><code>library(dplyr)</code></pre>
<pre><code>
Attaching package: &#39;dplyr&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:stats&#39;:

    filter, lag</code></pre>
<pre><code>The following objects are masked from &#39;package:base&#39;:

    intersect, setdiff, setequal, union</code></pre>
<pre class="r"><code>tss_18486=read.table(&quot;../data/NET3-18486.tss.coverage.bed&quot;, header=FALSE)
colnames(tss_18486)=c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)
tss_18486_st = tss_18486 %&gt;% mutate(., st_count=count/500)
rna_seq_genecounts_st = rna_seq_genecounts %&gt;% mutate(., st_count=count/(end-start))</code></pre>
<p>Order both by standard counts:</p>
<pre class="r"><code>tss_18486_st_order=tss_18486_st[order(tss_18486_st$st_count, decreasing = TRUE),]
rna_seq_genecounts_st_order=rna_seq_genecounts_st[order(rna_seq_genecounts_st$st_count, decreasing = TRUE),]</code></pre>
<p>Correlation:</p>
<pre class="r"><code>cor(rna_seq_genecounts_st_order$st_count, tss_18486_st_order$st_count )</code></pre>
<pre><code>[1] 0.1449201</code></pre>
<pre class="r"><code>plot(rna_seq_genecounts_st_order$st_count~tss_18486_st_order$st_count, xlab=&quot;standardized Netseq at TSS&quot;, ylab=&quot;standardized RNA seq&quot;, main=&quot;Standardized expression vs Netseq standardized TSS counts for 18486&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-28-1.png" width="672" style="display: block; margin: auto;" /> ###Bin genome coverage file into 200bp windows<br />
I already have genome coverage file and I can use bedtools make windows function. This script takes one of the genome coverage bed files and a bed file with 200bp windows. The script is /project2/gilad/briana/Net-seq-pilot/code/window_200_cov.sh</p>
<p><strong>the coverge file is not a bed file because it does not have a start and end. I need to use awk to make it a bedfile</strong></p>
<pre class="bash"><code>less YG-SP-NET3-18486_combined_Netpilot-sort.cov.bed | awk &#39;{print($1 &quot;\t&quot; $2 &quot;\t&quot; $2 &quot;\t&quot; $3)}&#39; &gt; YG-SP-NET3-18486_combined_Netpilot-sort.cov.fixed.bed</code></pre>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=200_wind
#SBATCH --time=8:00:00
#SBATCH --output=window_sbatch.out
#SBATCH --error=window_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=10G
#SBATCH --mail-type=END

module load Anaconda3  

source activate net-seq 


sample=$1

describer=$(echo ${sample} | sed -e &#39;s/.*\YG-SP-//&#39;  | sed -e &quot;/_combined_Netpilot-sort.cov.fixed.bed$//&quot;)



bedtools makewindows -b $1 -w 200 &gt; /project2/gilad/briana/Net-seq-pilot/data/200wind_cov/${describer}_combined_Netpilot-sort.200.cov.bed
</code></pre>
<p>/project2/gilad/briana/Net-seq-pilot/data/cov/YG-SP-NET3-18486_combined_Netpilot-sort.cov.fixed.bed</p>
<pre class="bash"><code>less YG-SP-NET3-18486_combined_Netpilot-sort.cov.fixed.bed | awk &#39;{print($1 &quot;\t&quot; $2 &quot;\t&quot; $2 + 1 )}&#39; &gt; /project2/gilad/briana/genome_anotation_data/genome_1bp_windows 
</code></pre>
<p>New idea is do this in 2 steps: make the windows then do coverage</p>
<p>This script is /project2/gilad/briana/Net-seq-pilot/code/window_200_cov2.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=200_wind
#SBATCH --time=8:00:00
#SBATCH --output=window_sbatch.out
#SBATCH --error=window_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=20G
#SBATCH --mail-type=END

module load Anaconda3  

source activate net-seq 

#input is a bed file 
sample=$1


describer=$(echo ${sample} | sed -e &#39;s/.*\YG-SP-//&#39; | sed -e &quot;s/_combined_Netpilot-sort.bed$//&quot;)


bedtools makewindows -b /project2/gilad/briana/genome_anotation_data/genome_1bp_windows  -w 200 &gt; /project2/gilad/briana/genome_anotation_data/genome_200bp_windows

bedtools coverage -counts -sorted -a  /project2/gilad/briana/genome_anotation_data/genome_200bp_windows  -a $1 &gt; /project2/gilad/briana/Net-seq-pilot/data/200wind_cov/${describer}_combined_Netpilot-sort.200.cov.bed
</code></pre>
<p>sbatch window_200_cov2.sh /project2/gilad/briana/Net-seq-pilot/data/bed/YG-SP-NET3-18486_combined_Netpilot-sort.bed</p>
<p>Create the annotation file my self in python:</p>
<p>File with chromosome lengths: hg19.chrlen.txt</p>
<pre class="python"><code>import pandas as pd 
#FIX with NO HEADER!!
chr_length= pd.read_table(&quot;/project2/gilad/briana/Net-seq-pilot/code/hg19.chrlen.txt&quot;, header=None)
bed_list=[]
for ind, row in chr_length.iterrows():
    x=chr_length.iloc[ind,1]
    chr=chr_length.iloc[ind,0]
    for i in range(0, x, 200):
        bed_list.append([chr, i, i+200])
bed_df=pd.DataFrame(bed_list)
bed_df.to_csv(&#39;/project2/gilad/briana/Net-seq-pilot/code/genome_200_wind.bed&#39;, sep=&quot;\t&quot;)</code></pre>
<p>use awk to get rid of the first line and the first column</p>
<pre class="bash"><code>less genome_200_wind.bed | awk &#39;NR &gt;=2 {print($2 &quot;\t&quot; $3 &quot;\t&quot; $4)}&#39; &gt; genome_200_wind_fix.bed </code></pre>
<p>Now make a feature counts script:</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=fc_200
#SBATCH --time=8:00:00
#SBATCH --output=fc_200.out
#SBATCH --error=fc_200.err
#SBATCH --partition=broadwl
#SBATCH --mem=20G
#SBATCH --mail-type=END

module load Anaconda3  

source activate net-seq 

#input is a bed file 
sample=$1


describer=$(echo ${sample} | sed -e &#39;s/.*\YG-SP-//&#39; | sed -e &quot;s/_combined_Netpilot-sort.bam$//&quot;)


featureCounts -T 5 -a /project2/gilad/briana/Net-seq-pilot/code/genome_200_wind_fix.saf2 -F &#39;SAF&#39; -o /project2/gilad/briana/Net-seq-pilot/data/200wind_cov/${describer}_combined_Netpilot-sort.FC200.cov.bed $1
</code></pre>
<p>imput file /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-18486_combined_Netpilot-sort.bam</p>
<p>awk to make it a gtf file</p>
<pre class="bash"><code>less genome_200_wind.bed | awk &#39;NR &gt;=2 {print(&quot;exon&quot; &quot;\t&quot; $2 &quot;\t&quot; $3 &quot;\t&quot; $4 &quot;\t&quot; &quot;+&quot;)}&#39; &gt; genome_200_wind_fix.saf</code></pre>
<p><strong>The working code is the featureCounts version</strong></p>
<p>I have to give each of the bins a different name:</p>
<p>there are 14439596 bins</p>
<pre class="bash"><code>

awk &#39;{ printf (&quot;%.6d %s\n&quot;, NR, $0) }&#39; genome_200_wind_fix.saf |awk &#39;{print $1 &quot;\t&quot; $3 &quot;\t&quot; $4 &quot;\t&quot; $5 &quot;\t&quot; $6}&#39;&gt; genome_200_wind_fix2.saf
</code></pre>
<p>Fix the first column name. It starts at 000002.</p>
<pre class="r"><code>gen_wind200_no0=read.table(&#39;../data/NET3-18486_combined_Netpilot-sort.FC200.cov.no0.bed&#39;, header=TRUE, stringsAsFactors = FALSE, col.names = c(&#39;Geneid&#39;, &#39;Chr&#39;, &#39;Start&#39;, &#39;End&#39;, &#39;Strand&#39;, &#39;Length&#39;, &#39;Counts&#39;), na.strings = &quot;NA&quot;)

gen_wind200_no0_order=gen_wind200_no0[order(gen_wind200_no0$Counts, decreasing=TRUE),]

summary(gen_wind200_no0_order$Counts)</code></pre>
<pre><code>     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
      1.0       1.0       2.0      66.8       4.0 2118483.0 </code></pre>
<pre class="r"><code>plot(gen_wind200_no0_order$Counts,  ylab=&#39;read count&#39;, xlab=&quot;Bin index&quot;, main=&quot;Sorted counts per 200bp bin without 0 bins&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-38-1.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>plot(log10(gen_wind200_no0_order$Counts),  ylab=&#39;log10 read count&#39;, xlab=&quot;Bin index&quot;, main=&quot;Sorted counts per 200bp bin without 0 bins&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-38-2.png" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>x=gen_wind200_no0_order$Counts[1: 23098]

plot(log10(x),  ylab=&#39;log10 read count&#39;, xlab=&quot;Bin index&quot;, main=&quot;Sorted counts per 200bp, top 5%&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-38-3.png" width="672" style="display: block; margin: auto;" /></p>
<p>Subset the top 5% of bins:</p>
<pre class="r"><code>top5_gen_wind200_no0_order=gen_wind200_no0_order[1:23098,]

head(top5_gen_wind200_no0_order)</code></pre>
<pre><code>         Geneid   Chr     Start       End Strand Length  Counts
23476    743025  chr1 148604800 148605000      +    201 2118483
344025 12041897 chr15  65597000  65597200      +    201 1738589
344016 12041854 chr15  65588400  65588600      +    201 1218950
229721  8266594  chr9  79186800  79187000      +    201 1205170
257869  9094769 chr10 103124600 103124800      +    201 1198748
23521    746121  chr1 149224000 149224200      +    201 1022751</code></pre>
<pre class="r"><code>top5_gen_wind200.bed=top5_gen_wind200_no0_order[,2:5]

write.csv(top5_gen_wind200.bed, file = &quot;../data/top5_gen_wind200.bed&quot;, row.names = FALSE, quote = FALSE)</code></pre>
<p>I need to create a sorted bed file out of this so I can exclude these regions.</p>
<pre class="bash"><code>cat top5_gen_wind200.bed | tr &#39;,&#39; &#39;\t&#39; &gt; top5_gen_wind200.tab.bed </code></pre>
<p>Put in: /project2/gilad/briana/genome_anotation_data</p>
<pre class="bash"><code>cat top5_gen_wind200.tab.bed |sed &#39;s/^chr//&#39; &gt; top5_gen_wind200.tab.nochr.bed
#remove header in vi
bedtools sort -faidx names.txt -i top5_gen_wind200.tab.nochr.bed &gt; top5_gen_wind200.tab.nochr.sort.bed</code></pre>
<p>Now use intersect to remove genes including these.</p>
<p>/project2/gilad/briana/Net-seq-pilot/code/int.topwind.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=top_intersect
#SBATCH --time=8:00:00
#SBATCH --output=top_int_sbatch.out
#SBATCH --error=top_int_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=10G
#SBATCH --mail-type=END


module load Anaconda3  

source activate net-seq 


sample=$1

describer=$(echo ${sample} | sed -e &#39;s/.*\gene_cov\///&#39; | sed -e &quot;s/.gene.coverage.bed$//&quot;)


bedtools intersect -v -wa -a $1 -b /project2/gilad/briana/genome_anotation_data/top5_gen_wind200.tab.nochr.sort.bed  &gt; /project2/gilad/briana/Net-seq-pilot/data/gene_cov/${describer}.gene.coverage.notopwind.bed
</code></pre>
<p>run with /project2/gilad/briana/Net-seq-pilot/data/gene_cov/NET3-18486.gene.coverage.bed</p>
<pre class="r"><code>gene_cov_18486_notoo=read.table(&quot;../data/NET3-18486.gene.coverage.notopwind.bed&quot;)
colnames(gene_cov_18486_notoo)=c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)
gene_cov_18486_notoo_st= gene_cov_18486_notoo %&gt;% mutate(., st_count=count/(end-start))
gene_cov_18486_notoo_st_order=gene_cov_18486_notoo_st[order(gene_cov_18486_notoo_st$st_count, decreasing=TRUE),]</code></pre>
<p>Run this for all lines to see how similar the top bins are:</p>
<p>The script is FT_200_cov.sh, run on:</p>
<ul>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-18505_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-18508_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-19128_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-19141_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-19193_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-19239_combined_Netpilot-sort.bam</p></li>
<li><p>sbatch FT_200_cov.sh /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-19257_combined_Netpilot-sort.bam</p></li>
</ul>
<p>Remove bins with 0s.</p>
<pre class="bash"><code>awk &#39;{if ($7 != 0) print}&#39; file &gt; file.no0



awk &#39;{if ($7 != 0) print}&#39; NET3-18508_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-18508_combined_Netpilot-sort.FC200.cov.no0.bed
awk &#39;{if ($7 != 0) print}&#39; NET3-19128_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-19128_combined_Netpilot-sort.FC200.cov.no0.bed
awk &#39;{if ($7 != 0) print}&#39; NET3-19141_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-19141_combined_Netpilot-sort.FC200.cov.no0.bed
awk &#39;{if ($7 != 0) print}&#39; NET3-19193_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-19193_combined_Netpilot-sort.FC200.cov.no0.bed
awk &#39;{if ($7 != 0) print}&#39; NET3-19239_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-19239_combined_Netpilot-sort.FC200.cov.no0.bed
awk &#39;{if ($7 != 0) print}&#39; NET3-19257_combined_Netpilot-sort.FC200.cov.bed  &gt; NET3-19257_combined_Netpilot-sort.FC200.cov.no0.bed</code></pre>
<pre class="r"><code>top_5_wind=function(file){
  wind200_no0=read.table(file, header=TRUE, stringsAsFactors = FALSE, col.names = c(&#39;Geneid&#39;, &#39;Chr&#39;, &#39;Start&#39;, &#39;End&#39;, &#39;Strand&#39;, &#39;Length&#39;, &#39;Counts&#39;), na.strings = &quot;NA&quot;)
  wind200_no0_order=wind200_no0[order(wind200_no0$Counts, decreasing=TRUE),]
  x=.5*nrow(wind200_no0_order)
  top5_wind200_no0_order=wind200_no0_order[1:x,]
  return(top5_wind200_no0_order$Geneid)
}


#test=top_5_wind(&#39;../data/NET3-18486_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)</code></pre>
<pre class="r"><code>top5_bin_18486=top_5_wind(&#39;../data/NET3-18486_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_18505=top_5_wind(&#39;../data/NET3-18505_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_18508=top_5_wind(&#39;../data/NET3-18508_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_19128=top_5_wind(&#39;../data/NET3-19128_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_19141=top_5_wind(&#39;../data/NET3-19141_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_19193=top_5_wind(&#39;../data/NET3-19193_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_19239=top_5_wind(&#39;../data/NET3-19239_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)
top5_bin_19257=top_5_wind(&#39;../data/NET3-19257_combined_Netpilot-sort.FC200.cov.no0.bed&#39;)</code></pre>
<div id="look-for-snorna-snrnahnrp" class="section level4">
<h4>Look for snoRNA, SNRNA,hNRP</h4>
<p>Make files in the genome annotation file.</p>
<pre class="bash"><code>less gencode.v19.annotation.gtf | tr &quot;\&quot;&quot; &quot;\t&quot; | awk &#39;$3 == &quot;gene&quot;&#39; | awk &#39;{print $16}&#39; | sort | uniq &gt; gene.type.txt </code></pre>
<p>Both snRNA and snoRNA are in this file. I will make a seperate bed file for each of these:</p>
<pre class="bash"><code>less gencode.v19.annotation.gtf | tr &quot;\&quot;&quot; &quot;\t&quot; | awk &#39;$3 == &quot;gene&quot;&#39; | awk &#39;$16 == &quot;snRNA&quot;&#39; |  awk &#39;{print($1 &quot;\t&quot; $4  &quot;\t&quot; $5 &quot;\t&quot; $10 &quot;\t0\t&quot; $7 )}&#39; &gt; snRNA.gencode.v19.bed  


less gencode.v19.annotation.gtf | tr &quot;\&quot;&quot; &quot;\t&quot; | awk &#39;$3 == &quot;gene&quot;&#39; |  awk &#39;$16 == &quot;snoRNA&quot;&#39; | awk &#39;{print($1 &quot;\t&quot; $4  &quot;\t&quot; $5 &quot;\t&quot; $10 &quot;\t0\t&quot; $7 )}&#39; &gt; snoRNA.gencode.v19.bed   </code></pre>
<p>Cut the chr tag out.</p>
<pre class="bash"><code>cat snRNA.gencode.v19.bed |sed &#39;s/^chr//&#39; &gt; snRNA.gencode.v19.nochr.bed
cat snoRNA.gencode.v19.bed  | sed &#39;s/chr//&#39; &gt;  snoRNA.gencode.v19.nochr.bed  
</code></pre>
<p>I now want to intersect these with /project2/gilad/briana/Net-seq-pilot/data/gene_cov/NET3-18486.gene.coverage.bed to take out the small RNAs.</p>
<p>Following code is /project2/gilad/briana/Net-seq-pilot/code/intersect_sn_sno.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=intersect
#SBATCH --time=8:00:00
#SBATCH --output=int_sbatch.out
#SBATCH --error=int_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=10G
#SBATCH --mail-type=END


module load Anaconda3  

source activate net-seq 


sample=$1

describer=$(echo ${sample} | sed -e &#39;s/.*\gene_cov\///&#39; | sed -e &quot;s/.gene.coverage.bed$//&quot;)


bedtools intersect -v -wa -a $1 -b /project2/gilad/briana/genome_anotation_data/snRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/snoRNA.gencode.v19.nochr.bed  &gt; /project2/gilad/briana/Net-seq-pilot/data/gene_cov/${describer}.gene.coverage.nosn.nosno.bed
</code></pre>
<pre class="r"><code>gene_cov_18486_srnafilter=read.table(&quot;../data/NET3-18486.gene.coverage.nosn.nosno.bed&quot;, header=FALSE)
colnames(gene_cov_18486_srnafilter)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)
gene_cov_18486_srnafilter_st = gene_cov_18486_srnafilter%&gt;% mutate(., st_count=count/(end-start))
gene_cov_18486_srnafilter_st_order=gene_cov_18486_srnafilter_st[order(gene_cov_18486_srnafilter_st$st_count, decreasing=TRUE),]</code></pre>
<pre class="r"><code>#standardize read count by length  


gene_cov_18486_sort_st = gene_cov_18486_sort %&gt;% mutate(., st_count=count/(end-start))
gene_cov_18486_sort_st_order= gene_cov_18486_sort_st[order(gene_cov_18486_sort_st$st_count, decreasing=TRUE),]</code></pre>
<pre class="r"><code>par(mfrow = c(1,2))
withsRNA=plot(log10(gene_cov_18486_sort_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage&quot;, xlab=&quot;Gene&quot;)
nosRNA=plot(log10(gene_cov_18486_srnafilter_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage without snRNAs or snoRNAs&quot;, xlab=&quot;Gene&quot;)</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-53-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>Add rRNAs to this analysis:</p>
<pre class="bash"><code>less gencode.v19.annotation.gtf | tr &quot;\&quot;&quot; &quot;\t&quot; | awk &#39;$3 == &quot;gene&quot;&#39; | awk &#39;$16 == &quot;rRNA&quot;&#39; |  awk &#39;{print($1 &quot;\t&quot; $4  &quot;\t&quot; $5 &quot;\t&quot; $10 &quot;\t0\t&quot; $7 )}&#39; &gt; rRNA.gencode.v19.bed  
cat rRNA.gencode.v19.bed  | sed &#39;s/chr//&#39; &gt;  rRNA.gencode.v19.nochr.bed  </code></pre>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=intersect_small
#SBATCH --time=8:00:00
#SBATCH --output=intSM_sbatch.out
#SBATCH --error=intSM_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=10G
#SBATCH --mail-type=END


module load Anaconda3  

source activate net-seq 


sample=$1

describer=$(echo ${sample} | sed -e &#39;s/.*\gene_cov\///&#39; | sed -e &quot;s/.gene.coverage.bed$//&quot;)


bedtools intersect -v -wa -a $1 -b /project2/gilad/briana/genome_anotation_data/snRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/snoRNA.gencode.v19.nochr.bed /project2/gilad/briana/genome_anotation_data/rRNA.gencode.v19.nochr.bed &gt; /project2/gilad/briana/Net-seq-pilot/data/gene_cov/${describer}.gene.coverage.noSM.bed
</code></pre>
<pre class="r"><code>gene_cov_18486_smallfilter=read.table(&quot;../data/NET3-18486.gene.coverage.noSM.bed&quot;, header=FALSE)
colnames(gene_cov_18486_smallfilter)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;gene&quot;, &quot;score&quot;, &quot;strand&quot;, &quot;count&quot;)
gene_cov_18486_smallfilter_st = gene_cov_18486_smallfilter%&gt;% mutate(., st_count=count/(end-start))
gene_cov_18486_smallfilter_st_order =gene_cov_18486_smallfilter_st[order(gene_cov_18486_smallfilter_st$st_count, decreasing=TRUE),]</code></pre>
<p>plot all 4:</p>
<pre class="r"><code>par(mfrow = c(2,2))
withsRNA=plot(log10(gene_cov_18486_sort_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage&quot;, xlab=&quot;Gene index&quot;,  ylim=c(-5,5))
nosRNA=plot(log10(gene_cov_18486_srnafilter_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage without snRNAs or snoRNAs&quot;, xlab=&quot;Gene index &quot;,  ylim=c(-5,5))
noSM=plot(log10(gene_cov_18486_smallfilter_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage without snRNA, snoRNA, rRNA&quot;, xlab=&quot;Gene index&quot;,  ylim=c(-5,5))
no_topwind=plot(log10(gene_cov_18486_notoo_st_order$st_count), ylab=&quot;log10 read count st. by gene length&quot;, main=&quot;Netseq read coverage without top 5% of windows&quot;, xlab=&quot;Gene index&quot;,  ylim=c(-5,5))</code></pre>
<p><img src="figure/create_blacklist.Rmd/unnamed-chunk-57-1.png" width="672" style="display: block; margin: auto;" /></p>
<p>Bedtools intersect to remove:</p>
</div>
</div>
<div id="meta-plots-on-filtered-data" class="section level3">
<h3>Meta plots on filtered data:</h3>
<p>First: use samtools intersect to remove these regions from the bam files<br />
* /project2/gilad/briana/Net-seq-pilot/code/filter_bams.sh</p>
<pre class="bash"><code>
#!/bin/bash

#SBATCH --job-name=bamintersect
#SBATCH --time=8:00:00
#SBATCH --output=bamint_sbatch.out
#SBATCH --error=bamint_sbatch.err
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END


module load Anaconda3  

source activate net-seq 


sample=$1


describer=$(echo ${sample} | sed -e &#39;s/.*\YG-SP-//&#39; | sed -e &quot;s/_combined_Netpilot-sort.bam$//&quot;)


bedtools intersect -abam $1 -b /project2/gilad/briana/genome_anotation_data/top5_gen_wind200.tab.nochr.sort.bed -v &gt; /project2/gilad/briana/Net-seq-pilot/data/filter_bam/${describer}.Netpilot.binfilter.bam</code></pre>
<p>run on /project2/gilad/briana/Net-seq-pilot/data/sort/YG-SP-NET3-18486_combined_Netpilot-sort.bam</p>
<p>index this bam</p>
<p>Make the deep tools plot at TSS:</p>
<p>dt_tss_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=dt_tss_18486
#SBATCH --time=8:00:00
#SBATCH --partition=gilad
#SBATCH --mem=16G
#SBATCH --mail-type=END
#SBATCH --output=dt_tss_18486_sbatch.out
#SBATCH --error=dt_tss_18486_sbatch.err

module load Anaconda3

source activate net-seq

bamCoverage -b /project2/gilad/briana/Net-seq-pilot/data/filter_bam/NET3-18486.Netpilot.binfilter.bam -o /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.bw

computeMatrix reference-point -S/project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.bw -R /project2/gilad/briana/genome_anotation_data/gencode.v19.annotation.bed --referencePoint TSS -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend  --refPointLabel &quot;TSS&quot;   -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.gz.png</code></pre>
<p>dt_tes_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=dt_tes_18486
#SBATCH --time=8:00:00
#SBATCH --partition=gilad
#SBATCH --mem=20G
#SBATCH --mail-type=END
#SBATCH --output=dt_tes_18486_sbatch.out
#SBATCH --error=dt_tes_18486_sbatch.err

module load Anaconda3

source activate net-seq

#bw created in tss plot

computeMatrix reference-point -S /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.bw -R /project2/gilad/briana/genome_anotation_data/gencode.v19.annotation.bed --referencePoint TES -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.TES.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend  --refPointLabel &quot;TES&quot;   -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.TES.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.TES.Netpilot.binfilter.gz.png</code></pre>
<p>PAS<br />
dt_pas_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=deeptools_pas_netseq
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --tasks-per-node=4 
#SBATCH --mail-type=END
#SBATCH --output=deeptool_pas_sbatch.out
#SBATCH --error=deeptools_pas_sbatch.err

module load Anaconda3

source activate net-seq

#the bw file has already been created

computeMatrix reference-point -S /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.bw -R /project2/gilad/briana/apa_sites/clusters.bed -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.PAS.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend --refPointLabel &quot;PAS&quot;  -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.PAS.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.PAS.Netpilot.binfilter.png</code></pre>
<p>dt_3ss_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=deeptools_3_netseq
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END
#SBATCH --output=deeptool_3_sbatch.out
#SBATCH --error=deeptools_3_sbatch.err

module load Anaconda3

source activate net-seq

#the bw file has already been created

computeMatrix reference-point -S /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.Netpilot.binfilter.bw   -R /project2/gilad/briana/Net-seq/Net-seq3/gencode.v19.3prime.noE1noTES.bed -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.3SS.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend --refPointLabel &quot;3&#39;splice boundary&quot; -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.3SS.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.3SS.Netpilot.binfilter.png</code></pre>
<p>dt_5ss_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=deeptools_5_netseq
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --mail-type=END
#SBATCH --output=deeptool_5_sbatch.out
#SBATCH --error=deeptools_5_sbatch.err

module load Anaconda3

source activate net-seq

#the bw file has already been created

computeMatrix reference-point -S  /project2/gilad/briana/Net-seq/Net-seq3/output/deeptools/net-seq-18486.bw -R /project2/gilad/briana/Net-seq/Net-seq3/gencode.v19.5prime.noE1noTES.bed -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.5SS.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend --refPointLabel &quot;5&#39; splice boundary&quot; -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.5SS.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.5SS.Netpilot.binfilter.png</code></pre>
<p>CTCF</p>
<p>dt_ctcf_filter_18486.sh</p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=deeptools_ctcf_netseq
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=40G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END
#SBATCH --output=deeptool_ctcf_sbatch.out
#SBATCH --error=deeptools_ctcf_sbatch.err

module load Anaconda3

source activate net-seq

#the bw file has already been created

computeMatrix reference-point -S /project2/gilad/briana/Net-seq/Net-seq3/output/deeptools/net-seq-18486.bw -R /project2/gilad/briana/Net-seq/Net-seq3/GM12873-DS14433.peaks.fdr0.01.hg19.bed -b 500 -a 500 -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.ctcf.Netpilot.binfilter.gz

plotHeatmap --sortRegions descend --refPointLabel &quot;CTCF&quot;  -m /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.ctcf.Netpilot.binfilter.gz  -out /project2/gilad/briana/Net-seq-pilot/output/deeptools/NET3-18486.ctcf.Netpilot.binfilter.png</code></pre>
</div>
</div>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<!-- Insert the session information into the document -->
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] bindrcpp_0.2  dplyr_0.7.4   ggplot2_2.2.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.15     bindr_0.1        knitr_1.18       magrittr_1.5    
 [5] munsell_0.4.3    colorspace_1.3-2 R6_2.2.2         rlang_0.1.6     
 [9] stringr_1.2.0    plyr_1.8.4       tools_3.4.2      grid_3.4.2      
[13] gtable_0.2.0     git2r_0.21.0     htmltools_0.3.6  assertthat_0.2.0
[17] yaml_2.1.16      lazyeval_0.2.1   rprojroot_1.3-2  digest_0.6.14   
[21] tibble_1.4.2     glue_1.2.0       evaluate_0.10.1  rmarkdown_1.8.5 
[25] labeling_0.3     stringi_1.1.6    compiler_3.4.2   pillar_1.1.0    
[29] scales_0.5.0     backports_1.1.2  jsonlite_1.5     reticulate_1.4  
[33] pkgconfig_2.0.1 </code></pre>
</div>

<hr>
<p>
    This <a href="http://rmarkdown.rstudio.com">R Markdown</a> site was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a>
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>