<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml" lang="en">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />

<meta name="date" content="2017-11-30" />

<title>Check read counts in binned genome</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-1.1/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-1.1/highlight.js"></script>
<link href="site_libs/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  /* Give <pre> blocks that carry no class attribute a white background. */
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
// Start highlight.js, but only when the library is present and the document
// already reports readyState "complete" at the moment this script runs.
(function () {
  var highlighterAvailable = window.hljs;
  var pageFullyLoaded = document.readyState === "complete";
  if (!highlighterAvailable || !pageFullyLoaded) {
    return;
  }
  // Schedule the call through a zero-delay timeout rather than running inline.
  window.setTimeout(function () {
    hljs.initHighlighting();
  }, 0);
})();
</script>



<style type="text/css">
/* Explicit font sizes for every heading level. The document title
   (h1.title) is set larger than an ordinary h1. */
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
/* Left-align header cells of .table tables unless they carry an explicit
   align attribute. */
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
/* Horizontally centred page container. A more specific div.main-container
   rule later in this page overrides the max-width set here. */
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
/* Code inherits the surrounding text colour and gets a faint dark tint. */
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
/* Never let an image exceed its container's width; keep the aspect ratio. */
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
/* Suppress the focus outline on code-folding buttons only when the browser
   does not consider the focus keyboard-driven, so keyboard users keep a
   visible focus indicator. Browsers without :focus-visible support ignore
   this rule and fall back to their default outline. */
button.code-folding-btn:focus:not(:focus-visible) {
  outline: none;
}
</style>


<style type="text/css">
/* Body padding for the bootstrap navbar (top) plus spacing at the bottom. */
body {
  padding-bottom: 40px;
  padding-top: 51px;
}
/* Offset the scroll position of anchor links for the fixed navbar: every
   section heading gets 56px of top padding cancelled by an equal negative
   top margin. */
.section h1,
.section h2,
.section h3,
.section h4,
.section h5,
.section h6 {
  margin-top: -56px;
  padding-top: 56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // File name of the current page: everything after the last '/' of the path.
  // Declared with `var` so it does not leak into the global scope.
  var path = window.location.pathname;
  var href = path.slice(path.lastIndexOf('/') + 1);

  // An empty file name (path ends in '/') is treated as index.html.
  if (href === "")
    href = "index.html";

  // Select the anchors whose href attribute equals the file name exactly.
  // Comparing the attribute value directly avoids building a selector string
  // from the URL, which would break on names containing quote characters.
  var menuAnchor = $('a').filter(function () {
    return $(this).attr('href') === href;
  });

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
// Call buildTabsets("TOC") once the DOM is ready.
$(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function () {

    // Move the toc-ignore class from each section div onto its direct
    // h1-h5 heading children.
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // Options passed to tocify, declared in a single literal.
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      // Turn heading text into a hash: strip . \ / ? & ! # < >, replace each
      // whitespace character with '_', and lower-case the result.
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: true,
      smoothScroll: true
    };

    // Initialise tocify on #TOC. The widget instance was previously fetched
    // into a local that nothing read, so that lookup has been dropped.
    $("#TOC").tocify(options);
});
</script>

<style type="text/css">

/* Layout rules for the tocify table of contents (#TOC / .tocify) and the
   content column beside it (.toc-content). */
#TOC {
  margin: 25px 0px 20px 0px;
}
/* At viewport widths up to 768px the TOC is positioned relatively and spans
   the full width. */
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

/* More specific than the plain .main-container rule earlier in the page, so
   this max-width is the one that applies to div.main-container. */
div.main-container {
  max-width: 1200px;
}

/* Default TOC size; the media queries below adjust the width per viewport. */
div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

/* Below 768px the TOC takes the full width and the max-width cap is lifted. */
@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

/* Items under a sub-header are indented and set slightly smaller. */
.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

/* Square corners on the list-group items inside the TOC. */
.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<!-- Fixed-top site navigation bar. The icon-only controls (collapse toggle
     and GitHub link) carry aria-labels so they have an accessible name. -->
<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar" aria-label="Toggle navigation">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Net-seq</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
          <a href="index.html">Home</a>
        </li>
        <li>
          <a href="about.html">About</a>
        </li>
        <li>
          <a href="license.html">License</a>
        </li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
          <a href="https://github.com/brimittleman/Net-seq" aria-label="Net-seq repository on GitHub">
            <span class="fa fa-github" aria-hidden="true"></span>
     
          </a>
        </li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Check read counts in binned genome</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>2017-11-30</em></h4>

</div>


<!-- The file analysis/chunks.R contains chunks that define default settings
shared across the workflowr files. -->
<!-- Update knitr chunk options -->
<!-- Insert the date the file was last updated -->
<p><strong>Last updated:</strong> 2017-12-04</p>
<!-- Insert the code version (Git commit SHA1) if Git repository exists and R
 package git2r is installed -->
<p><strong>Code version:</strong> a63bb5f</p>
<!-- Add your analysis here -->
<div id="bash-script" class="section level3">
<h3>Bash script</h3>
<p>Split genome into 200bp windows and run the coverage command:</p>
<p>/project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed</p>
<p>/project2/gilad/briana/Net-seq/ref_genes/windows_200</p>
<p>make window_200_cov.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=window_200_cov
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=50G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END



bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18486_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_18486.txt


bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18508_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_18508_dep.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18508_nondep_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_18508_nondep.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_19238_dep_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_19238.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed  -b /project2/gilad/briana/Net-seq/data/bed_sort/mayer_SRR1575922_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_mayer.txt 


#bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed   -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed/merged_Net1.chr.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/windows_200/window_200_cov_merged.txt
#step memory exceeded!</code></pre>
</div>
<div id="import-data" class="section level3">
<h3>Import data</h3>
<pre class="r"><code>window_200_18486=read.csv(&quot;../data/windows_200/window_200_cov_18486.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
window_200_18508_dep=read.csv(&quot;../data/windows_200/window_200_cov_18508_dep.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
window_200_18508_nondep=read.csv(&quot;../data/windows_200/window_200_cov_18508_nondep.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
window_200_19238=read.csv(&quot;../data/windows_200/window_200_cov_19238.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
window_200_mayer= read.csv(&quot;../data/windows_200/window_200_cov_mayer.txt&quot;, header=FALSE, sep=&quot;\t&quot;)</code></pre>
<p>Add col labels to each file:</p>
<pre class="r"><code>colnames(window_200_18486) = c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(window_200_18508_dep) = c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(window_200_18508_nondep) = c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(window_200_19238) = c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(window_200_mayer) = c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)</code></pre>
</div>
<div id="plot-data" class="section level3">
<h3>Plot data</h3>
<p>Data I want to look at:</p>
<ul>
<li>summary per library</li>
</ul>
<pre class="r"><code>summary(window_200_18486$count)</code></pre>
<pre><code>   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0       0       0       1       0 4076520 </code></pre>
<pre class="r"><code>summary(window_200_18508_dep$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0        0       21        0 27069584 </code></pre>
<pre class="r"><code>summary(window_200_18508_nondep$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0        0       23        0 30571781 </code></pre>
<pre class="r"><code>summary(window_200_19238$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       0        0        0        4        0 13033253 </code></pre>
<pre class="r"><code>summary(window_200_mayer$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0      0.0      0.0    113.3      0.0 701170.0 </code></pre>
<p>Use a plot to see the distribution:</p>
<ul>
<li>summaries not including zero</li>
</ul>
<p>Make dataframe excluding the zeros:</p>
<pre class="r"><code>window_200_18486_non0= window_200_18486[window_200_18486$count!=0,]
window_200_18508_dep_non0= window_200_18508_dep[window_200_18508_dep$count!=0,]
window_200_18508_nondep_non0= window_200_18508_nondep[window_200_18508_nondep$count!=0,]
window_200_19238_non0= window_200_19238[window_200_19238$count!=0,]
window_200_mayer_non0= window_200_mayer[window_200_mayer$count!=0,]</code></pre>
<p>summarise</p>
<pre class="r"><code>summary(window_200_18486_non0$count)</code></pre>
<pre><code>   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1      45       3 4076520 </code></pre>
<pre class="r"><code>summary(window_200_18508_dep_non0$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       1        1        1      624        3 27069584 </code></pre>
<pre class="r"><code>summary(window_200_18508_nondep_non0$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       1        1        1      633        3 30571781 </code></pre>
<pre class="r"><code>summary(window_200_19238_non0$count)</code></pre>
<pre><code>    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       1        1        1      149        3 13033253 </code></pre>
<pre class="r"><code>summary(window_200_mayer_non0$count)</code></pre>
<pre><code>   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1    2200       2  701170 </code></pre>
<pre class="r"><code>plot(sort(log(window_200_19238_non0$count), decreasing=TRUE))</code></pre>
<p><img src="figure/bin_windows.Rmd/unnamed-chunk-7-1.png" alt="Plot of the sorted log read counts for non-zero windows in library 19238" width="672" style="display: block; margin: auto;" /></p>
<ul>
<li>number of entries that are non zero</li>
</ul>
<pre class="r"><code>x= nrow(window_200_18486)
barplot(c(nrow(window_200_18486_non0)/x,nrow(window_200_18508_dep_non0)/x,nrow(window_200_18508_nondep_non0)/x, nrow(window_200_19238_non0)/x), main=&quot;Proportion of detected bins&quot;, names=c(&quot;18486&quot;, &quot;18508 \n dep&quot;, &quot;18508 \n nondep&quot;, &quot;19238&quot;))</code></pre>
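<p><img src="figure/bin_windows.Rmd/unnamed-chunk-8-1.png" alt="Bar plot of the proportion of detected bins in libraries 18486, 18508 dep, 18508 nondep and 19238" width="672" style="display: block; margin: auto;" /></p>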
<pre class="r"><code>nrow(window_200_mayer_non0)/x</code></pre>
<pre><code>[1] 0.05149546</code></pre>
</div>
<div id="integrate-sets" class="section level3">
<h3>Integrate sets:</h3>
<pre class="r"><code>window_non_0_all= nrow(window_200_18486[window_200_18486$count!=0 | window_200_18508_dep$count!=0 | window_200_18486$count!=0,] )
prop_window_non0= window_non_0_all/x
prop_window_non0</code></pre>
<pre><code>[1] 0.05416194</code></pre>
<pre class="r"><code>window_non_0_2= nrow(window_200_18486[window_200_18486$count!=0 | window_200_18508_dep$count!=0,] )
prop_window_non0_2= window_non_0_2/x
prop_window_non0_2</code></pre>
<pre><code>[1] 0.05416194</code></pre>
<pre class="r"><code>window_non_0_2b= nrow(window_200_18486[window_200_18486$count!=0 |window_200_18486$count!=0,] )
prop_window_non0_2b= window_non_0_2b/x
prop_window_non0_2b</code></pre>
<pre><code>[1] 0.03282309</code></pre>
</div>
<div id="explore-bin-overlap" class="section level3">
<h3>Explore bin overlap</h3>
<p>I will combine the depleted samples counts in one data frame:</p>
<pre class="r"><code>window_200_3lib= cbind(window_200_18486, window_200_18508_dep$count, window_200_19238$count)
colnames(window_200_3lib)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;18486&quot;, &quot;18508&quot;, &quot;19238&quot;)</code></pre>
<p>Make a vector with how many of the libraries have coverage for each bin</p>
<pre class="r"><code>sum_vec= apply(window_200_3lib[,4:6],1,function(x)sum(x&gt;=1))
window_200_3lib= cbind(window_200_3lib, sum_vec)</code></pre>
<p>Explore the results:</p>
<pre class="r"><code>bin_0= sum(sum_vec==0)
bin_1= sum(sum_vec==1)
bin_2= sum(sum_vec==2)
bin_3= sum(sum_vec==3)

prop0=bin_0/x
prop1=bin_1/x
prop2=bin_2/x
prop3=bin_3/x

barplot_prop=(barplot(c(prop1, prop2, prop3),names = c(&quot;1 bin&quot;, &quot;2 bins&quot;, &quot;3 bins&quot;), main=&quot;Proportion of bins with coverage in 3 libaries&quot;, ylab=&quot;proportion&quot;, xlab=&quot;library&quot; ))</code></pre>
<p><img src="figure/bin_windows.Rmd/unnamed-chunk-14-1.png" alt="Bar plot of the proportion of bins with coverage in 1, 2 or 3 libraries" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>prop_vec=c(prop0,prop1, prop2, prop3)
names_vec= c(&quot;0 bins&quot;,&quot;1 bin&quot;, &quot;2 bins&quot;, &quot;3 bins&quot;)

prop_table=rbind(names_vec,prop_vec)
prop_table</code></pre>
<pre><code>          [,1]                [,2]                 [,3]                
names_vec &quot;0 bins&quot;            &quot;1 bin&quot;              &quot;2 bins&quot;            
prop_vec  &quot;0.934702546224945&quot; &quot;0.0453430987382321&quot; &quot;0.0115298190107529&quot;
          [,4]                 
names_vec &quot;3 bins&quot;             
prop_vec  &quot;0.00842453602607038&quot;</code></pre>
</div>
<div id="perform-on-mayer-data-or-multiple-lanes" class="section level3">
<h3>Perform on mayer data or multiple lanes</h3>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=window_200_cov_mayer
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=50G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END

module load bedtools

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_200bp_window.bed -b /project2/gilad/briana/mayer.data/mayer_hek/data/bed/mayer_hek-sort.chr.bed &gt; /project2/gilad/briana/mayer.data/mayer_hek/data/window_200_cov_mayer_hek.txt</code></pre>
<p>Not enough memory</p>
</div>
<div id="bin-the-gene-file" class="section level3">
<h3>Bin the gene file:</h3>
<pre class="bash"><code>bedtools makewindows -b ref_seq_gene_hg19 -w 200 &gt;  ref_seq_gene_hg16_bins.bed</code></pre>
<p>Make a bash script to get coverage in these bins. This is in /project2/gilad/briana/Net-seq/ref_genes/gene_windows_200/gene_window_200.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=gene_window_200_cov
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=30G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END



bedtools coverage -counts -a /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg16_bins.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18486_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/gene_windows_200/gene_window_cov_18486.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg16_bins.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18508_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/gene_windows_200/gene_window_cov_18508_dep.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg16_bins.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_19238_dep_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/gene_windows_200/gene_window_cov_19238.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg16_bins.bed  -b /project2/gilad/briana/Net-seq/data/bed_sort/mayer_SRR1575922_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/gene_windows_200/gene_window_cov_mayer.txt 
</code></pre>
<p><strong>This file has 22071951 bins. This is not a good representation because some locations in the genome are in multiple genes. I need to think about this more. Maybe I can only keep unique windows.</strong></p>
<p><code>ref_seq_gene_hg16_bins.bed | uniq -u | wc -l</code></p>
<p><strong>This leaves 5023160 lines. This is still not right, because if the genes start at different positions but overlap, then the lines wouldn't be unique.</strong></p>
<p>Use bedtools intersect -a genome -b genes -wa:</p>
<ul>
<li><p>a is the genome windows</p></li>
<li><p>b is the gene file</p></li>
</ul>
<p>This should keep the windows of a that intersect b. This means I will only have windows that contain a gene.</p>
<pre class="bash"><code>
bedtools intersect -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_sort_2.bed -b /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg19_small.bed -wa &gt; /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene.bed 
</code></pre>
<p>ERROR:</p>
<p>ERROR: Received illegal bin number 262143 from getBin call.<br />
ERROR: Unable to add record to tree</p>
<p>Potential problem: hg19 file starts with chrom 10 and the gene file starts with chr1. Maybe try sorting the gene file the way I sorted the hg19 file.</p>
<p>Get the chr# list from the genes file using:</p>
<p><code>cut -f 1 | uniq /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg19_small.bed &gt; names.txt</code></p>
<pre class="bash"><code>#!/bin/bash


#SBATCH --job-name=sortgene
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=20G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END

module load bedtools

cd /project2/gilad/briana/Net-seq/genome_bed
 
bedtools sort -faidx names_uniq.txt  -i  hg19_200bp_window.bed -o hg19_windows_sort.bed</code></pre>
<p>ERROR: Chromosome “chr1” undefined in names_uniq.txt</p>
<p>Try:</p>
<p><code>sort -k 1,1 -k2,2n hg19_200bp_window.bed &gt; hg19_windows_sort_2.bed</code></p>
<p>Cut the gene file to only have the first 3 columns with:</p>
<p><code>cut -f 1,2,3 ref_seq_gene_hg19 &gt; ref_seq_gene_hg19_3col.bed</code></p>
<p>Run intersect:</p>
<pre class="bash"><code>bedtools intersect -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_sort_2.bed -b /project2/gilad/briana/Net-seq/ref_genes/ref_seq_gene_hg19_3col.bed -wa &gt; /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene.bed </code></pre>
<p>This worked but you still have multiples. I will take the unique lines of this with:</p>
<p><code>cat hg19_windows_with_gene.bed | uniq  &gt; hg19_windows_with_gene_uniq.bed</code></p>
<p>Now I can run the coverage counts on this file to see how many of the genome bins that had at least one gene have coverage.</p>
<p>Call this script uniq_gene_window.sh, in /project2/gilad/briana/Net-seq/ref_genes/uniq_gene_windows_200</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=uniq_gene_window
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=30G
#SBATCH --tasks-per-node=4
#SBATCH --mail-type=END



bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene_uniq.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18486_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/uniq_gene_windows_200/uniq_gene_window_cov_18486.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene_uniq.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_18508_dep_chr_sort.bed  &gt; /project2/gilad/briana/Net-seq/ref_genes/uniq_gene_windows_200/uniq_gene_window_cov_18508_dep.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene_uniq.bed -b /project2/gilad/briana/Net-seq/Net-seq1/data/bed_sort/net1_19238_dep_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/uniq_gene_windows_200/uniq_gene_window_cov_19238.txt

bedtools coverage -counts -a /project2/gilad/briana/Net-seq/genome_bed/hg19_windows_with_gene_uniq.bed -b /project2/gilad/briana/Net-seq/data/bed_sort/mayer_SRR1575922_chr_sort.bed   &gt; /project2/gilad/briana/Net-seq/ref_genes/uniq_gene_windows_200/uniq_gene_window_cov_mayer.txt 
</code></pre>
<div id="load-uniq-gene-files" class="section level4">
<h4>Load uniq gene files:</h4>
<p>These are the unique bins in the genome 200bp window file that include a gene.</p>
<pre class="r"><code>uniq_gene_window_18486=read.csv(&quot;../data/uniq_genes/uniq_gene_window_cov_18486.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
uniq_gene_window_18508=read.csv(&quot;../data/uniq_genes/uniq_gene_window_cov_18508_dep.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
uniq_gene_window_19238=read.csv(&quot;../data/uniq_genes/uniq_gene_window_cov_19238.txt&quot;, header=FALSE, sep=&quot;\t&quot;)
uniq_gene_window_mayer=read.csv(&quot;../data/uniq_genes/uniq_gene_window_cov_mayer.txt&quot;, header=FALSE, sep=&quot;\t&quot;)</code></pre>
<pre class="r"><code>colnames(uniq_gene_window_mayer)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(uniq_gene_window_19238)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(uniq_gene_window_18486)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)
colnames(uniq_gene_window_18508)= c(&quot;chr&quot;, &quot;start&quot;, &quot;end&quot;, &quot;count&quot;)</code></pre>
<p>Look at the number of windows with coverage:</p>
<pre class="r"><code>uniq_gene_window_18486_no0= uniq_gene_window_18486[uniq_gene_window_18486$count!=0,]
uniq_gene_window_18508_no0= uniq_gene_window_18508[uniq_gene_window_18508$count!=0,]
uniq_gene_window_19238_no0= uniq_gene_window_19238[uniq_gene_window_19238$count!=0,]
uniq_gene_window_mayer_no0= uniq_gene_window_mayer[uniq_gene_window_mayer$count!=0,]</code></pre>
<p>Bar plot for coverage:</p>
<pre class="r"><code>gene_windows= nrow(uniq_gene_window_mayer)

barplot(c(nrow(uniq_gene_window_18486_no0)/gene_windows, nrow(uniq_gene_window_18508_no0)/gene_windows, nrow(uniq_gene_window_19238_no0)/gene_windows, nrow(uniq_gene_window_mayer_no0)/gene_windows), main=&quot;Coverage for windows with genes&quot;, names= c(&quot;18486&quot;, &quot;18508&quot;, &quot;19238&quot;, &quot;Mayer&quot;), xlab=&quot;Library&quot;)</code></pre>
<p><img src="figure/bin_windows.Rmd/unnamed-chunk-25-1.png" alt="Bar plot of coverage for windows with genes in libraries 18486, 18508, 19238 and Mayer" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>mayer_gene_coverage=nrow(uniq_gene_window_mayer_no0)/gene_windows</code></pre>
<p>Mayer has more coverage for this parameter. Their library has 0.0954862.</p>
<p>Ours:</p>
<ul>
<li><p>18486: 0.0495328</p></li>
<li><p>18508: 0.0460654</p></li>
<li><p>19238: 0.0346009</p></li>
</ul>
</div>
</div>
<div id="session-information" class="section level3">
<h3>Session information</h3>
<!-- Insert the session information into the document -->
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] compiler_3.4.2  backports_1.1.1 magrittr_1.5    rprojroot_1.2  
 [5] tools_3.4.2     htmltools_0.3.6 yaml_2.1.14     Rcpp_0.12.13   
 [9] stringi_1.1.5   rmarkdown_1.6   knitr_1.17      git2r_0.19.0   
[13] stringr_1.2.0   digest_0.6.12   evaluate_0.10.1</code></pre>
</div>

<hr>
<p>
    This <a href="http://rmarkdown.rstudio.com">R Markdown</a> site was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a>
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->


</div>
</div>

</div>

<script>

// Apply the bootstrap table classes to pandoc tables: every <table> that is
// the parent of a <thead> containing a <tr class="header">.
function bootstrapStylePandocTables() {
  var headerRows = $('tr.header');
  var tableHeads = headerRows.parent('thead');
  var pandocTables = tableHeads.parent('table');
  pandocTables.addClass('table table-condensed');
}
// Run the styling pass once the DOM is ready.
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Create a <script> element pointing at MathJax on mathjax.rstudio.com and
  // append it to the document's first <head> element.
  (function () {
    var mathjaxLoader = document.createElement("script");
    mathjaxLoader.type = "text/javascript";
    mathjaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";

    var headElements = document.getElementsByTagName("head");
    headElements[0].appendChild(mathjaxLoader);
  })();
</script>

</body>
</html>