<!DOCTYPE html>

<html lang="en" xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />

<meta name="date" content="2017-11-08" />

<title>Explore UMI Usage in Netseq1 library</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-1.1/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-1.1/highlight.js"></script>
<link href="site_libs/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  /* pre blocks without a class attribute (in this page, the R console
     output blocks) get a plain white background. */
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
// Start highlight.js only when the library is present and the document has
// already reached the "complete" ready state. The zero-delay timeout defers
// the call until the current script has returned.
(function () {
  var highlighterLoaded = window.hljs;
  var pageComplete = document.readyState === "complete";
  if (!highlighterLoaded || !pageComplete) {
    return;
  }
  window.setTimeout(function () {
    hljs.initHighlighting();
  }, 0);
})();
</script>



<style type="text/css">
/* Font size for each heading level. The document title (h1.title) is set
   slightly larger than an ordinary h1. */
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
/* Left-align table header cells that carry no explicit align attribute. */
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
/* Center the main content column horizontally.
   NOTE: the 940px max-width set here is overridden further down this file
   by the more specific "div.main-container" rule (1200px). */
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
/* Code inherits the surrounding text color and gets a faint dark tint
   as its background. */
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
/* Images never exceed the width of their container; height follows
   automatically so the aspect ratio is kept. */
img {
  max-width:100%;
  height: auto;
}
/* Space between a tab strip and the content of its pane. */
.tabbed-pane {
  padding-top: 12px;
}
/* NOTE(review): this removes the focus outline from code-folding buttons
   without providing a replacement focus indicator, which hurts keyboard
   users; consider adding a :focus-visible style. */
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* Leave room at the top of the page for the fixed-top bootstrap navbar,
   plus some spacing at the bottom. */
body {
  padding-bottom: 40px;
  padding-top: 51px;
}
/* Offset the scroll position for anchor links (for the fixed navbar):
   every section heading is padded by 56px and pulled back up by the same
   amount, so the visual layout is unchanged while the anchor target sits
   56px higher. */
.section h1,
.section h2,
.section h3,
.section h4,
.section h5,
.section h6 {
  margin-top: -56px;
  padding-top: 56px;
}
</style>

<script>
// Manage the active state of the menu based on the current page.
$(document).ready(function () {
  // File name of the current page: everything after the last "/" of the
  // URL path. A path that ends in "/" is treated as index.html.
  // Both values are declared with `var` so they stay local to this handler
  // instead of leaking an implicit global named `href`.
  var path = window.location.pathname;
  var href = path.slice(path.lastIndexOf('/') + 1);
  if (href === "") {
    href = "index.html";
  }

  // Anchors whose href attribute is exactly that file name.
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
// Once the DOM is ready, call the tabsets helper with the id "TOC".
$(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
// Build the floating table of contents once the DOM is ready.
$(document).ready(function () {

  // Move the toc-ignore marker from each section div to its direct heading
  // children, so the ignoreSelector below matches the headings themselves.
  $('div.section.toc-ignore')
    .removeClass('toc-ignore')
    .children('h1,h2,h3,h4,h5')
    .addClass('toc-ignore');

  // Options passed to the tocify plugin.
  var options = {
    selectors: "h1,h2,h3",
    theme: "bootstrap3",
    context: '.toc-content',
    // Turn heading text into an anchor hash: drop the characters
    // . \ / ? & ! # < >, replace each whitespace character with "_",
    // and lower-case the result.
    hashGenerator: function (text) {
      return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
    },
    ignoreSelector: ".toc-ignore",
    scrollTo: 0,
    showAndHide: true,
    smoothScroll: true
  };

  // Initialise tocify on the #TOC container. The plugin instance is not
  // needed afterwards, so it is not stored in a local.
  $("#TOC").tocify(options);
});
</script>

<style type="text/css">

/* Vertical spacing around the table-of-contents container. */
#TOC {
  margin: 25px 0px 20px 0px;
}
/* On narrow viewports the TOC is positioned relatively and spans the full
   width.
   NOTE(review): this breakpoint is 768px while the div.tocify rule below
   uses 767px; confirm the one-pixel difference is intended. */
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


/* Horizontal padding of the main content column next to the TOC. */
.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

/* Widen the page container to 1200px. Being more specific than the plain
   ".main-container" selector, this wins over the 940px max-width declared
   earlier in the file. */
div.main-container {
  max-width: 1200px;
}

/* Default size limits of the tocify sidebar. */
div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

/* Viewports from 768px to 991px wide: give the sidebar a larger share. */
@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

/* Viewports up to 767px wide: the sidebar takes the full width and the
   max-width cap is lifted. */
@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

/* Fixed line height for TOC lists and entries. */
.tocify ul, .tocify li {
  line-height: 20px;
}

/* Entries inside a sub-header are slightly smaller and indented. */
.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

/* Square corners on TOC list items. */
.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<!-- Fixed-top site navigation bar (Bootstrap 3): brand link, page links,
     and a link to the project's GitHub repository. -->
<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <!-- The toggle shows only three icon bars, so visually hidden text
           supplies its accessible name; aria-controls/aria-expanded expose
           the collapse state of #navbar to assistive technology. -->
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
        <span class="sr-only">Toggle navigation</span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Net-seq</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
          <a href="index.html">Home</a>
        </li>
        <li>
          <a href="about.html">About</a>
        </li>
        <li>
          <a href="license.html">License</a>
        </li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
          <!-- Icon-only link: the icon is hidden from assistive technology
               and visually hidden text names the destination. -->
          <a href="https://github.com/brimittleman/Net-seq">
            <span class="fa fa-github" aria-hidden="true"></span>
            <span class="sr-only">Net-seq repository on GitHub</span>
          </a>
        </li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Explore UMI Usage in Netseq1 library</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>2017-11-08</em></h4>

</div>


<!-- The file analysis/chunks.R contains chunks that define default settings
shared across the workflowr files. -->
<!-- Update knitr chunk options -->
<!-- Insert the date the file was last updated -->
<p><strong>Last updated:</strong> 2017-11-13</p>
<!-- Insert the code version (Git commit SHA1) if Git repository exists and R
 package git2r is installed -->
<p><strong>Code version:</strong> a25b69a</p>
<p>In this analysis I will explore the UMI usage in the Net-Seq1 library. Due to low read counts in the total sample, I will exclude this sample from the analysis.</p>
<p>This code is used to create a text file that I can explore in R. It has a list of all of the UMIs used for the sample sorted by usage with the number of times each is used. This is run before the deduplication step.</p>
<pre class="bash"><code>samtools view {file} | tr &quot;_&quot; &quot;\t&quot; | cut -f 2 | sort | uniq -c &gt; ../../output/UMI_{file}_stat.txt</code></pre>
<div id="packages" class="section level3">
<h3>Packages</h3>
<pre class="r"><code>library(&quot;tidyr&quot;)
library(&quot;dplyr&quot;)</code></pre>
<pre><code>
Attaching package: &#39;dplyr&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:stats&#39;:

    filter, lag</code></pre>
<pre><code>The following objects are masked from &#39;package:base&#39;:

    intersect, setdiff, setequal, union</code></pre>
<pre class="r"><code>library(&quot;ggplot2&quot;)
library(&quot;seqLogo&quot;)</code></pre>
<pre><code>Loading required package: grid</code></pre>
<pre class="r"><code>library(&quot;Biostrings&quot;)</code></pre>
<pre><code>Loading required package: BiocGenerics</code></pre>
<pre><code>Loading required package: parallel</code></pre>
<pre><code>
Attaching package: &#39;BiocGenerics&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:parallel&#39;:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    combine, intersect, setdiff, union</code></pre>
<pre><code>The following objects are masked from &#39;package:stats&#39;:

    IQR, mad, sd, var, xtabs</code></pre>
<pre><code>The following objects are masked from &#39;package:base&#39;:

    anyDuplicated, append, as.data.frame, cbind, colMeans,
    colnames, colSums, do.call, duplicated, eval, evalq, Filter,
    Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax,
    pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
    rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which, which.max, which.min</code></pre>
<pre><code>Loading required package: S4Vectors</code></pre>
<pre><code>Loading required package: stats4</code></pre>
<pre><code>
Attaching package: &#39;S4Vectors&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    first, rename</code></pre>
<pre><code>The following object is masked from &#39;package:tidyr&#39;:

    expand</code></pre>
<pre><code>The following object is masked from &#39;package:base&#39;:

    expand.grid</code></pre>
<pre><code>Loading required package: IRanges</code></pre>
<pre><code>
Attaching package: &#39;IRanges&#39;</code></pre>
<pre><code>The following objects are masked from &#39;package:dplyr&#39;:

    collapse, desc, slice</code></pre>
<pre><code>Loading required package: XVector</code></pre>
<pre><code>
Attaching package: &#39;Biostrings&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:base&#39;:

    strsplit</code></pre>
<pre class="r"><code>require(&quot;Biostrings&quot;)</code></pre>
</div>
<div id="input-data" class="section level3">
<h3>Input data</h3>
<pre class="r"><code>prepare_UMI_data=function(path.txt){
  x=read.delim(file=path.txt, header = FALSE,stringsAsFactors = FALSE)
  colnames(x) &lt;- &quot;UMI&quot;
  x= data.frame(sapply(x, trimws), stringsAsFactors = FALSE)
  x= separate(data=x, col = UMI, into= c(&quot;number&quot;, &quot;umi&quot;), sep=&quot;\\s+&quot;)
  x$number= as.numeric(x$number)
  x= arrange(x, desc(number))
  return(x)
}

UMI_18486_dep = prepare_UMI_data(&quot;../data/UMI_18486_dep_stat.txt&quot;)
UMI_18508_dep= prepare_UMI_data(&quot;../data/UMI_18508_dep_stat.txt&quot;)
UMI_18508_nondep= prepare_UMI_data(&quot;../data/UMI_18508_nondep_stat.txt&quot;)
UMI_19238_dep= prepare_UMI_data(&quot;../data/UMI_19238_dep_stat.txt&quot;)
UMI_mayer= prepare_UMI_data(&quot;../data/UMI_mayer_stat.txt&quot;)</code></pre>
<div id="plot-the-umi-distributions" class="section level4">
<h4>Plot the umi distributions</h4>
<pre class="r"><code>par(mfrow = c(2,3))
plot(UMI_18486_dep$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;18486-dep distribution&quot;)
plot(UMI_18508_dep$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;18508-dep distribution&quot;)
plot(UMI_18508_nondep$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;1508- nondep distribution&quot;)
plot(UMI_19238_dep$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;19238-dep distribution&quot;)
plot(UMI_mayer$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;Mayer distribution&quot;)</code></pre>
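<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-4-1.png" alt="UMI count distributions for the 18486-dep, 18508-dep, 18508-nondep, 19238-dep, and Mayer samples" width="672" style="display: block; margin: auto;" /> Look at the most-used UMIs for each data set.</p>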
<pre class="r"><code>UMI_18486_dep[1:3,]</code></pre>
<pre><code>   number    umi
1 3617979 ATCTCG
2  592512 CACCCG
3   90128 TCTCGT</code></pre>
<pre class="r"><code>UMI_18508_dep[1:3,]</code></pre>
<pre><code>   number    umi
1 9270083 ATCTCG
2  880379 CACCCG
3  201796 TCTCGT</code></pre>
<pre class="r"><code>UMI_18508_nondep[1:3, ]</code></pre>
<pre><code>    number    umi
1 12216803 ATCTCG
2   911426 CACCCG
3   401897 TCTCGT</code></pre>
<pre class="r"><code>UMI_19238_dep[1:3,]</code></pre>
<pre><code>   number    umi
1 6058977 ATCTCG
2 1852855 CACCCG
3  235866 TATCTC</code></pre>
<pre class="r"><code>UMI_mayer[1:3,]</code></pre>
<pre><code>   number    umi
1 1040195 ATCTCG
2  172910 TTTCAC
3  169350 TTACAC</code></pre>
<p>The top used UMIs are similar across samples. This preference could be due to annealing temperatures (conversation with Po). All data sets show an overrepresentation of a few UMIs, so I will remove the top 5 to get a better look at the distribution.</p>
<pre class="r"><code>par(mfrow = c(2,3))
plot(UMI_18486_dep[6:5388,]$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;18486-dep distribution -5&quot;)

plot(UMI_18508_dep[6:5471,]$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;18508-dep distribution -5&quot;)

plot(UMI_18508_nondep[6:5535,]$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;18508-nondep distribution -5&quot;)

plot(UMI_19238_dep[6:5699,]$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;19328-dep distribution -5&quot;)

plot(UMI_mayer[6:6101,]$number, ylab=&quot;UMI count&quot;, xlab=&quot;UMI&quot;, main=&quot;Mayer distribution -5&quot;)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/plot-1.png" alt="UMI count distributions for the five samples after removing the five most-used UMIs" width="672" style="display: block; margin: auto;" /></p>
</div>
<div id="seq-logo-plots" class="section level4">
<h4>Seq Logo Plots</h4>
<p>Use Biostrings to get the PWM, then create the logo plots with seqLogo.</p>
<pre class="r"><code>#source(&quot;https://bioconductor.org/biocLite.R&quot;)
#biocLite(&quot;seqLogo&quot;)
#source(&quot;http://bioconductor.org/biocLite.R&quot;)
#biocLite(&quot;Biostrings&quot;)
#set= DNAStringSet(UMI_18486_dep$umi)
#length(set)
#set.freq=data.frame(alphabetFrequency(set, baseOnly=T, as.prob=T))
#set_noN=set[set.freq$other==0,]
#length(set_noN)
#width(set_noN)
#x=consensusMatrix(set_noN) #problem here, getting 1024 for all
#freq_18486= PWM(x[1:4,])

#sum(UMI_18486_dep$number==0) &gt; 0 : shows no UMIs are never used 

#seqLogo(freq_18486, ic.scale = TRUE, xaxis = TRUE, yaxis = TRUE, xfontsize = 15, yfontsize =15)</code></pre>
<p>Try with a different package:</p>
<pre class="r"><code>#library(&quot;devtools&quot;)
#install_github(&quot;omarwagih/ggseqlogo&quot;)
require(ggseqlogo)</code></pre>
<pre><code>Loading required package: ggseqlogo</code></pre>
<pre class="r"><code>cs1 = make_col_scheme(chars=c(&#39;A&#39;, &#39;T&#39;, &#39;C&#39;, &#39;G&#39;, &#39;N&#39;), groups=c(&#39;A&#39;, &#39;T&#39;, &#39;C&#39;, &#39;G&#39;, &#39;N&#39;), cols=c(&#39;red&#39;, &#39;blue&#39;, &#39;green&#39;, &#39;yellow&#39;, &#39;pink&#39;))


par(mfrow = c(2,3))
ggseqlogo(UMI_18486_dep$umi, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-1.png" alt="Sequence logo of the UMIs in the 18486-dep sample" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>ggseqlogo(UMI_18508_dep$umi, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-2.png" alt="Sequence logo of the UMIs in the 18508-dep sample" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>ggseqlogo(UMI_18508_nondep$umi, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-3.png" alt="Sequence logo of the UMIs in the 18508-nondep sample" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>ggseqlogo(UMI_19238_dep$umi, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-4.png" alt="Sequence logo of the UMIs in the 19238-dep sample" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>ggseqlogo(UMI_mayer$umi, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-5.png" alt="Sequence logo of the UMIs in the Mayer sample" width="672" style="display: block; margin: auto;" /> It does not look like we get overrepresentation of one letter at any particular location in the UMI.</p>
<pre class="r"><code>test.seqs= c(&quot;ATGC&quot;, &quot;TAGC&quot;, &quot;ATGC&quot;, &quot;ATGC&quot;)
ggseqlogo(test.seqs, col_scheme=cs1)</code></pre>
<p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-8-1.png" alt="Sequence logo of the four test sequences ATGC, TAGC, ATGC, and ATGC" width="672" style="display: block; margin: auto;" /></p>
<!-- Insert the session information into the document -->
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
 [1] stats4    parallel  grid      stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] ggseqlogo_0.1       bindrcpp_0.2        Biostrings_2.46.0  
 [4] XVector_0.18.0      IRanges_2.12.0      S4Vectors_0.16.0   
 [7] BiocGenerics_0.24.0 seqLogo_1.44.0      ggplot2_2.2.1      
[10] dplyr_0.7.4         tidyr_0.7.2        

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.13     compiler_3.4.2   git2r_0.19.0     plyr_1.8.4      
 [5] bindr_0.1        tools_3.4.2      zlibbioc_1.24.0  digest_0.6.12   
 [9] evaluate_0.10.1  tibble_1.3.4     gtable_0.2.0     pkgconfig_2.0.1 
[13] rlang_0.1.4      yaml_2.1.14      stringr_1.2.0    knitr_1.17      
[17] tidyselect_0.2.3 rprojroot_1.2    glue_1.2.0       R6_2.2.2        
[21] rmarkdown_1.6    purrr_0.2.4      magrittr_1.5     backports_1.1.1 
[25] scales_0.5.0     htmltools_0.3.6  assertthat_0.2.0 colorspace_1.3-2
[29] labeling_0.3     stringi_1.1.5    lazyeval_0.2.1   munsell_0.4.3   </code></pre>
</div>
</div>

<hr>
<p>
    This <a href="http://rmarkdown.rstudio.com">R Markdown</a> site was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a>
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->


</div>
</div>

</div>

<script>

// Give pandoc-generated tables Bootstrap's table styling. A table qualifies
// when a tr.header row sits directly inside a thead that is itself a direct
// child of the table.
function bootstrapStylePandocTables() {
  var headerRows = $('tr.header');
  var tableHeads = headerRows.parent('thead');
  var pandocTables = tableHeads.parent('table');
  pandocTables.addClass('table table-condensed');
}

// Apply the styling as soon as the DOM is ready.
$(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Load MathJax at runtime by appending a script element to the document
  // head; the IIFE keeps the temporaries out of the global scope.
  (function () {
    var loader = document.createElement("script");
    var head = document.getElementsByTagName("head")[0];
    loader.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    loader.type = "text/javascript";
    head.appendChild(loader);
  })();
</script>

</body>
</html>