<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
<meta name="author" content="Briana Mittleman" />
<meta name="date" content="2017-11-08" />
<title>Explore UMI Usage in Netseq1 library</title>
<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-1.1/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-1.1/highlight.js"></script>
<link href="site_libs/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />
<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
  /* If highlight.js is present and the document has already finished loading,
     schedule the highlighting pass on a zero-delay timer. */
  if (window.hljs && document.readyState && document.readyState === "complete") {
    window.setTimeout(function() {
      hljs.initHighlighting();
    }, 0);
  }
</script>
<style type="text/css">
  h1 { font-size: 34px; }
  h1.title { font-size: 38px; }
  h2 { font-size: 30px; }
  h3 { font-size: 24px; }
  h4 { font-size: 18px; }
  h5 { font-size: 16px; }
  h6 { font-size: 12px; }
  .table th:not([align]) { text-align: left; }
</style>
</head>
<body>
<style type="text/css">
  .main-container {
    max-width: 940px;
    margin-left: auto;
    margin-right: auto;
  }
  code {
    color: inherit;
    background-color: rgba(0, 0, 0, 0.04);
  }
  img {
    max-width:100%;
    height: auto;
  }
  .tabbed-pane {
    padding-top: 12px;
  }
  button.code-folding-btn:focus {
    outline: none;
  }
</style>
<style type="text/css">
  /* padding for bootstrap navbar */
  body {
    padding-top: 51px;
    padding-bottom: 40px;
  }
  /* offset scroll position for anchor links (for fixed navbar) */
  .section h1 { padding-top: 56px; margin-top: -56px; }
  .section h2 { padding-top: 56px; margin-top: -56px; }
  .section h3 { padding-top: 56px; margin-top: -56px; }
  .section h4 { padding-top: 56px; margin-top: -56px; }
  .section h5 { padding-top: 56px; margin-top: -56px; }
  .section h6 { padding-top: 56px; margin-top: -56px; }
</style>
<script>
  /* manage active state of menu based on current page */
  $(document).ready(function () {
    /* File name of the current page (last path segment). Declared with `var`
       so it stays local to this handler instead of leaking into the global
       scope. An empty name (directory URL) is treated as index.html. */
    var href = window.location.pathname;
    href = href.substr(href.lastIndexOf('/') + 1);
    if (href === "") {
      href = "index.html";
    }
    /* active menu anchor */
    var menuAnchor = $('a[href="' + href + '"]');
    /* mark it active */
    menuAnchor.parent().addClass('active');
    /* if it's got a parent navbar menu mark it active as well */
    menuAnchor.closest('li.dropdown').addClass('active');
  });
</script>
<div class="container-fluid main-container">
<!-- tabsets -->
<script>
  $(document).ready(function () {
    window.buildTabsets("TOC");
  });
</script>
<!-- floating table of contents (tocify) -->
<script>
  $(document).ready(function () {
    /* move toc-ignore selectors from section div to header */
    $('div.section.toc-ignore')
      .removeClass('toc-ignore')
      .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    /* establish options */
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      /* Build the anchor hash from the heading text: strip the listed
         punctuation, turn whitespace into underscores, lower-case it. */
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    /* tocify */
    var toc = $("#TOC").tocify(options).data("toc-tocify");
  });
</script>
<style type="text/css">
  #TOC {
    margin: 25px 0px 20px 0px;
  }
  @media (max-width: 768px) {
    #TOC {
      position: relative;
      width: 100%;
    }
  }
  .toc-content {
    padding-left: 30px;
    padding-right: 40px;
  }
  div.main-container {
    max-width: 1200px;
  }
  div.tocify {
    width: 20%;
    max-width: 260px;
    max-height: 85%;
  }
  @media (min-width: 768px) and (max-width: 991px) {
    div.tocify {
      width: 25%;
    }
  }
  @media (max-width: 767px) {
    div.tocify {
      width: 100%;
      max-width: none;
    }
  }
  .tocify ul, .tocify li {
    line-height: 20px;
  }
  .tocify-subheader .tocify-item {
    font-size: 0.90em;
    padding-left: 25px;
    text-indent: 0;
  }
  .tocify .list-group-item {
    border-radius: 0px;
  }
</style>
<!-- setup 3col/9col grid for toc_float and main content -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>
<div class="toc-content col-xs-12 col-sm-8 col-md-9">
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <!-- icon-only toggle: aria-label supplies its accessible name -->
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-label="Toggle navigation">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Net-seq</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
          <a href="index.html">Home</a>
        </li>
        <li>
          <a href="about.html">About</a>
        </li>
        <li>
          <a href="license.html">License</a>
        </li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
          <!-- icon-only link: aria-label supplies its accessible name -->
          <a href="https://github.com/brimittleman/Net-seq" aria-label="GitHub repository">
            <span class="fa fa-github" aria-hidden="true"></span>
          </a>
        </li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->
<div class="fluid-row" id="header">
<h1 class="title toc-ignore">Explore UMI Usage in Netseq1 library</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>2017-11-08</em></h4>
</div>
<!-- The file analysis/chunks.R contains chunks that define default settings shared across the workflowr files.
--> <!-- Update knitr chunk options --> <!-- Insert the date the file was last updated --> <p><strong>Last updated:</strong> 2017-11-13</p> <!-- Insert the code version (Git commit SHA1) if Git repository exists and R package git2r is installed --> <p><strong>Code version:</strong> a25b69a</p> <p>In this analysis I will explore the UMI usage in the Net-Seq1 library. Due to low read counts in the total sample, I will exclude this sample from the analysis.</p> <p>This code is used to create a text file that I can explore in R. It has a list of all of the UMIs used for the sample sorted by usage with the number of times each is used. This is run before the deduplication step.</p> <pre class="bash"><code>samtools view {file} | tr "_" "\t" | cut -f 2 | sort | uniq -c > ../../output/UMI_{file}_stat.txt</code></pre> <div id="packages" class="section level3"> <h3>Packages</h3> <pre class="r"><code>library("tidyr") library("dplyr")</code></pre> <pre><code> Attaching package: 'dplyr'</code></pre> <pre><code>The following objects are masked from 'package:stats': filter, lag</code></pre> <pre><code>The following objects are masked from 'package:base': intersect, setdiff, setequal, union</code></pre> <pre class="r"><code>library("ggplot2") library("seqLogo")</code></pre> <pre><code>Loading required package: grid</code></pre> <pre class="r"><code>library("Biostrings")</code></pre> <pre><code>Loading required package: BiocGenerics</code></pre> <pre><code>Loading required package: parallel</code></pre> <pre><code> Attaching package: 'BiocGenerics'</code></pre> <pre><code>The following objects are masked from 'package:parallel': clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB</code></pre> <pre><code>The following objects are masked from 'package:dplyr': combine, intersect, setdiff, union</code></pre> <pre><code>The following objects are masked from 'package:stats': 
IQR, mad, sd, var, xtabs</code></pre> <pre><code>The following objects are masked from 'package:base': anyDuplicated, append, as.data.frame, cbind, colMeans, colnames, colSums, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min</code></pre> <pre><code>Loading required package: S4Vectors</code></pre> <pre><code>Loading required package: stats4</code></pre> <pre><code> Attaching package: 'S4Vectors'</code></pre> <pre><code>The following objects are masked from 'package:dplyr': first, rename</code></pre> <pre><code>The following object is masked from 'package:tidyr': expand</code></pre> <pre><code>The following object is masked from 'package:base': expand.grid</code></pre> <pre><code>Loading required package: IRanges</code></pre> <pre><code> Attaching package: 'IRanges'</code></pre> <pre><code>The following objects are masked from 'package:dplyr': collapse, desc, slice</code></pre> <pre><code>Loading required package: XVector</code></pre> <pre><code> Attaching package: 'Biostrings'</code></pre> <pre><code>The following object is masked from 'package:base': strsplit</code></pre> <pre class="r"><code>require("Biostrings")</code></pre> </div> <div id="input-data" class="section level3"> <h3>Input data</h3> <pre class="r"><code>prepare_UMI_data=function(path.txt){ x=read.delim(file=path.txt, header = FALSE,stringsAsFactors = FALSE) colnames(x) <- "UMI" x= data.frame(sapply(x, trimws), stringsAsFactors = FALSE) x= separate(data=x, col = UMI, into= c("number", "umi"), sep="\\s+") x$number= as.numeric(x$number) x= arrange(x, desc(number)) return(x) } UMI_18486_dep = prepare_UMI_data("../data/UMI_18486_dep_stat.txt") UMI_18508_dep= prepare_UMI_data("../data/UMI_18508_dep_stat.txt") UMI_18508_nondep= 
prepare_UMI_data("../data/UMI_18508_nondep_stat.txt") UMI_19238_dep= prepare_UMI_data("../data/UMI_19238_dep_stat.txt") UMI_mayer= prepare_UMI_data("../data/UMI_mayer_stat.txt")</code></pre> <div id="plot-the-umi-distributions" class="section level4"> <h4>Plot the umi distributions</h4> <pre class="r"><code>par(mfrow = c(2,3)) plot(UMI_18486_dep$number, ylab="UMI count", xlab="UMI", main="18486-dep distribution") plot(UMI_18508_dep$number, ylab="UMI count", xlab="UMI", main="18508-dep distribution") plot(UMI_18508_nondep$number, ylab="UMI count", xlab="UMI", main="1508- nondep distribution") plot(UMI_19238_dep$number, ylab="UMI count", xlab="UMI", main="19238-dep distribution") plot(UMI_mayer$number, ylab="UMI count", xlab="UMI", main="Mayer distribution")</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-4-1.png" width="672" style="display: block; margin: auto;" /> Look at the top used UMI for each data set.</p> <pre class="r"><code>UMI_18486_dep[1:3,]</code></pre> <pre><code> number umi 1 3617979 ATCTCG 2 592512 CACCCG 3 90128 TCTCGT</code></pre> <pre class="r"><code>UMI_18508_dep[1:3,]</code></pre> <pre><code> number umi 1 9270083 ATCTCG 2 880379 CACCCG 3 201796 TCTCGT</code></pre> <pre class="r"><code>UMI_18508_nondep[1:3, ]</code></pre> <pre><code> number umi 1 12216803 ATCTCG 2 911426 CACCCG 3 401897 TCTCGT</code></pre> <pre class="r"><code>UMI_19238_dep[1:3,]</code></pre> <pre><code> number umi 1 6058977 ATCTCG 2 1852855 CACCCG 3 235866 TATCTC</code></pre> <pre class="r"><code>UMI_mayer[1:3,]</code></pre> <pre><code> number umi 1 1040195 ATCTCG 2 172910 TTTCAC 3 169350 TTACAC</code></pre> <p>The top used UMIs are similar across samples. 
This preference could be due to annealing temperatures (conversation with Po). All data sets show an overrepresentation of a few UMIs, so I will remove the top 5 to get a better look at the distribution.</p> <pre class="r"><code>par(mfrow = c(2,3)) plot(UMI_18486_dep[6:5388,]$number, ylab="UMI count", xlab="UMI", main="18486-dep distribution -5") plot(UMI_18508_dep[6:5471,]$number, ylab="UMI count", xlab="UMI", main="18508-dep distribution -5") plot(UMI_18508_nondep[6:5535,]$number, ylab="UMI count", xlab="UMI", main="18508-nondep distribution -5") plot(UMI_19238_dep[6:5699,]$number, ylab="UMI count", xlab="UMI", main="19328-dep distribution -5") plot(UMI_mayer[6:6101,]$number, ylab="UMI count", xlab="UMI", main="Mayer distribution -5")</code></pre> <p><img src="figure/explore_umi_usage.Rmd/plot-1.png" width="672" style="display: block; margin: auto;" /></p> </div> <div id="seq-logo-plots" class="section level4"> <h4>Seq Logo Plots</h4> <p>Use Biostrings to get the PWM, then create the logo plots with seqLogo.</p> <pre class="r"><code>#source("https://bioconductor.org/biocLite.R") #biocLite("seqLogo") #source("http://bioconductor.org/biocLite.R") #biocLite("Biostrings") #set= DNAStringSet(UMI_18486_dep$umi) #length(set) #set.freq=data.frame(alphabetFrequency(set, baseOnly=T, as.prob=T)) #set_noN=set[set.freq$other==0,] #length(set_noN) #width(set_noN) #x=consensusMatrix(set_noN) #problem here, getting 1024 for all #freq_18486= PWM(x[1:4,]) #sum(UMI_18486_dep$number==0) > 0 : shows no UMIs are never used #seqLogo(freq_18486, ic.scale = TRUE, xaxis = TRUE, yaxis = TRUE, xfontsize = 15, yfontsize =15)</code></pre> <p>Try with a different package:</p> <pre class="r"><code>#library("devtools") #install_github("omarwagih/ggseqlogo") require(ggseqlogo)</code></pre> <pre><code>Loading required package: ggseqlogo</code></pre> <pre class="r"><code>cs1 = make_col_scheme(chars=c('A', 'T', 'C', 'G', 'N'), groups=c('A', 'T', 'C', 'G', 'N'), cols=c('red', 'blue', 'green', 'yellow', 
'pink')) par(mfrow = c(2,3)) ggseqlogo(UMI_18486_dep$umi, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-1.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>ggseqlogo(UMI_18508_dep$umi, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-2.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>ggseqlogo(UMI_18508_nondep$umi, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-3.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>ggseqlogo(UMI_19238_dep$umi, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-4.png" width="672" style="display: block; margin: auto;" /></p> <pre class="r"><code>ggseqlogo(UMI_mayer$umi, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-7-5.png" width="672" style="display: block; margin: auto;" /> Does not look like we get overrepresentation of one letter at any particular location in the UMI.</p> <pre class="r"><code>test.seqs= c("ATGC", "TAGC", "ATGC", "ATGC") ggseqlogo(test.seqs, col_scheme=cs1)</code></pre> <p><img src="figure/explore_umi_usage.Rmd/unnamed-chunk-8-1.png" width="672" style="display: block; margin: auto;" /></p> <!-- Insert the session information into the document --> <pre class="r"><code>sessionInfo()</code></pre> <pre><code>R version 3.4.2 (2017-09-28) Platform: x86_64-apple-darwin15.6.0 (64-bit) Running under: macOS Sierra 10.12.6 Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 attached base packages: [1] stats4 parallel grid stats graphics grDevices utils [8] datasets methods base other attached packages: [1] ggseqlogo_0.1 
bindrcpp_0.2 Biostrings_2.46.0 [4] XVector_0.18.0 IRanges_2.12.0 S4Vectors_0.16.0 [7] BiocGenerics_0.24.0 seqLogo_1.44.0 ggplot2_2.2.1 [10] dplyr_0.7.4 tidyr_0.7.2 loaded via a namespace (and not attached): [1] Rcpp_0.12.13 compiler_3.4.2 git2r_0.19.0 plyr_1.8.4 [5] bindr_0.1 tools_3.4.2 zlibbioc_1.24.0 digest_0.6.12 [9] evaluate_0.10.1 tibble_1.3.4 gtable_0.2.0 pkgconfig_2.0.1 [13] rlang_0.1.4 yaml_2.1.14 stringr_1.2.0 knitr_1.17 [17] tidyselect_0.2.3 rprojroot_1.2 glue_1.2.0 R6_2.2.2 [21] rmarkdown_1.6 purrr_0.2.4 magrittr_1.5 backports_1.1.1 [25] scales_0.5.0 htmltools_0.3.6 assertthat_0.2.0 colorspace_1.3-2 [29] labeling_0.3 stringi_1.1.5 lazyeval_0.2.1 munsell_0.4.3 </code></pre>
</div>
</div>
<hr>
<p>
  This <a href="http://rmarkdown.rstudio.com">R Markdown</a> site was created with <a href="https://github.com/jdblischak/workflowr">workflowr</a>
</p>
<hr>
<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->
<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->
</div>
</div>
</div>
<script>
  /* Add bootstrap table styles to pandoc tables: every <table> whose <thead>
     holds a `tr.header` row gets the `table table-condensed` classes. */
  function bootstrapStylePandocTables() {
    $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
  }
  /* Apply the table styling once the DOM is ready. */
  $(document).ready(function () {
    bootstrapStylePandocTables();
  });
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  /* Create a <script> element pointing at the MathJax bundle and append it to
     <head>; wrapped in an immediately-invoked function so `script` stays local. */
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>
</body>
</html>