<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Briana Mittleman" />


<title>Fraction Differences with Processed data</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/journal.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-5.0.13/css/fa-svg-with-js.css" rel="stylesheet" />
<script src="site_libs/font-awesome-5.0.13/js/fontawesome-all.min.js"></script>
<script src="site_libs/font-awesome-5.0.13/js/fa-v4-shims.min.js"></script>

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">Three Prime Sequencing in Human LCLs</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
<li>
  <a href="license.html">License</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/brimittleman/threeprimeseq">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<!-- Add a small amount of space between sections. -->
<style type="text/css">
div.section {
  padding-top: 12px;
}
</style>

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Fraction Differences with Processed data</h1>
<h4 class="author"><em>Briana Mittleman</em></h4>
<h4 class="date"><em>1/22/2019</em></h4>

</div>


<p><strong>Last updated:</strong> 2019-01-25</p>
<strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small>
<ul>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p>
<p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p>
<p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(12345)</code> </summary></p>
<p>The command <code>set.seed(12345)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p>
<p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/brimittleman/threeprimeseq/tree/78f24b9acd89c71d45d23751a6edf1de44d2ecbf" target="_blank">78f24b9</a> </summary></p>
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
<pre><code>
Ignored files:
    Ignored:    .DS_Store
    Ignored:    .Rhistory
    Ignored:    .Rproj.user/
    Ignored:    data/.DS_Store
    Ignored:    output/.DS_Store

Untracked files:
    Untracked:  KalistoAbundance18486.txt
    Untracked:  analysis/DirectionapaQTL.Rmd
    Untracked:  analysis/EvaleQTLs.Rmd
    Untracked:  analysis/YL_QTL_test.Rmd
    Untracked:  analysis/ncbiRefSeq_sm.sort.mRNA.bed
    Untracked:  analysis/snake.config.notes.Rmd
    Untracked:  analysis/verifyBAM.Rmd
    Untracked:  code/PeaksToCoverPerReads.py
    Untracked:  code/strober_pc_pve_heatmap_func.R
    Untracked:  data/18486.genecov.txt
    Untracked:  data/APApeaksYL.total.inbrain.bed
    Untracked:  data/ChromHmmOverlap/
    Untracked:  data/GM12878.chromHMM.bed
    Untracked:  data/GM12878.chromHMM.txt
    Untracked:  data/LianoglouLCL/
    Untracked:  data/LocusZoom/
    Untracked:  data/NuclearApaQTLs.txt
    Untracked:  data/PeakCounts/
    Untracked:  data/PeakCounts_noMP_5perc/
    Untracked:  data/PeakUsage/
    Untracked:  data/PeakUsage_noMP/
    Untracked:  data/PeaksUsed/
    Untracked:  data/PeaksUsed_noMP_5percCov/
    Untracked:  data/RNAkalisto/
    Untracked:  data/TotalApaQTLs.txt
    Untracked:  data/Totalpeaks_filtered_clean.bed
    Untracked:  data/UnderstandPeaksQC/
    Untracked:  data/YL-SP-18486-T-combined-genecov.txt
    Untracked:  data/YL-SP-18486-T_S9_R1_001-genecov.txt
    Untracked:  data/YL_QTL_test/
    Untracked:  data/apaExamp/
    Untracked:  data/apaQTL_examp_noMP/
    Untracked:  data/bedgraph_peaks/
    Untracked:  data/bin200.5.T.nuccov.bed
    Untracked:  data/bin200.Anuccov.bed
    Untracked:  data/bin200.nuccov.bed
    Untracked:  data/clean_peaks/
    Untracked:  data/comb_map_stats.csv
    Untracked:  data/comb_map_stats.xlsx
    Untracked:  data/comb_map_stats_39ind.csv
    Untracked:  data/combined_reads_mapped_three_prime_seq.csv
    Untracked:  data/diff_iso_proc/
    Untracked:  data/diff_iso_trans/
    Untracked:  data/ensemble_to_genename.txt
    Untracked:  data/example_gene_peakQuant/
    Untracked:  data/explainProtVar/
    Untracked:  data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.bed
    Untracked:  data/filtered_APApeaks_merged_allchrom_refseqTrans.closest2End.noties.bed
    Untracked:  data/first50lines_closest.txt
    Untracked:  data/gencov.test.csv
    Untracked:  data/gencov.test.txt
    Untracked:  data/gencov_zero.test.csv
    Untracked:  data/gencov_zero.test.txt
    Untracked:  data/gene_cov/
    Untracked:  data/joined
    Untracked:  data/leafcutter/
    Untracked:  data/merged_combined_YL-SP-threeprimeseq.bg
    Untracked:  data/molPheno_noMP/
    Untracked:  data/mol_overlap/
    Untracked:  data/mol_pheno/
    Untracked:  data/nom_QTL/
    Untracked:  data/nom_QTL_opp/
    Untracked:  data/nom_QTL_trans/
    Untracked:  data/nuc6up/
    Untracked:  data/nuc_10up/
    Untracked:  data/other_qtls/
    Untracked:  data/pQTL_otherphen/
    Untracked:  data/peakPerRefSeqGene/
    Untracked:  data/perm_QTL/
    Untracked:  data/perm_QTL_opp/
    Untracked:  data/perm_QTL_trans/
    Untracked:  data/perm_QTL_trans_filt/
    Untracked:  data/perm_QTL_trans_noMP_5percov/
    Untracked:  data/protAndAPAlmRes.Rda
    Untracked:  data/reads_mapped_three_prime_seq.csv
    Untracked:  data/smash.cov.results.bed
    Untracked:  data/smash.cov.results.csv
    Untracked:  data/smash.cov.results.txt
    Untracked:  data/smash_testregion/
    Untracked:  data/ssFC200.cov.bed
    Untracked:  data/temp.file1
    Untracked:  data/temp.file2
    Untracked:  data/temp.gencov.test.txt
    Untracked:  data/temp.gencov_zero.test.txt
    Untracked:  data/threePrimeSeqMetaData.csv
    Untracked:  output/picard/
    Untracked:  output/plots/
    Untracked:  output/qual.fig2.pdf

Unstaged changes:
    Modified:   analysis/28ind.peak.explore.Rmd
    Modified:   analysis/CompareLianoglouData.Rmd
    Modified:   analysis/apaQTLoverlapGWAS.Rmd
    Modified:   analysis/cleanupdtseq.internalpriming.Rmd
    Modified:   analysis/coloc_apaQTLs_protQTLs.Rmd
    Modified:   analysis/dif.iso.usage.leafcutter.Rmd
    Modified:   analysis/diff_iso_pipeline.Rmd
    Modified:   analysis/explainpQTLs.Rmd
    Modified:   analysis/explore.filters.Rmd
    Modified:   analysis/flash2mash.Rmd
    Modified:   analysis/mispriming_approach.Rmd
    Modified:   analysis/overlapMolQTL.Rmd
    Modified:   analysis/overlapMolQTL.opposite.Rmd
    Modified:   analysis/overlap_qtls.Rmd
    Modified:   analysis/peakOverlap_oppstrand.Rmd
    Modified:   analysis/peakQCPPlots.Rmd
    Modified:   analysis/pheno.leaf.comb.Rmd
    Modified:   analysis/swarmPlots_QTLs.Rmd
    Modified:   analysis/test.max2.Rmd
    Modified:   analysis/understandPeaks.Rmd
    Modified:   code/Snakefile

</code></pre>
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details>
</li>
</ul>
<details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary>
<ul>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
File
</th>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
<th style="text-align:left;">
Message
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/78f24b9acd89c71d45d23751a6edf1de44d2ecbf/analysis/FractionDiffProcessed.Rmd" target="_blank">78f24b9</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-25
</td>
<td style="text-align:left;">
look at mid range, draw conclusion
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/3698abac05d5b339cb916baf87f654080bc04091/docs/FractionDiffProcessed.html" target="_blank">3698aba</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/19ec0998d07c00c83e13905ac40b1ac6b804d267/analysis/FractionDiffProcessed.Rmd" target="_blank">19ec099</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
<td style="text-align:left;">
look at mid range examples
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/d86a41cf331cd8cc3ee024fb72d216c00d176827/docs/FractionDiffProcessed.html" target="_blank">d86a41c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/82c157109943211bf670770c2848e82f5eb8a03d/analysis/FractionDiffProcessed.Rmd" target="_blank">82c1571</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
<td style="text-align:left;">
look at top 15 red
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/brimittleman/threeprimeseq/aefc33040326e05c44fd4305c5d26249f41ef089/docs/FractionDiffProcessed.html" target="_blank">aefc330</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-22
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/fd184be6b12e4f11a774d09588aa5d3fdc5a26fd/analysis/FractionDiffProcessed.Rmd" target="_blank">fd184be</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-22
</td>
<td style="text-align:left;">
add code for leafcutter on processed
</td>
</tr>
</tbody>
</table>
</ul>
<p></details></p>
<hr />
<pre class="r"><code>library(workflowr)</code></pre>
<pre><code>This is workflowr version 1.1.1
Run ?workflowr for help getting started</code></pre>
<pre class="r"><code>library(tidyverse)</code></pre>
<pre><code>── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──</code></pre>
<pre><code>✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0</code></pre>
<pre><code>── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()</code></pre>
<pre class="r"><code>library(reshape2)</code></pre>
<pre><code>
Attaching package: &#39;reshape2&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:tidyr&#39;:

    smiths</code></pre>
<pre class="r"><code>library(cowplot)</code></pre>
<pre><code>
Attaching package: &#39;cowplot&#39;</code></pre>
<pre><code>The following object is masked from &#39;package:ggplot2&#39;:

    ggsave</code></pre>
<p>In this analysis <a href="https://brimittleman.github.io/threeprimeseq/PeakToGeneAssignment.html" class="uri">https://brimittleman.github.io/threeprimeseq/PeakToGeneAssignment.html</a> I ran an initial run of the leafcutter tool for differences between fractions. I will use the same pipeline here for the processed data.</p>
<p>This starts with running feature counts with all of the peaks. I will use peaks passing the filter in either the total or nucelar fraction. These peaks are in /project2/gilad/briana/threeprimeseq/data/mergedPeaks_noMP_filtered/Filtered_APApeaks_merged_allchrom_noMP.sort.named.noCHR.refseqTrans.closest2end.sm.fixed_5percCov.bed and were created using the filternamePeaks5percCov.py script. I need to make this into an SAF file for FC.</p>
<p>This file has chr, start, end, peakNu, cov, strand, transcript:gene, distance. For the SAF file I want GeneID, Chr, start, end, strand. The GeneID is peak#:chr:start:end:strand:gene</p>
<p>bed2saf_bothFrac_Processed.py</p>
<pre class="bash"><code>from misc_helper import *

fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_noMP_filtered/Filtered_APApeaks_merged_allchrom_noMP.sort.named.noCHR.refseqTrans.closest2end.sm.fixed_5percCov.SAF&quot;,&#39;w&#39;)
fout.write(&quot;GeneID\tChr\tStart\tEnd\tStrand\n&quot;)
for ln in open(&quot;/project2/gilad/briana/threeprimeseq/data/mergedPeaks_noMP_filtered/Filtered_APApeaks_merged_allchrom_noMP.sort.named.noCHR.refseqTrans.closest2end.sm.fixed_5percCov.bed&quot;):
    chrom, start, end, peakNum, cov, strand, trans, dist = ln.split()
    gene=trans.split(&quot;:&quot;)[1]
    ID = &quot;peak%s:%s:%s:%s:%s:%s&quot;%(peakNum,chrom,start, end,strand,gene)
    fout.write(&quot;%s\t%s\t%s\t%s\t%s\n&quot;%(ID, chrom, start, end, strand))
fout.close()
</code></pre>
<p>bothFrac_processed_FC.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=bothFrac_processed_FC
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=bothFrac_processed_FCc.out
#SBATCH --error=bothFrac_processed_FC.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env


featureCounts -O -a /project2/gilad/briana/threeprimeseq/data/mergedPeaks_noMP_filtered/Filtered_APApeaks_merged_allchrom_noMP.sort.named.noCHR.refseqTrans.closest2end.sm.fixed_5percCov.SAF -F SAF -o /project2/gilad/briana/threeprimeseq/data/filtPeakOppstrand_cov_processed_bothFrac/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed.fc /project2/gilad/briana/threeprimeseq/data/bam_NoMP_sort/*sort.bam -s 2</code></pre>
<p>Fix headers:</p>
<p>fix_head_fc_procBothFrac.py</p>
<pre class="bash"><code>
#python 

infile= open(&quot;/project2/gilad/briana/threeprimeseq/data/filtPeakOppstrand_cov_processed_bothFrac/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed.fc&quot;, &quot;r&quot;)
fout = open(&quot;/project2/gilad/briana/threeprimeseq/data/filtPeakOppstrand_cov_processed_bothFrac/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_fixed.fc&quot;,&#39;w&#39;)
for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries = i_list[:6]
        print(libraries)
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        first_line= &quot;\t&quot;.join(libraries)
        fout.write(first_line + &#39;\n&#39;)
    else :
        fout.write(i)
fout.close()</code></pre>
<p>fc2leafphen_processed.py</p>
<pre class="bash"><code>inFile= open(&quot;/project2/gilad/briana/threeprimeseq/data/filtPeakOppstrand_cov_processed_bothFrac/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_fixed.fc&quot;, &quot;r&quot;)
outFile= open(&quot;/project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_forLC.fc&quot;, &quot;w&quot;)

for num, ln in enumerate(inFile):
        if num == 1:
            lines=ln.split()[6:]
            outFile.write(&quot; &quot;.join(lines)+&#39;\n&#39;)
        if num &gt; 1:
            ID=ln.split()[0]
            peak=ID.split(&quot;:&quot;)[0]
            chrom=ID.split(&quot;:&quot;)[1]
            start=ID.split(&quot;:&quot;)[2]
            start=int(start)
            end=ID.split(&quot;:&quot;)[3]
            end=int(end)
            strand=ID.split(&quot;:&quot;)[4]
            gene=ID.split(&quot;:&quot;)[5]
            new_ID=&quot;chr%s:%d:%d:%s&quot;%(chrom, start, end, gene)
            pheno=ln.split()[6:]
            pheno.insert(0, new_ID)
            outFile.write(&quot; &quot;.join(pheno)+&#39;\n&#39;)
            
outFile.close()  </code></pre>
<p>subset_diffisopheno_processed.py</p>
<pre class="bash"><code>def main(inFile, outFile, target):
    ifile=open(inFile, &quot;r&quot;)
    ofile=open(outFile, &quot;w&quot;)
    target=int(target)
    for num, ln in enumerate(ifile):
        if num == 0:
            ofile.write(ln)
        else:
            ID=ln.split()[0]
            chrom=ID.split(&quot;:&quot;)[0][3:]
            print(chrom)
            chrom=int(chrom)
            if chrom == target:
                ofile.write(ln)
            
if __name__ == &quot;__main__&quot;:
    import sys

    target = sys.argv[1]
    inFile = &quot;/project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_forLC.fc&quot;
    outFile = &quot;/project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_forLC_%s.txt&quot;%(target)
    main(inFile, outFile, target)</code></pre>
<p>Run this with: run_subset_diffisopheno_processed.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=run_subset_diffisopheno_processed
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=run_subset_diffisopheno_processed.out
#SBATCH --error=run_subset_diffisopheno_processed.err
#SBATCH --partition=broadwl
#SBATCH --mem=12G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env

for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
do
python subset_diffisopheno_processed.py $i 
done</code></pre>
<p>Make new sample list. I could use the old one but I want to have this pipeline work when I add individuals.</p>
<p>makeLCSampleList_processed.py</p>
<pre class="bash"><code>outfile=open(&quot;/project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/sample_groups.txt&quot;, &quot;w&quot;)
infile=open(&quot;/project2/gilad/briana/threeprimeseq/data/filtPeakOppstrand_cov_processed_bothFrac/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed.fc&quot;, &quot;r&quot;)

for line, i in enumerate(infile):
    if line == 1:
        i_list=i.split()
        libraries=[]
        for sample in i_list[6:]:
            full = sample.split(&quot;/&quot;)[7]
            samp= full.split(&quot;-&quot;)[2:4]
            lim=&quot;_&quot;
            samp_st=lim.join(samp)
            libraries.append(samp_st)
        for l in libraries:
            if l[-1] == &quot;T&quot;:
                outfile.write(&quot;%s\tTotal\n&quot;%(l))
            else:
                outfile.write(&quot;%s\tNuclear\n&quot;%(l))
    else:
          next
                
outfile.close()</code></pre>
<p>run_leafcutter_ds_bychrom_processed.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=run_leafcutter_ds_bychrom_processed
#SBATCH --account=pi-yangili1
#SBATCH --time=24:00:00
#SBATCH --output=run_leafcutter_ds_bychrom_processed.out
#SBATCH --error=run_leafcutter_ds_bychrom_processed.err
#SBATCH --partition=bigmem2
#SBATCH --mem=50G
#SBATCH --mail-type=END

module load R

for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 
do
Rscript /project2/gilad/briana/davidaknowles-leafcutter-c3d9474/scripts/leafcutter_ds.R --num_threads 4  /project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/filtered_APApeaks_merged_allchrom_refseqGenes.Transcript_sm_quant_processed_forLC_${i}.txt /project2/gilad/briana/threeprimeseq/data/pheno_DiffIso_processed/sample_groups.txt -o /project2/gilad/briana/threeprimeseq/data/diff_iso_processed/TN_diff_isoform_chr${i}.txt 
done</code></pre>
<p>Cat all of the signficance files and bring them to my computer to look at here.</p>
<pre class="r"><code>diffIso=read.table(&quot;../data/diff_iso_proc/TN_diff_isoform_allChrom_clusterSig.txt&quot;, header = T,col.names = c(&quot;status&quot;,   &quot;loglr&quot;,    &quot;df&quot;,   &quot;p&quot;,    &quot;cluster&quot;,  &quot;p.adjust&quot;),stringsAsFactors = F,sep=&quot;\t&quot;) %&gt;% filter(status == &quot;Success&quot;)


diffIso$p.adjust=as.numeric(as.character(diffIso$p.adjust))


qqplot(-log10(runif(nrow(diffIso))), -log10(diffIso$p.adjust),ylab=&quot;-log10 Total Adjusted Leafcutter pvalue&quot;, xlab=&quot;-log 10 Uniform expectation&quot;, main=&quot;Leafcutter differencial isoform analysis between fractions&quot;)
abline(0,1)</code></pre>
<p><img src="figure/FractionDiffProcessed.Rmd/unnamed-chunk-10-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-10-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/d86a41cf331cd8cc3ee024fb72d216c00d176827/docs/figure/FractionDiffProcessed.Rmd/unnamed-chunk-10-1.png" target="_blank">d86a41c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>This is based on successfull results for 9,532 genes with multiple PAS and enough coverage in enough individuals.</p>
<p>Next, I will look at the effect sizes.</p>
<pre class="bash"><code>awk &#39;{if(NR&gt;1)print}&#39; /project2/gilad/briana/threeprimeseq/data/diff_iso_processed/TN_diff_isoform_chr*.txt_effect_sizes.txt &gt; /project2/gilad/briana/threeprimeseq/data/diff_iso_processed/TN_diff_isoform_AllChrom.txt_effect_sizes.txt</code></pre>
<pre class="r"><code>effectsize=read.table(&quot;../data/diff_iso_proc/TN_diff_isoform_AllChrom.txt_effect_sizes.txt&quot;, stringsAsFactors = F, col.names=c(&#39;intron&#39;,  &#39;logef&#39; ,&#39;Nuclear&#39;, &#39;Total&#39;,&#39;deltapsi&#39;))</code></pre>
<pre class="r"><code>effectsize$logef=as.numeric(as.character(effectsize$logef))</code></pre>
<pre><code>Warning: NAs introduced by coercion</code></pre>
<pre class="r"><code>plot(sort(effectsize$logef),main=&quot;Leafcutter effect Sizes&quot;, ylab=&quot;Effect size&quot;, xlab=&quot;Peak Index&quot;)</code></pre>
<p><img src="figure/FractionDiffProcessed.Rmd/unnamed-chunk-13-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-13-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/d86a41cf331cd8cc3ee024fb72d216c00d176827/docs/figure/FractionDiffProcessed.Rmd/unnamed-chunk-13-1.png" target="_blank">d86a41c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
</tr>
</tbody>
</table>
<p></details></p>
<pre class="r"><code>effectsize$deltapsi=as.numeric(as.character(effectsize$deltapsi))</code></pre>
<pre><code>Warning: NAs introduced by coercion</code></pre>
<pre class="r"><code>plot(sort(effectsize$deltapsi),main=&quot;Leafcutter delta PSI&quot;, ylab=&quot;Delta PSI&quot;, xlab=&quot;Peak Index&quot;)</code></pre>
<p><img src="figure/FractionDiffProcessed.Rmd/unnamed-chunk-14-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of unnamed-chunk-14-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/brimittleman/threeprimeseq/blob/d86a41cf331cd8cc3ee024fb72d216c00d176827/docs/figure/FractionDiffProcessed.Rmd/unnamed-chunk-14-1.png" target="_blank">d86a41c</a>
</td>
<td style="text-align:left;">
Briana Mittleman
</td>
<td style="text-align:left;">
2019-01-24
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>I want to spot check some of these in IGV. First I will focus on the |PSI| &gt;.4</p>
<pre class="r"><code>filterHighPSI=effectsize %&gt;% filter(abs(deltapsi)&gt;.4) %&gt;% arrange(deltapsi)
head(filterHighPSI)</code></pre>
<pre><code>                            intron     logef           Nuclear
1   chr2:108464199:108464261:RGPD4 -1.563069 0.786325014667375
2  chr11:71409928:71409980:FAM86C1 -1.750522 0.736879355169336
3   chr9:19443338:19443405:SLC24A2 -1.373452  0.87662816529528
4 chr7:151725930:151726019:GALNTL5 -1.229096 0.766827065986407
5     chr17:49248730:49248817:NME2 -1.185121 0.763505415306066
6   chr7:100970341:100970385:IFT22 -1.588218 0.668693582230009
              Total   deltapsi
1 0.139047439399753 -0.6472776
2 0.169115629694331 -0.5677637
3 0.313023025601665 -0.5636051
4 0.219649286101815 -0.5471778
5 0.231787036297737 -0.5317184
6 0.140656353549046 -0.5280372</code></pre>
<p>Too look at this most effectively I will merge the total and nuclear clean bam files:</p>
<p>mergeBamFiles_byfrac_noMP.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=mergeBamFiles_byfrac_noMP.sh
#SBATCH --account=pi-yangili1
#SBATCH --time=8:00:00
#SBATCH --output=mergeBamFiles_byfrac_noMP.out
#SBATCH --error=mergeBamFiles_byfrac_noMP.err
#SBATCH --partition=bigmem2
#SBATCH --mem=100G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env  


samtools merge  /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllTotalSamples.MergedBamFiles.noMP.bam /project2/gilad/briana/threeprimeseq/data/bam_NoMP_sort/*T*.bam


samtools merge  /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllNuclearSamples.MergedBamFiles.noMP.bam /project2/gilad/briana/threeprimeseq/data/bam_NoMP_sort/*N*.bam</code></pre>
<p>SortIndex_mergeBamFiles_byfrac_noMP.sh</p>
<pre class="bash"><code>#!/bin/bash

#SBATCH --job-name=SortIndex_mergeBamFiles_byfrac_noMP.sh
#SBATCH --account=pi-yangili1
#SBATCH --time=8:00:00
#SBATCH --output=SortIndex_mergeBamFiles_byfrac_noMP.out
#SBATCH --error=SortIndex_mergeBamFiles_byfrac_noMP.err
#SBATCH --partition=bigmem2
#SBATCH --mem=100G
#SBATCH --mail-type=END

module load Anaconda3
source activate three-prime-env  


samtools sort  /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllTotalSamples.MergedBamFiles.noMP.bam &gt; /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllTotalSamples.MergedBamFiles.noMP.sort.bam
samtools index /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllTotalSamples.MergedBamFiles.noMP.sort.bam

samtools sort  /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllNuclearSamples.MergedBamFiles.noMP.bam &gt; /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllNuclearSamples.MergedBamFiles.noMP.sort.bam

samtools index /project2/gilad/briana/threeprimeseq/data/mergedBams_NoMP/AllNuclearSamples.MergedBamFiles.noMP.sort.bam</code></pre>
<p>Take a look at some of the top hits in IGV:</p>
<pre class="r"><code>slice(filterHighPSI,1:15)</code></pre>
<pre><code>                             intron     logef           Nuclear
1    chr2:108464199:108464261:RGPD4 -1.563069 0.786325014667375
2   chr11:71409928:71409980:FAM86C1 -1.750522 0.736879355169336
3    chr9:19443338:19443405:SLC24A2 -1.373452  0.87662816529528
4  chr7:151725930:151726019:GALNTL5 -1.229096 0.766827065986407
5      chr17:49248730:49248817:NME2 -1.185121 0.763505415306066
6    chr7:100970341:100970385:IFT22 -1.588218 0.668693582230009
7  chr3:142422173:142422217:PCOLCE2 -1.160742 0.747193256154251
8   chr12:121337613:121337664:HNF1A -1.439657 0.922672953632586
9    chr8:134586015:134586097:WISP1 -1.828958 0.861221198355898
10   chr15:76234769:76234851:FBXO22 -1.494914 0.702484460762617
11    chr13:65049589:65049665:PCDH9 -1.585817 0.677762517045467
12    chr20:30651036:30651094:CCM2L -1.657778 0.668557867828921
13     chr17:44777988:44778040:WNT3 -1.282798  0.61467112951709
14   chr4:122775166:122775266:TRPC3 -1.269060  0.61759501169366
15      chr3:60357325:60357413:FHIT -1.266671 0.610069267936541
               Total   deltapsi
1  0.139047439399753 -0.6472776
2  0.169115629694331 -0.5677637
3  0.313023025601665 -0.5636051
4  0.219649286101815 -0.5471778
5  0.231787036297737 -0.5317184
6  0.140656353549046 -0.5280372
7  0.224821665761922 -0.5223716
8   0.40129250694973 -0.5213804
9   0.34267701803463 -0.5185442
10 0.192803741815497 -0.5096807
11 0.171781437839461 -0.5059811
12 0.162584556335585 -0.5059733
13 0.109232403935245 -0.5054387
14 0.113169455110416 -0.5044256
15 0.110490691620035 -0.4995786</code></pre>
<p>It would be good to know the mean usage scores for these as well:</p>
<pre class="r"><code>names=c(&quot;chr&quot;, &quot;start&quot;,&quot;end&quot;,&quot;gene&quot;,&quot;strand&quot;, &quot;peak&quot;, &quot;meanUsage&quot;)
tot_usage=read.table(&quot;../data/PeakUsage_noMP/filtered_APApeaks_merged_allchrom_refseqGenes.TranscriptNoMP_sm_quant.Total_fixed.pheno.5percPeaks.txt&quot;, stringsAsFactors = F,col.names = names)
nuc_usage=read.table(&quot;../data/PeakUsage_noMP/filtered_APApeaks_merged_allchrom_refseqGenes.TranscriptNoMP_sm_quant.Nuclear_fixed.pheno.5percPeaks.txt&quot;,stringsAsFactors = F,col.names = names)</code></pre>
<ul>
<li>RGPD4:</li>
</ul>
<p>Intronic Peak in the gene 64775-</p>
<p>Total mean usage: 0.1389744 Nuclear mean Usage: 0.7933333</p>
<p>The other peak in this gene is downstream of the annotated gene. 64776</p>
<p>Total mean Usage: 0.8610256 Nuclear mean Usage: 0.2066667</p>
<p>The nuclear has the truncated internal transcript.</p>
<ul>
<li>FAM86C1</li>
</ul>
<p>The most used peak in this gene in the nuclear fraction is peak20610 it is chr11:71409928:71409980. This peak is upstream of the gene. This means the peak is probably not real. However this does not look like a misprimed read.</p>
<ul>
<li>SLC24A2</li>
</ul>
<p>There are 2 peaks for this gene. They are downstream of the annoated in another gene but because they are on the opposite strand from the other gene, it makes sense for them to map to SLC24A2.</p>
<p>This means that without a strand specific protocol it would be difficult to see this effect.</p>
<p>Peak 118136 Total: 0.3012821 Nucelar: 0.8553846</p>
<p>Peak 118140 Total: 0.6987179 Nuclear: 0.1446154</p>
<p>The extended transcript is used in the nuclear.</p>
<ul>
<li>GALNTL5</li>
</ul>
<p>This gene is close to the GALNT11 gene. They are also going in the same direction. The peak here is in the first intron of the GALNT11 gene. The other peak is way upsteream and looks like it is near the end of the PRKAG2-ASI gene chr7:151,570,858-151,585,995.</p>
<ul>
<li>NME2</li>
</ul>
<p>Peak 50663</p>
<p>Total: 0.1615385 Nuclear:0.7335897</p>
<p>The other annotated peak is Peak 50662 Total:0.5564103 Nuclear:0.2407692</p>
<p>50663 is directly upstream of the UTR and 50662 is uspream in one of the introns.</p>
<ul>
<li>IFT22</li>
</ul>
<p>This gene has 3 peaks in the filtered peaks however the one called here is upstream of the gene. The other two make a lot more sense:</p>
<p>109623: Total:0.1310256 Nuclear:0.63794872</p>
<p>109622:<br />
Total: 0.1866667 Nuclear: 0.08897436</p>
<p>109621: Total:0.6371795 Nuclear: 0.23717949</p>
<p>From these results it looks like there is still differencial ussage in this gene. Peak 109621 is also significant with a delta PSI of 0.45275927.</p>
<ul>
<li>PCOLCE2</li>
</ul>
<p>This peak is pretty far downstream of the gene but other genes in the region go the opposite way:</p>
<p>peak83037: Total:0.2151282 Nuclear: 0.7323077</p>
<p>peak83045: This peak is upstream of the UTR for the gene in one of the exons:<br />
Total: 0.7848718 Nuclear: 0.2676923</p>
<p>These results look like the gene has significant run through in the nuclear fraciton.</p>
<ul>
<li>HNF1A</li>
</ul>
<p>The 2 called peaks for this gene are upstream of the gene. Probably not actually correct. It does not look like any peaks in the track would work for this gene.</p>
<ul>
<li>WISP1</li>
</ul>
<p>The peak is very far downstream but not in the wrong direciton for other peaks in the region. All of the peaks for this gene are pretty far downstream. I am not sure how best to know if these are real or not.</p>
<ul>
<li>FBXO22</li>
</ul>
<p>The peak here is downstream of the gene close to the UTR of the next gene but because it goes the other direction, I can be confident in the assignemnt.</p>
<p>peak41217: Total: 0.16333333 Nuclear: 0.4917949</p>
<p>The other peaks in this gene are peak41206 in both fractions and low usage of peak41207 in total.</p>
<p>peak41206: Total: 0.58538462 Nuclear: 0.1776923</p>
<p>Again this looks like a case of run on read in the nuclear</p>
<ul>
<li>PCDH9</li>
</ul>
<p>The peak is downsteam of the gene but it is in a region with many LINCs. This peak is 32303. THe other peaks for the gene are 32304 and 32305. Oeak 32305 is upstream meanin it is probably not real.</p>
<p>peak32303: Total:0.1761538 Nuclear: 0.6843590</p>
<p>Peak 32304: Total: 0.5946154 Nuclear: 0.2479487</p>
<p>This is another case of run on for the nuclear transcript.</p>
<ul>
<li><p>CCM2L<br />
The peak is downstream in on if the first exons of the HCK gene. It is difficult to know which you should assign the peak to because they are in the same orientation. All three of the called peaks for this gene are downstream in the HCK gene.</p></li>
<li><p>WINT3:</p></li>
</ul>
<p>The peak is downstream in the NSF gene but the NSF gene goes the other direction. Both peaks for this gene are downstream in the NSF gene.</p>
<p>Peak 50131<br />
Total: 0.1087179 Nuclear: 0.5658974</p>
<p>Peak 50134<br />
Total: 0.8912821 Nuclear: 0.4084615</p>
<p>Again here we see more run on for the nuclear fraction transcript.</p>
<ul>
<li>TRPC3</li>
</ul>
<p>The peak here is in an intron of the downstreem gene BBS7. It is difficult to know which gene to assign the peak because they go the same direction.</p>
<p>peak89306 total: 0.1207692 nuclear:0.6010256</p>
<p>peak89307<br />
total:0.8535897 nuclear: 0.3989744</p>
<p>Again here, if these peaks truly go with this gene, the nuclear transcript is the run on transcript.</p>
<ul>
<li>FHIT:</li>
</ul>
<p>This peak is in an intron of the gene. This is also the only peak passing 5% in each fraction.</p>
<p>peak 79975:</p>
<p>Total:0.2602564<br />
Nucelar:0.05923077</p>
<p>It is hard to draw further conclusions here.</p>
<p>I now want to look at some intermediate examples that are less likely to be outliers:</p>
<pre class="r"><code>filterMidPSI=effectsize %&gt;% filter(abs(deltapsi)&gt;.1,  abs(deltapsi)&lt;.2) %&gt;% arrange(deltapsi)

slice(filterMidPSI, 1:20)</code></pre>
<pre><code>                             intron      logef           Nuclear
1        chr3:9824703:9824900:TADA3 -0.3663203 0.325432995242216
2    chr5:59173621:59173722:DEPDC1B -1.1379521 0.270435823995102
3      chr16:58746389:58746439:GOT2 -1.1858858 0.226137719840828
4       chr2:19534331:19534419:OSR1 -0.7480049 0.583655239822405
5    chr18:32824892:32825018:ZNF397 -0.8954891 0.332811386875754
6       chr5:27491617:27491709:CDH6 -0.3475314 0.340152540947897
7   chr12:26060628:26060675:BHLHE41 -1.1023581 0.284333101368843
8   chr8:126089041:126089194:WASHC5 -0.4613435 0.308742523793907
9   chr3:119762220:119762305:GPR156 -0.6754549 0.668378068472292
10   chr19:34702824:34702894:LSM14A -1.2379175 0.262806679986851
11   chr5:43482525:43482619:C5orf34 -0.5841307 0.237305958687913
12       chr7:17260895:17260936:AHR -1.0800012 0.257416122485577
13    chr7:98993610:98993692:ARPC1B -0.2835942 0.308021209679854
14   chr1:167612310:167612392:RCSD1 -0.4305542 0.291550619788812
15         chr7:5820254:5820322:OCM -0.4952238 0.779630820591706
16    chr16:3181668:3181753:ZSCAN10 -0.6372901 0.588452776814862
17      chr7:94260631:94260713:SGCE -0.9599485 0.279108870841054
18     chr19:47378498:47378583:FKRP -1.0682692 0.330580082811762
19 chr3:122481138:122481277:HSPBAP1 -0.4802691 0.433000666440278
20    chr3:12761795:12761880:TMEM40 -0.4885477 0.689878926943269
                Total   deltapsi
1   0.125571306501736 -0.1998617
2  0.0707522472721187 -0.1996836
3  0.0265447102065377 -0.1995930
4   0.384077797410421 -0.1995774
5   0.133435407762397 -0.1993760
6    0.14078100359469 -0.1993715
7  0.0850206759231001 -0.1993124
8   0.109466992933229 -0.1992755
9   0.469129223532137 -0.1992488
10 0.0636106463287395 -0.1991960
11 0.0381194363145641 -0.1991865
12 0.0583828866891181 -0.1990332
13  0.108998470216784 -0.1990227
14 0.0925532620325678 -0.1989974
15  0.580643826090524 -0.1989870
16  0.389493388354609 -0.1989594
17 0.0802511202513235 -0.1988578
18  0.131890177247821 -0.1986899
19  0.234320158423806 -0.1986805
20  0.491259391799099 -0.1986195</code></pre>
<ul>
<li>Tada3</li>
</ul>
<p>This gene has 2 annotated UTRs. This peak is in the upstream 3’UTR.</p>
<p>Peak 7742:</p>
<p>Total:0.1382051 Nuclear: 0.29974359</p>
<p>The total only has 1 more annoated here: peak77240. This peak is in the downstream 3’ UTR</p>
<p>Total:0.7933333</p>
<p>Nucelar:0.47923077</p>
<p>The nuclear has 2 more upstream peaks here:</p>
<p>Peak77243:0.08589744 Peak77245:0.13461538</p>
<ul>
<li>DEPDC1B</li>
</ul>
<p>This gene is close to PDE4D. The sig peak is in this gene. The 1 peak in the 3’ UTR of DEPDC1B is peak92690.</p>
<p>*GOT2</p>
<p>The internal peak for this gene is used less than 5% in the total and 22% in the nuclear. These are the only 2 peaks in our filtered set for the gene.</p>
<p>Peak45274: Total &lt;5% Nuclear: 0.2202564</p>
<p>Peak45272: Total:0.9438462 Nuclear : 0.2202564</p>
<ul>
<li>OSR1</li>
</ul>
<p>The 3 peaks for this gene are downstream of the gene with no other peaks in the region.</p>
<p>Peak 60538: Total:0.3035897 Nucelar: 0.5566667</p>
<p>The total fraction uses the one peak downstream at about the same rate.</p>
<ul>
<li>ZNF397</li>
</ul>
<p>This gene also looks like it has multiple annotated UTRs.<br />
Peak 53787<br />
Total: 0.10461538<br />
Nuclear: 0.22794872</p>
<p>This gene has 5 peaks and they run into the next gene downstream. Peaks 53807 and 53808 should probaby be assigned to ZNF271P but it is hard to tell.</p>
<ul>
<li>CDH6</li>
</ul>
<p>These peaks should probably go to the LNC RNA LINC01021.</p>
<ul>
<li>BHLHE41</li>
</ul>
<p>This doesnt make sense</p>
<ul>
<li>WASHC5</li>
</ul>
<p>This is a good example. peak116684 is in an intron of the gene and peak116678 is at the end in UTR.<br />
peak116684:<br />
total:0.09589744 nuclear: 0.25256410</p>
<p>peak116678: total: 0.72538462 nuclear: 0.50384615</p>
<p>The other used peak in nucelar is peak116687 with 0.06641026.</p>
<ul>
<li>GPR156</li>
</ul>
<p>The peaks are downstream in GSK3B.</p>
<ul>
<li>LSM14A</li>
</ul>
<p>chr19:34702824:34702894</p>
<p>peak57649: This peak is in the intron of the gene.</p>
<p>Total: &lt;5% Nuclear:0.16358974</p>
<p>In the total there are three peaks for this gene at over 5%.</p>
<p>peak57651 total:0.2943590 Nuclear:0.12179487</p>
<p>peak57655 Total:0.1356410 nuclear: 0.05461538</p>
<p>peak57656 total:0.2941026 nuclear: 0.10871795</p>
<p>Peaks 57655 adn 57656 are in the UTR.</p>
<p>This is a good example of internal PAS in the nuclear fraction.</p>
<ul>
<li><p>C5orf34 chr5:43482525:43482619: peak912217. This peak is in the next gene downstream in the same direction. It is hard to tell which gene this goes with.</p></li>
<li><p>AHR: Peak 106198 This peak is upstream of the gene. Probably not ream chr7:17260895:17260936</p></li>
</ul>
<p>However 106205 is an interesting peak here. It is not used at 5% in total but it used at almost 17% in nuclear.</p>
<ul>
<li>ARPC1B chr7:98993610:98993692 This is peak 109344. It is downstream in PDAP1 but also looks like its still the annoataed UTR for the ARPC1B gene . But this gene is the other direction.</li>
</ul>
<p>peak 109344: total:0.1546154 nuclear: 0.27615385</p>
<p>The nuclear has 6 peaks at &gt; 5% for this gene and total only has 2. The most used total is peak109343. But it does not look like that in IGV. It looks like it uses 109344.</p>
<ul>
<li>RCSD1</li>
</ul>
<p>peak 8281</p>
<p>total: 0.08051282 nuclear: 0.21512821</p>
<p>peak8293 is the other used peak in total. It is in the annoated 3’ UTR</p>
<p>total:0.67102564 nuclear:0.33769231</p>
<p>There are 2 more peaks also used at moderate levels here.</p>
<ul>
<li><p>OCM Peak is upstream. Not correct</p></li>
<li><p>ZSCAN10 peak is upstream. not correct</p></li>
<li><p>SGCE</p></li>
</ul>
<p>peak 190201 In an intron of the gene. total:0.10589744 nuclear: 0.2587179</p>
<p>The utr peak is 109198<br />
total:0.58205128 nuclear:0.3894872</p>
<ul>
<li>FKRB</li>
</ul>
<p>chr19:47378498:47378583</p>
<p>Looks like there must be a transcribed element here but thereis not an annotated gene. FKRB is pretty far upstream.</p>
<ul>
<li>HSPBAP1</li>
</ul>
<p>peak81962 total:0.26000000 nuclear: 0.41871795</p>
<p>the most used total peak is 81961 it is in the 3’ UTR</p>
<p>total: 0.64743590 nuclear:0.49000000</p>
<ul>
<li>TMEM40</li>
</ul>
<p>peak77434. The peaks for this gene are all outside. It is a bit confusing. 77435 is upstream so this is probably not correct</p>
<p>Conclusions:</p>
<ul>
<li><p>Gene to peak annotations are difficult, especially when the genes are in the same direction close to each other</p></li>
<li><p>We need to make sure peaks cannot be upstream of the gene.</p></li>
<li><p>We need to add LINC to the annotation</p></li>
</ul>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS  10.14.1

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2.2  cowplot_0.9.3   reshape2_1.4.3  forcats_0.3.0  
 [5] stringr_1.3.1   dplyr_0.7.6     purrr_0.2.5     readr_1.1.1    
 [9] tidyr_0.8.1     tibble_1.4.2    ggplot2_3.0.0   tidyverse_1.2.1
[13] workflowr_1.1.1

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.4  haven_1.1.2       lattice_0.20-35  
 [4] colorspace_1.3-2  htmltools_0.3.6   yaml_2.2.0       
 [7] rlang_0.2.2       R.oo_1.22.0       pillar_1.3.0     
[10] glue_1.3.0        withr_2.1.2       R.utils_2.7.0    
[13] modelr_0.1.2      readxl_1.1.0      bindr_0.1.1      
[16] plyr_1.8.4        munsell_0.5.0     gtable_0.2.0     
[19] cellranger_1.1.0  rvest_0.3.2       R.methodsS3_1.7.1
[22] evaluate_0.11     knitr_1.20        broom_0.5.0      
[25] Rcpp_0.12.19      scales_1.0.0      backports_1.1.2  
[28] jsonlite_1.5      hms_0.4.2         digest_0.6.17    
[31] stringi_1.2.4     grid_3.5.1        rprojroot_1.3-2  
[34] cli_1.0.1         tools_3.5.1       magrittr_1.5     
[37] lazyeval_0.2.1    crayon_1.3.4      whisker_0.3-2    
[40] pkgconfig_2.0.2   xml2_1.2.0        lubridate_1.7.4  
[43] assertthat_0.2.0  rmarkdown_1.10    httr_1.3.1       
[46] rstudioapi_0.8    R6_2.3.0          nlme_3.1-137     
[49] git2r_0.23.0      compiler_3.5.1   </code></pre>
</div>

<hr>
<p>
    
</p>
<hr>

<!-- To enable disqus, uncomment the section below and provide your disqus_shortname -->

<!-- disqus
  <div id="disqus_thread"></div>
    <script type="text/javascript">
        /* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
        var disqus_shortname = 'rmarkdown'; // required: replace example with your forum shortname

        /* * * DON'T EDIT BELOW THIS LINE * * */
        (function() {
            var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
            dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
            (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
        })();
    </script>
    <noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
    <a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
-->
<!-- Adjust MathJax settings so that all math formulae are shown using
TeX fonts only; see
http://docs.mathjax.org/en/latest/configuration.html.  This will make
the presentation more consistent at the cost of the webpage sometimes
taking slightly longer to load. Note that this only works because the
footer is added to webpages before the MathJax javascript. -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>

<hr>
<p>
  This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a>
  analysis was created with
  <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1
</p>
<hr>


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>