Last updated: 2017-11-18
Code version: affe5d0
Goal: Extract the number of bases that mapped and were soft clipped from bam files and make a histogram
Look first at the original bam files from the snake mapping.
To look at bam files in python, use pysam. I added this to my environment.
Process:
#!/usr/bin/env python
import sys
import pysam
import re
mapped_num = open("mapped_18486_dep.txt", "w")
file_bam_open= open("/project2/gilad/briana/Net-seq/Net-seq1/data/sort/YG-SP-NET1-18486-dep-2017-10-13_S4_R1_001-sort.bam", "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
def get_mapped(file):
mapped=[]
for read in file.fetch():
read= str(read)
cigar= read.split()[5] #python starts at 0
num= re.findall("\d+M", cigar) #this gives me each "##M" in the cigar
for i in num:
mapped.append(i[:-1])
return(mapped)
a= get_mapped(file_bam)
mapped_num.write("\n".join(a))
mapped_num.close()
What I want:
6th column (sep by tab) of the line. ex: 7S23M14S. I want the all the numbers before the M in this string.
Pull the txt file with all the numbers into R and make a histogram.
scp brimittleman@midway2.rcc.uchicago.edu:/project2/gilad/briana/Net-seq/Net-seq1/data/sort/mapped_18486_dep.txt .
mapped=read.csv("../data/mapped_18486_dep.txt", header=FALSE)
map.vec= as.vector(mapped[,1])
a= hist(map.vec, xlab="Bases", main="Histogram of 18486 mapped reads from cigar string")
summary(map.vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 21.00 23.00 23.16 26.00 44.00
Change the code so I only add the max mapped value from each read to the file. get_mapped_max.py
#!/usr/bin/env python
import sys
import pysam
import re
mapped_num = open("mapped_18486_dep_max.txt", "w")
file_bam_open= open("/project2/gilad/briana/Net-seq/Net-seq1/data/sort/YG-SP-NET1-18486-dep-2017-10-13_S4_R1_001-sort.bam", "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
def get_mapped(file):
mapped=[]
for read in file.fetch():
read=str(read)
cigar= read.split()[5] #python starts at 0
num= re.findall("\d+M", cigar) #this gives me each "##M" in the cigar
m=[]
for i in num:
m.append(i[:-1])
mapped.append(max(m))
return(mapped)
a=get_mapped(file_bam)
mapped_num.write("\n".join(a))
mapped_num.close()
mapped.max=read.csv("../data/mapped_18486_dep_max.txt", header=FALSE)
map.vec.max= as.vector(mapped.max[,1])
hist(map.vec.max, xlab="Bases", main="Histogram of max mapped reads from cigar string")
summary(map.vec.max)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.00 22.00 23.00 23.87 27.00 44.00
I will change the code so it looks for the digits before the S in the cigar string.
#!/usr/bin/env python
import sys
import pysam
import re
clip_num = open("clip_18486_dep.txt", "w")
file_bam_open= open("/project2/gilad/briana/Net-seq/Net-seq1/data/sort/YG-SP-NET1-18486-dep-2017-10-13_S4_R1_001-sort.bam", "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
def get_clip(file):
clip=[]
for read in file.fetch():
read= str(read)
cigar= read.split()[5] #python starts at 0
num= re.findall("\d+S", cigar) #this gives me each "##S" in the cigar
for i in num:
clip.append(i[:-1])
return(clip)
c= get_clip(file_bam)
clip_num.write("\n".join(c))
clip_num.close()
clip=read.csv("../data/clip_18486_dep.txt", header=FALSE)
clip.vec= as.vector(clip[,1])
hist(clip.vec, xlab="Bases", main="Histogram of clipped reads from cigar string")
summary(clip.vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 8.00 14.00 13.85 19.00 26.00
Use the sys.argv[1] command to pass in arguments from the command line. This script will generalize my get_mapped.py
#!/usr/bin/env python
import sys
import pysam
import re
file_name = sys.argv[1]
out_file = sys.argv[2]
#use by passing a sorted bam file, an output text file
mapped_num = open(out_file, "w")
file_bam_open= open(file_name, "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
def get_mapped(file):
out=[]
for read in file.fetch():
read= str(read)
cigar= read.split()[5] #python starts at 0
num= re.findall("\d+M", cigar) #this gives me each "##M" in the cigar
for i in num:
out.append(i[:-1])
return(out)
a= get_mapped(file_bam)
mapped_num.write("\n".join(a))
mapped_num.close()
./get_mapped.py /project2/gilad/briana/Net-seq/Net-seq1/data/sort/YG-SP-NET1-18508-dep-2017-10-13_S2_R1_001-sort.bam mapped_18508_dep.txt
./get_mapped.py /project2/gilad/briana/Net-seq/Net-seq1/data/sort/YG-SP-NET1-Unk1_S6_R1_001-sort.bam mapped_19238_dep.txt
/project2/gilad/briana/Net-seq/Net-seq1/data/sort/get_mapped.py /project2/gilad/briana/Net-seq/data/sort/SRR1575922-sort.bam /project2/gilad/briana/Net-seq/data/sort/mapped_mayer.txt
mapped_mayer=read.csv("../data/mapped_mayer.txt", header=FALSE)
map.mayer.vec= as.vector(mapped_mayer[,1])
d= hist(map.mayer.vec, xlab="Bases", main="Histogram of mayer mapped reads from cigar string")
summary(map.mayer.vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 21.0 32.0 28.1 38.0 70.0
mapped_18508=read.csv("../data/mapped_18508_dep.txt", header=FALSE)
map.18508.vec= as.vector(mapped_18508[,1])
b= hist(map.18508.vec, xlab="Bases", main="Histogram of 18508 mapped reads from cigar string")
summary(map.18508.vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 13.00 22.00 19.21 23.00 44.00
mapped_19238=read.csv("../data/mapped_19238_dep.txt", header=FALSE)
map.19238.vec= as.vector(mapped_19238[,1])
c= hist(map.19238.vec, xlab="Bases", main="Histogram of 19238 mapped reads from cigar string")
summary(map.19238.vec)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 13.00 22.00 19.41 23.00 44.00
par(mfrow = c(2,2))
hist(map.vec, xlab="Bases", main="Histogram of 18486 mapped reads")
hist(map.18508.vec, xlab="Bases", main="Histogram of 18508 mapped reads")
hist(map.19238.vec, xlab="Bases", main="Histogram of 19238 mapped reads")
hist(map.mayer.vec, xlab="Bases", main="Histogram of mayer reads")
Example bam line:
700819F:582:HNHYYBCXY:1:2214:10312:72354_TGATCC 0 1 13537 13 19M25S * 0 0 CGGTGTTTGTCATGGGCCTAATTTCGTATGCCGTCTTCTGCTTG IIIIIGIIIIIIIGGIIGIIGGIIGGGIIIIGIIIIIIIGIIII HI:i:1 NH:i:1 NM:i:0
The cigar is column 6. The read is column 10.
import re
cigar= "18S19M7S"
read= "TCTGCAACAGCTGCCCCTGATCTCGTATGCCGTCTTCTGCTTGA"
num_M= re.findall("\d+[SM]", cigar)
num_int= re.split("[SM]", cigar)
num_int= num_int[0:-1]
num_int=list(map(int,num_int))
cum_sum= [sum(num_int[:i]) for i in range(1, len(num_int)+1)]
cum_sum= [0] + cum_sum
split_string= [read[i:j] for i, j in zip(cum_sum[:-1], cum_sum[1:])]
num_all= []
for i in num_M:
if "S" in i:
x= re.findall("\d+", i)
x = int(x[0])
num_all.append(x)
max_S= max(num_all)
for i in range(len(num_M)):
if num_int[i] == max_S:
break
final = split_string[i]
#!/usr/bin/env python
import sys
import pysam
import re
file_name = sys.argv[1]
out_file = sys.argv[2]
file_bam_open= open(file_name, "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
softclip_tot = open(out_file, "w")
def soft_clip(file):
out= []
for a in file.fetch():
line=str(a)
cigar= line.split()[5]
read= line.split()[9]
num_M= re.findall("\d+[SM]", cigar)
num_int= re.split("[A-Z]", cigar)
num_int= num_int[0:-1]
num_int=list(map(int,num_int))
cum_sum= [sum(num_int[:i]) for i in range(1, len(num_int)+1)]
cum_sum= [0] + cum_sum
split_string= [read[i:j] for i, j in zip(cum_sum[:-1], cum_sum[1:])]
num_all= []
for i in num_M:
if "S" in i:
x= re.findall("\d+", i)
x = int(x[0])
num_all.append(x)
if len(num_all) != 0:
max_S= max(num_all)
for i in range(len(num_M)):
if num_int[i] == max_S:
break
final = split_string[i]
out.append(final)
return(out)
run= soft_clip(file_bam)
softclip_tot.write("\n".join(run))
softclip_tot.close()
Write bash file to submit this where I input the input bam and output txt.
#!/bin/bash
#SBATCH --job-name=run_get_soft_clip
#SBATCH --time=8:00:00
#SBATCH --partition=broadwl
#SBATCH --mem=20G
#SBATCH --tasks-per-node=4
module load Anaconda3
source activate net-seq
#$1 input file name
#$2 output file
/project2/gilad/briana/Net-seq/Net-seq1/data/sort/get_soft_clip.py $1 $2
Submit for 18486 dep:
sbatch submit_get_soft_clip.sh YG-SP-NET1-18486-dep-2017-10-13_S4_R1_001-sort.bam soft_clip_seq_18486_dep.txt
sbatch submit_get_soft_clip.sh /project2/gilad/briana/Net-seq/data/sort/SRR1575922-sort.bam /project2/gilad/briana/Net-seq/data/sort/soft_clip_mayer.txt
Complete and ready to analyze the txt files.
The quality score is in the sorted bam file.
This is column 5 in the bam file.
This script is called get_qual.py This is in /project2/gilad/briana/Net-seq/scripts
Run by passing it the bam file and an output txt file.
#!/usr/bin/env python
import sys
import pysam
import re
file_name = sys.argv[1]
out_file = sys.argv[2]
qual_score = open(out_file, "w")
file_bam_open= open(file_name, "r")
file_bam= pysam.AlignmentFile(file_bam_open, "rb")
def get_qual(file):
out=[]
for read in file.fetch():
read= str(read)
cigar= read.split()[4] #python starts at 0
out.append(cigar)
return(out)
a= get_qual(file_bam)
qual_score.write("\n".join(a))
qual_score.close()
Import data:
qual_18486= read.csv("../data/qual_18486_dep.txt", head=FALSE)
qual_18508= read.csv("../data/qual_18508_dep.txt", head=FALSE)
qual_19238= read.csv("../data/qual_19238_dep.txt", head=FALSE)
qual_mayer= read.csv("../data/qual_mayer.txt", head=FALSE)
Look at the summary:
summary(qual_18486)
V1
Min. : 6.0
1st Qu.:13.0
Median :13.0
Mean :14.3
3rd Qu.:13.0
Max. :40.0
summary(qual_18508)
V1
Min. : 6.00
1st Qu.:13.00
Median :13.00
Mean :12.59
3rd Qu.:13.00
Max. :40.00
summary(qual_19238)
V1
Min. : 6.00
1st Qu.:13.00
Median :13.00
Mean :12.88
3rd Qu.:13.00
Max. :40.00
summary(qual_mayer)
V1
Min. : 6.00
1st Qu.:10.00
Median :13.00
Mean :21.11
3rd Qu.:40.00
Max. :40.00
Make map quality histograms:
par(mfrow = c(2,2))
hist(as.numeric(qual_18486[,1]), freq=FALSE, main="Quality 18486", xlab="Quality Score")
hist(as.numeric(qual_18508[,1]), freq=FALSE, main="Quality 18508", xlab="Quality Score")
hist(as.numeric(qual_19238[,1]), freq=FALSE, main="Quality 19238", xlab="Quality Score")
hist(as.numeric(qual_mayer[,1]), freq=FALSE, main= "Quality Mayer", xlab="Quality Score")
sessionInfo()
R version 3.4.2 (2017-09-28)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] compiler_3.4.2 backports_1.1.1 magrittr_1.5 rprojroot_1.2
[5] tools_3.4.2 htmltools_0.3.6 yaml_2.1.14 Rcpp_0.12.13
[9] stringi_1.1.5 rmarkdown_1.6 knitr_1.17 git2r_0.19.0
[13] stringr_1.2.0 digest_0.6.12 evaluate_0.10.1
This R Markdown site was created with workflowr