Day 1 - Session 1: for cut&paste in RStudio
scores<-read.table("modifications.gff",sep = "\t")
mean(scores[which(scores$V7 == '+'),6])
mean(scores[which(scores$V7 == '-'),6])
origin=1581900
terminus=3919956
genome=4639675
jpeg("modQVscores_MG1655.jpg", width=1200, height=600)
par(mfrow=c(2,1))
plot(scores[which(scores$V7 == '+'),4],scores[which(scores$V7 == '+'),6], pch='.',col="blue" , xlab="genome coordinates strain MG1655 (+) strand", ylab="modQV scores")
lines(c(origin,origin),c(-20,20), col="green", lwd=10)
lines(c(terminus,terminus),c(-20,20), col="darkgreen", lwd=10)
plot(scores[which(scores$V7 == '-'),4],scores[which(scores$V7 == '-'),6], pch='.',col="red",xlab="genome coordinates strain MG1655 (-) strand", ylab="modQV scores")
lines(c(origin,origin),c(-20,20), col="green", lwd=10)
lines(c(terminus,terminus),c(-20,20), col="darkgreen", lwd=10)
dev.off()
install.packages("ggplot2")
install.packages("Biostrings")
library(ggplot2)
library(Biostrings)
source('scripts.R')
#read gff file
gff <-'modifications.gff'
hits <-readModificationsGff(gff)
hits$CognateBase <-substr(hits$context, 21, 21)
head(hits)
summary(hits$coverage)
#plot frequency
qplot(score, colour=CognateBase, geom='freqpoly', data=hits, binwidth=1,xlim=c(0,200))+scale_y_continuous(breaks=seq(0,30000,1000))+ylim(c(0,2000))
#plot Score vs. Coverage
qplot(coverage, score, colour = CognateBase, alpha = I(0.2), data = subset(hits, CognateBase %in% c('A','C','G','T')))
#add a separation line
qplot(coverage, score, colour = CognateBase, alpha = I(0.2), data = subset(hits, CognateBase %in% c('A','C','G','T')), xlim=c(0,250), ylim=c(0,250))+geom_abline(slope=1,intercept=0)+geom_vline(x=25,xintercept=0)
#the coverage threshold cutoff is taken from the previous plot
covcutoff<-50 #insert your value here (e.g., 50)
#We select these "good" hits, then sort in decreasing order of score, so we consider the strongest signal first.
goodHits <-subset(hits, coverage > covcutoff)
goodHits <-subset(goodHits, score > coverage)
goodHits <-goodHits[order(goodHits$score, decreasing=T),]
workHits <-goodHits
#MEME Motif Finding
#We use the online motif finding server ’MEMEChip’ (http://meme-suite.org/tools/meme).
#The MEMEChip server requires the source sequences to uploaded in FASTA format.
#We write the context string of the top 1000 hits to ’contexts.fasta’.
writeContextFasta(workHits[1:1000,], 'contexts.fasta')
motif <-c('GATC')
position<-c(2)
motifLabels <-labelContexts(workHits$context, motif, position)
table(motifLabels)
workHits$label = motifLabels
remain <-workHits[which(motifLabels == 'None'),]
nrow(remain)
writeContextFasta(remain[1:1000,], 'remaining.fasta')
#In our example, the 2 motifs are ’GATC’, ’AACNNNNNNGTGC’ as methylated sequence motifs.
motif <-c('GATC', 'AACNNNNNNGTGC')
position <-c(2,2)
motifLabels <-labelContexts(workHits$context, motif, position)
table(motifLabels)
workHits$label = motifLabels
remain <-workHits[which(motifLabels == 'None'),]
nrow(remain)
motif <-c('GATC', 'AACNNNNNNGTGC', 'GCACNNNNNNGTT')
position <-c(2,2,3)
motifLabels <-labelContexts(workHits$context, motif, position)
table(motifLabels)
workHits$label = motifLabels
remain <-workHits[which(motifLabels == 'None'),]
nrow(remain)
#Load the genome sequence
seq_path <-'ecoli_K12_MG1655.fasta'
dna_seq <-readDNAStringSet(seq_path)
motif <-c('GATC', 'AACNNNNNNGTGC', 'GCACNNNNNNGTT')
position <-c(2,2,3) #position of the modified base in the motifs string
genomeAnnotations <-genomeAnnotation(dna_seq, motif, position)
head(genomeAnnotations)
table(genomeAnnotations$motif)
#trick to get the sequence number (seqid) from seqname
workHits$seqid <- 1
#extract the hits and their annotations
mm <-merge(workHits, genomeAnnotations, all=T)
mm$motif[is.na(mm$motif)] <-'NoMotif'
mm$feature[is.na(mm$feature)] <-'not_detected'
table(mm$feature, mm$motif)
#the plot shows better the difference distribution scores between motifs vs no motifs
qplot(score, ..density.., colour=motif, geom='freqpoly', data=subset(mm, feature %in% c('m6A','modified_base')), binwidth=3, xlim=c(0,200))
hits$seqid <- 1
mma <-merge(hits, genomeAnnotations, all=T)
mma$motif[is.na(mma$motif)] <-'NoMotif'
mma$feature[is.na(mma$feature)] <-'not_detected'
table(mma$feature, mma$motif)
#select the motif and merge the ipdratios by position
ipdplus<-subset(mma, motif == 'GATC' & strand == '+',
select=c(start,strand,seqid,IPDRatio))
ipdminus<-subset(mma, motif == 'GATC' & strand == '-',
select=c(start,strand,seqid,IPDRatio))
#correct for difference in position between the strands "1" in this case
ipdminus$start<-ipdminus$start-1
mm2<-merge(ipdplus,ipdminus,by=c("seqid","start"))
#rename columns
colnames(mm2)<-c("start","seqid","strand.plus","IPDRatio.plus","strand.minus","IPDRatio.minus")
#plot ipdplus vs minus
qplot(IPDRatio.plus, IPDRatio.minus, geom='auto', data=mm2, xlim=c(0,15),ylim=c(0,15))
genomeSize <- sum(width(dna_seq))
csv <-'modifications.csv'
rawKin <-read.csv(csv) #warning slow!
rawKin$seqid <-1
rawKin$strand[rawKin$strand == 1] <-'-'
rawKin$strand[rawKin$strand == 0] <-'+'
mga <-merge(genomeAnnotations, rawKin, by.x=c('start', 'strand','seqid'),
by.y=c('tpl', 'strand','seqid'))
table(mga$motif, mga$score > mga$coverage)
#select the motif and merge the ipdratios by position
ipdplus<-subset(mga, motif == 'GATC' & strand == '+',select=c(start,strand,seqid,ipdRatio))
ipdminus<-subset(mga, motif == 'GATC' & strand == '-',select=c(start,strand,seqid,ipdRatio))
#correct for difference in position between the strands "1" in this case
ipdminus$start<-ipdminus$start-1
mm2<-merge(ipdplus,ipdminus,by=c("seqid","start"))
#rename columns
colnames(mm2)<-c("seqid","start","strand.plus","IPDRatio.plus","strand.minus","IPDRatio.minus")
#plot ipdplus vs minus
qplot(IPDRatio.plus, IPDRatio.minus, geom='auto', data=mm2, xlim=c(0,15),ylim=c(0,15))
colsToShow <-c('refName','start', 'strand', 'motif', 'score', 'coverage', 'ipdRatio')
subset(mga, score < 20 & motif=='GATC')[,colsToShow]
#Plot ipdRatio, score or coverage by position
#targetRef is the chromosome number
targetRef <- 1
comp_dnaSeq <- complement(dna_seq)
pos <- 1:width(dna_seq[targetRef,])
tplBase <-data.frame(tpl=pos,strand='+',base=unlist(strsplit(as.character(dna_seq[targetRef,]),split='')),row.names=NULL)
rTplBase <-data.frame(tpl=pos,strand='-',base=unlist(strsplit(as.character(comp_dnaSeq[targetRef,]),split='')),row.names=NULL)
tplBase <- rbind(tplBase,rTplBase)
newRaw <- merge(tplBase,rawKin,by.x=c('tpl','strand'),by.y=c('tpl','strand'))
### plot random selection of hits ###
statMode <- 'ipdRatio'
plotRange <- c(10000,12000)
rangePerPlot <- 200
subHits <- workHits[which(workHits$label=='GATC'),]
sampleSize <- 5
subDat <- subHits[which(subHits$seqid==targetRef),]
if (nrow(subDat) < sampleSize) sampleSize <- nrow(subDat)
nsamples=sample(subDat$start,sampleSize,replace=F)
pdf(file='GATCExamples.pdf',height=6,width=25)
for (s in 1:length(nsamples))
{
plotRange[1] <- nsamples[s]-round(rangePerPlot/2)
plotRange[2] <- nsamples[s]+round(rangePerPlot/2)
plotKinetics(newRaw,targetRef,statMode,plotRange,rangePerPlot)
}
dev.off()