EDA22: Day 1 - Session 1: for cut&paste in RStudio

scores<-read.table("modifications.gff",sep = "\t")

mean(scores[which(scores$V7 == '+'),6])

mean(scores[which(scores$V7 == '-'),6])

origin=1581900

terminus=3919956

genome=4639675

jpeg("modQVscores_MG1655.jpg", width=1200, height=600)

par(mfrow=c(2,1))

plot(scores[which(scores$V7 == '+'),4],scores[which(scores$V7 == '+'),6], pch='.',col="blue" , xlab="genome coordinates strain MG1655 (+) strand", ylab="modQV scores")

lines(c(origin,origin),c(-20,20), col="green", lwd=10)

lines(c(terminus,terminus),c(-20,20), col="darkgreen", lwd=10)

plot(scores[which(scores$V7 == '-'),4],scores[which(scores$V7 == '-'),6], pch='.',col="red",xlab="genome coordinates strain MG1655 (-) strand", ylab="modQV scores")

lines(c(origin,origin),c(-20,20), col="green", lwd=10)

lines(c(terminus,terminus),c(-20,20), col="darkgreen", lwd=10)

dev.off()

install.packages("ggplot2")

install.packages("Biostrings")

library(ggplot2)

library(Biostrings)

source('scripts.R')

#read gff file

gff <-'modifications.gff'

hits <-readModificationsGff(gff)

hits$CognateBase <-substr(hits$context, 21, 21)

head(hits)

summary(hits$coverage)

#plot frequency

qplot(score, colour=CognateBase, geom='freqpoly', data=hits, binwidth=1,xlim=c(0,200))+scale_y_continuous(breaks=seq(0,30000,1000))+ylim(c(0,2000))

#plot Score vs. Coverage

qplot(coverage, score, colour = CognateBase, alpha = I(0.2), data = subset(hits, CognateBase %in% c('A','C','G','T')))

#add a separation line

qplot(coverage, score, colour = CognateBase, alpha = I(0.2), data = subset(hits, CognateBase %in% c('A','C','G','T')), xlim=c(0,250), ylim=c(0,250))+geom_abline(slope=1,intercept=0)+geom_vline(x=25,xintercept=0)

#the coverage threshold cutoff is taken from the previous plot

covcutoff<-50 #insert your value here (e.g., 50)

#We select these "good" hits, then sort in decreasing order of score, so we consider the strongest signal first.

goodHits <-subset(hits, coverage > covcutoff)

goodHits <-subset(goodHits, score > coverage)

goodHits <-goodHits[order(goodHits$score, decreasing=T),]

workHits <-goodHits

#MEME Motif Finding

#We use the online motif ﬁnding server ’MEMEChip’ (http://meme-suite.org/tools/meme).

#The MEMEChip server requires the source sequences to uploaded in FASTA format.

#We write the context string of the top 1000 hits to ’contexts.fasta’.

writeContextFasta(workHits[1:1000,], 'contexts.fasta')

motif <-c('GATC')

position<-c(2)

motifLabels <-labelContexts(workHits$context, motif, position)

table(motifLabels)

workHits$label = motifLabels

remain <-workHits[which(motifLabels == 'None'),]

nrow(remain)

writeContextFasta(remain[1:1000,], 'remaining.fasta')

#In our example, the 2 motifs are ’GATC’, ’AACNNNNNNGTGC’ as methylated sequence motifs.

motif <-c('GATC', 'AACNNNNNNGTGC')

position <-c(2,2)

motifLabels <-labelContexts(workHits$context, motif, position)

table(motifLabels)

workHits$label = motifLabels

remain <-workHits[which(motifLabels == 'None'),]

nrow(remain)

motif <-c('GATC', 'AACNNNNNNGTGC', 'GCACNNNNNNGTT')

position <-c(2,2,3)

motifLabels <-labelContexts(workHits$context, motif, position)

table(motifLabels)

workHits$label = motifLabels

remain <-workHits[which(motifLabels == 'None'),]

nrow(remain)

#Load the genome sequence

seq_path <-'ecoli_K12_MG1655.fasta'

dna_seq <-readDNAStringSet(seq_path)

motif <-c('GATC', 'AACNNNNNNGTGC', 'GCACNNNNNNGTT')

position <-c(2,2,3) #position of the modified base in the motifs string

genomeAnnotations <-genomeAnnotation(dna_seq, motif, position)

head(genomeAnnotations)

table(genomeAnnotations$motif)

#trick to get the sequence number (seqid) from seqname

workHits$seqid <- 1

#extract the hits and their annotations

mm <-merge(workHits, genomeAnnotations, all=T)

mm$motif[is.na(mm$motif)] <-'NoMotif'

mm$feature[is.na(mm$feature)] <-'not_detected'

table(mm$feature, mm$motif)

#the plot shows better the difference distribution scores between motifs vs no motifs

qplot(score, ..density.., colour=motif, geom='freqpoly', data=subset(mm, feature %in% c('m6A','modified_base')), binwidth=3, xlim=c(0,200))

hits$seqid <- 1

mma <-merge(hits, genomeAnnotations, all=T)

mma$motif[is.na(mma$motif)] <-'NoMotif'

mma$feature[is.na(mma$feature)] <-'not_detected'

table(mma$feature, mma$motif)

#select the motif and merge the ipdratios by position

ipdplus<-subset(mma, motif == 'GATC' & strand == '+',

select=c(start,strand,seqid,IPDRatio))

ipdminus<-subset(mma, motif == 'GATC' & strand == '-',

select=c(start,strand,seqid,IPDRatio))

#correct for difference in position between the strands "1" in this case

ipdminus$start<-ipdminus$start-1

mm2<-merge(ipdplus,ipdminus,by=c("seqid","start"))

#rename columns

colnames(mm2)<-c("start","seqid","strand.plus","IPDRatio.plus","strand.minus","IPDRatio.minus")

#plot ipdplus vs minus

qplot(IPDRatio.plus, IPDRatio.minus, geom='auto', data=mm2, xlim=c(0,15),ylim=c(0,15))

genomeSize <- sum(width(dna_seq))

csv <-'modifications.csv'

rawKin <-read.csv(csv) #warning slow!

rawKin$seqid <-1

rawKin$strand[rawKin$strand == 1] <-'-'

rawKin$strand[rawKin$strand == 0] <-'+'

mga <-merge(genomeAnnotations, rawKin, by.x=c('start', 'strand','seqid'),

by.y=c('tpl', 'strand','seqid'))

table(mga$motif, mga$score > mga$coverage)

#select the motif and merge the ipdratios by position

ipdplus<-subset(mga, motif == 'GATC' & strand == '+',select=c(start,strand,seqid,ipdRatio))

ipdminus<-subset(mga, motif == 'GATC' & strand == '-',select=c(start,strand,seqid,ipdRatio))

#correct for difference in position between the strands "1" in this case

ipdminus$start<-ipdminus$start-1

mm2<-merge(ipdplus,ipdminus,by=c("seqid","start"))

#rename columns

colnames(mm2)<-c("seqid","start","strand.plus","IPDRatio.plus","strand.minus","IPDRatio.minus")

#plot ipdplus vs minus

qplot(IPDRatio.plus, IPDRatio.minus, geom='auto', data=mm2, xlim=c(0,15),ylim=c(0,15))

colsToShow <-c('refName','start', 'strand', 'motif', 'score', 'coverage', 'ipdRatio')

subset(mga, score < 20 & motif=='GATC')[,colsToShow]

#Plot ipdRatio, score or coverage by position

#targetRef is the chromosome number

targetRef <- 1

comp_dnaSeq <- complement(dna_seq)

pos <- 1:width(dna_seq[targetRef,])

tplBase <-data.frame(tpl=pos,strand='+',base=unlist(strsplit(as.character(dna_seq[targetRef,]),split='')),row.names=NULL)

rTplBase <-data.frame(tpl=pos,strand='-',base=unlist(strsplit(as.character(comp_dnaSeq[targetRef,]),split='')),row.names=NULL)

tplBase <- rbind(tplBase,rTplBase)

newRaw <- merge(tplBase,rawKin,by.x=c('tpl','strand'),by.y=c('tpl','strand'))

### plot random selection of hits ###

statMode <- 'ipdRatio'

plotRange <- c(10000,12000)

rangePerPlot <- 200

subHits <- workHits[which(workHits$label=='GATC'),]

sampleSize <- 5

subDat <- subHits[which(subHits$seqid==targetRef),]

if (nrow(subDat) < sampleSize) sampleSize <- nrow(subDat)

nsamples=sample(subDat$start,sampleSize,replace=F)

pdf(file='GATCExamples.pdf',height=6,width=25)

for (s in 1:length(nsamples))

{

plotRange[1] <- nsamples[s]-round(rangePerPlot/2)

plotRange[2] <- nsamples[s]+round(rangePerPlot/2)

plotKinetics(newRaw,targetRef,statMode,plotRange,rangePerPlot)

}

dev.off()

Last modified: Monday, 27 June 2022, 11:20 AM