################################################### ### chunk number 1: options ################################################### options(width=72) ################################################### ### chunk number 2: main1 ################################################### library(Biostrings) pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede") ################################################### ### chunk number 3: main2 ################################################### pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", type = "local") ################################################### ### chunk number 4: main3 ################################################### pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", gapOpening = 0, gapExtension = -1) ################################################### ### chunk number 5: main4 ################################################### submat <- matrix(-1, nrow = 26, ncol = 26, dimnames = list(letters, letters)) diag(submat) <- 0 pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", substitutionMatrix = submat, gapOpening = 0, gapExtension = -1) ################################################### ### chunk number 6: main5 ################################################### submat <- matrix(-1, nrow = 26, ncol = 26, dimnames = list(letters, letters)) diag(submat) <- 0 pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", substitutionMatrix = submat, gapOpening = 0, gapExtension = -1, scoreOnly = TRUE) ################################################### ### chunk number 7: classes1 ################################################### psa1 <- pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede") class(psa1) ################################################### ### chunk number 8: classes2 ################################################### summary(psa1) class(summary(psa1)) ################################################### ### chunk number 9: classes3 ################################################### class(pattern(psa1)) submat <- matrix(-1, nrow = 26, ncol = 26, dimnames = list(letters, letters)) diag(submat) <- 0 psa2 <- pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", substitutionMatrix = submat, gapOpening = 0, gapExtension = -1) class(pattern(psa2)) ################################################### ### chunk number 10: helper1 ################################################### submat <- matrix(-1, nrow = 26, ncol = 26, dimnames = list(letters, letters)) diag(submat) <- 0 psa2 <- pairwiseAlignment(pattern = c("succeed", "precede"), subject = "supersede", substitutionMatrix = submat, gapOpening = 0, gapExtension = -1) score(psa2) nedit(psa2) nmatch(psa2) nmismatch(psa2) nchar(psa2) aligned(psa2) as.character(psa2) as.matrix(psa2) consensusMatrix(psa2) ################################################### ### chunk number 11: helper2 ################################################### summary(psa2) mismatchTable(psa2) mismatchSummary(psa2) ################################################### ### chunk number 12: helper3 ################################################### class(pattern(psa2)) aligned(pattern(psa2)) nindel(pattern(psa2)) start(subject(psa2)) end(subject(psa2)) ################################################### ### chunk number 13: editdist1 ################################################### agrepBioC <- function(pattern, x, ignore.case = FALSE, value = FALSE, max.distance = 0.1) { if (!is.character(pattern)) pattern <- as.character(pattern) if (!is.character(x)) x <- as.character(x) if (max.distance < 1) max.distance <- ceiling(max.distance / nchar(pattern)) characters <- unique(unlist(strsplit(c(pattern, x), "", fixed = TRUE))) if (ignore.case) substitutionMatrix <- outer(tolower(characters), tolower(characters), function(x,y) -as.numeric(x!=y)) else substitutionMatrix <- outer(characters, characters, function(x,y) -as.numeric(x!=y)) dimnames(substitutionMatrix) <- list(characters, characters) distance <- - pairwiseAlignment(pattern = x, subject = pattern, substitutionMatrix = substitutionMatrix, type = "local-global", gapOpening = 0, gapExtension = -1, scoreOnly = TRUE) whichClose <- which(distance <= max.distance) if (value) whichClose <- x[whichClose] whichClose } cbind(base = agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, value = TRUE), bioc = agrepBioC("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, value = TRUE)) cbind(base = agrep("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, ignore.case = TRUE), bioc = agrepBioC("laysy", c("1 lazy", "1", "1 LAZY"), max = 2, ignore.case = TRUE)) ################################################### ### chunk number 14: lkblo ################################################### data(BLOSUM50) BLOSUM50[1:4,1:4] nwdemo <- pairwiseAlignment(AAString("PAWHEAE"), AAString("HEAGAWGHEE"), substitutionMatrix = BLOSUM50, gapOpening = 0, gapExtension = -8) nwdemo compareStrings(nwdemo) pid(nwdemo) ################################################### ### chunk number 15: adapter1 ################################################### simulateReads <- function(N, adapter, experiment, substitutionRate = 0.01, gapRate = 0.001) { chars <- strsplit(as.character(adapter), "")[[1]] sapply(seq_len(N), function(i, experiment, substitutionRate, gapRate) { width <- experiment[["width"]][i] side <- experiment[["side"]][i] randomLetters <- function(n) sample(DNA_ALPHABET[1:4], n, replace = TRUE) randomLettersWithEmpty <- function(n) sample(c("", DNA_ALPHABET[1:4]), n, replace = TRUE, prob = c(1 - gapRate, rep(gapRate/4, 4))) nChars <- length(chars) value <- paste(ifelse(rbinom(nChars,1,substitutionRate), randomLetters(nChars), chars), randomLettersWithEmpty(nChars), sep = "", collapse = "") if (side) value <- paste(c(randomLetters(36 - width), substring(value, 1, width)), sep = "", collapse = "") else value <- paste(c(substring(value, 37 - width, 36), randomLetters(36 - width)), sep = "", collapse = "") value }, experiment = experiment, substitutionRate = substitutionRate, gapRate = gapRate) } adapter <- DNAString("GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTGAAA") set.seed(123) N <- 1000 experiment <- list(side = rbinom(N, 1, 0.5), width = sample(0:36, N, replace = TRUE)) table(experiment[["side"]], experiment[["width"]]) adapterStrings <- simulateReads(N, adapter, experiment, substitutionRate = 0.01, gapRate = 0.001) adapterStrings <- DNAStringSet(adapterStrings) ################################################### ### chunk number 16: adapter2 ################################################### M <- 5000 randomStrings <- apply(matrix(sample(DNA_ALPHABET[1:4], 36 * M, replace = TRUE), nrow = M), 1, paste, collapse = "") randomStrings <- DNAStringSet(randomStrings) ################################################### ### chunk number 17: adapter3 ################################################### ## Method 1: Use edit distance with an FDR of 1e-03 submat1 <- nucleotideSubstitutionMatrix(match = 0, mismatch = -1, baseOnly = TRUE) randomScores1 <- pairwiseAlignment(randomStrings, adapter, substitutionMatrix = submat1, gapOpening = 0, gapExtension = -1, scoreOnly = TRUE) quantile(randomScores1, seq(0.99, 1, by = 0.001)) adapterAligns1 <- pairwiseAlignment(adapterStrings, adapter, substitutionMatrix = submat1, gapOpening = 0, gapExtension = -1) table(score(adapterAligns1) > quantile(randomScores1, 0.999), experiment[["width"]]) ################################################### ### chunk number 18: adapter4 ################################################### ## Method 2: Use consecutive matches anywhere in string with an FDR of 1e-03 submat2 <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf, baseOnly = TRUE) randomScores2 <- pairwiseAlignment(randomStrings, adapter, substitutionMatrix = submat2, type = "local", gapOpening = 0, gapExtension = -Inf, scoreOnly = TRUE) quantile(randomScores2, seq(0.99, 1, by = 0.001)) adapterAligns2 <- pairwiseAlignment(adapterStrings, adapter, substitutionMatrix = submat2, type = "local", gapOpening = 0, gapExtension = -Inf) table(score(adapterAligns2) > quantile(randomScores2, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(start(pattern(adapterAligns2)) > 37 - end(pattern(adapterAligns2)), experiment[["side"]]) ################################################### ### chunk number 19: adapter5 ################################################### ## Method 3: Use consecutive matches on the ends with an FDR of 1e-03 submat3 <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf, baseOnly = TRUE) randomScores3 <- pairwiseAlignment(randomStrings, adapter, substitutionMatrix = submat3, type = "overlap", gapOpening = 0, gapExtension = -Inf, scoreOnly = TRUE) quantile(randomScores3, seq(0.99, 1, by = 0.001)) adapterAligns3 <- pairwiseAlignment(adapterStrings, adapter, substitutionMatrix = submat3, type = "overlap", gapOpening = 0, gapExtension = -Inf) table(score(adapterAligns3) > quantile(randomScores3, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(end(pattern(adapterAligns3)) == 36, experiment[["side"]]) ################################################### ### chunk number 20: adapter6 ################################################### ## Method 4: Allow mismatches and indels on the ends with an FDR of 1e-03 randomScores4 <- pairwiseAlignment(randomStrings, adapter, type = "overlap", scoreOnly = TRUE) quantile(randomScores4, seq(0.99, 1, by = 0.001)) adapterAligns4 <- pairwiseAlignment(adapterStrings, adapter, type = "overlap") table(score(adapterAligns4) > quantile(randomScores4, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(end(pattern(adapterAligns4)) == 36, experiment[["side"]]) ################################################### ### chunk number 21: adapter7 ################################################### ## Method 4 continued: Remove adapter fragments fragmentFound <- score(adapterAligns4) > quantile(randomScores4, 0.999) fragmentFoundAt1 <- fragmentFound & (start(pattern(adapterAligns4)) == 1) fragmentFoundAt36 <- fragmentFound & (end(pattern(adapterAligns4)) == 36) cleanedStrings <- as.character(adapterStrings) cleanedStrings[fragmentFoundAt1] <- as.character(narrow(adapterStrings[fragmentFoundAt1], end = 36, width = 36 - end(pattern(adapterAligns4[fragmentFoundAt1])))) cleanedStrings[fragmentFoundAt36] <- as.character(narrow(adapterStrings[fragmentFoundAt36], start = 1, width = start(pattern(adapterAligns4[fragmentFoundAt36])) - 1)) cleanedStrings <- DNAStringSet(cleanedStrings) cleanedStrings ################################################### ### chunk number 22: genome1 ################################################### data(phiX174Phage) genBankPhage <- phiX174Phage[[1]] nchar(genBankPhage) data(srPhiX174) srPhiX174 quPhiX174 summary(wtPhiX174) fullShortReads <- rep(srPhiX174, wtPhiX174) srPDict <- PDict(fullShortReads) table(countPDict(srPDict, genBankPhage)) ################################################### ### chunk number 23: genome2 ################################################### genBankSubstring <- substring(genBankPhage, 2793-34, 2811+34) genBankAlign <- pairwiseAlignment(srPhiX174, genBankSubstring, patternQuality = SolexaQuality(quPhiX174), subjectQuality = SolexaQuality(99L), type = "global-local") summary(genBankAlign, weight = wtPhiX174) revisedPhage <- replaceLetterAt(genBankPhage, c(2793, 2811), "TT") table(countPDict(srPDict, revisedPhage)) ################################################### ### chunk number 24: genome3 ################################################### genBankCoverage <- coverage(genBankAlign, weight = wtPhiX174) plot((2793-34):(2811+34), as.integer(genBankCoverage), xlab = "Position", ylab = "Coverage", type = "l") nchar(genBankSubstring) slice(genBankCoverage, lower = 1) ################################################### ### chunk number 25: profiling1 ################################################### N <- as.integer(seq(500, 5000, by = 500)) timings <- rep(0, length(N)) names(timings) <- as.character(N) for (i in seq_len(length(N))) { string1 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) string2 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) timings[i] <- system.time(pairwiseAlignment(string1, string2, type = "global"))[["user.self"]] } timings coef(summary(lm(timings ~ poly(N, 2)))) plot(N, timings, xlab = "String Size, Both Strings", ylab = "Timing (sec.)", type = "l", main = "Global Pairwise Sequence Alignment Timings") ################################################### ### chunk number 26: profiling2 ################################################### scoreOnlyTimings <- rep(0, length(N)) names(scoreOnlyTimings) <- as.character(N) for (i in seq_len(length(N))) { string1 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) string2 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) scoreOnlyTimings[i] <- system.time(pairwiseAlignment(string1, string2, type = "global", scoreOnly = TRUE))[["user.self"]] } scoreOnlyTimings round((timings - scoreOnlyTimings) / timings, 2) ################################################### ### chunk number 27: doal ################################################### file <- system.file("extdata", "someORF.fa", package="Biostrings") orf <- read.DNAStringSet(file, "fasta") orf orf10 <- DNAStringSet(orf, end=10) consensusMatrix(orf10, baseOnly=TRUE, freq=TRUE) ################################################### ### chunk number 28: infco ################################################### informationContent <- function(Lmers) { zlog <- function(x) ifelse(x==0,0,log(x)) co <- consensusMatrix(Lmers, freq=TRUE) lets <- rownames(co) fr <- alphabetFrequency(Lmers, collapse=TRUE)[lets] fr <- fr / sum(fr) sum(co*zlog(co/fr), na.rm=TRUE) } informationContent(orf10) ################################################### ### chunk number 29: ans1a ################################################### pairwiseAlignment("zyzzyx", "syzygy") pairwiseAlignment("zyzzyx", "syzygy", type = "local") pairwiseAlignment("zyzzyx", "syzygy", type = "overlap") ################################################### ### chunk number 30: ans1b ################################################### pairwiseAlignment("zyzzyx", "syzygy", type = "overlap", gapExtension = -Inf) ################################################### ### chunk number 31: ans2a ################################################### ex2 <- summary(pairwiseAlignment("zyzzyx", "syzygy")) nmatch(ex2) / nmismatch(ex2) ################################################### ### chunk number 32: ans3 ################################################### ex3 <- pairwiseAlignment("zyzzyx", "syzygy", type = "overlap") ################################################### ### chunk number 33: ans3a ################################################### nmatch(ex3) nmismatch(ex3) ################################################### ### chunk number 34: ans3b ################################################### compareStrings(ex3) ################################################### ### chunk number 35: ans3c ################################################### as.character(ex3) ################################################### ### chunk number 36: ans3d ################################################### mismatch(pattern(ex3)) ################################################### ### chunk number 37: ans3e ################################################### aligned(subject(ex3)) ################################################### ### chunk number 38: ans4a ################################################### submat <- matrix(-1, nrow = 26, ncol = 26, dimnames = list(letters, letters)) diag(submat) <- 0 - pairwiseAlignment("zyzzyx", "syzygy", substitutionMatrix = submat, gapOpening = 0, gapExtension = -1, scoreOnly = TRUE) ################################################### ### chunk number 39: ans4b ################################################### stringDist(c("zyzzyx", "syzygy", "succeed", "precede", "supersede")) ################################################### ### chunk number 40: ans5a ################################################### data(BLOSUM62) pairwiseAlignment(AAString("PAWHEAE"), AAString("HEAGAWGHEE"), substitutionMatrix = BLOSUM62, gapOpening = -12, gapExtension = -4) ################################################### ### chunk number 41: ans6a ################################################### adapter <- DNAString("GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTGAAA") set.seed(123) N <- 1000 experiment <- list(side = rbinom(N, 1, 0.5), width = sample(0:36, N, replace = TRUE)) table(experiment[["side"]], experiment[["width"]]) ex6Strings <- simulateReads(N, adapter, experiment, substitutionRate = 0.005, gapRate = 0.0005) ex6Strings <- DNAStringSet(ex6Strings) ex6Strings ## Method 1: Use edit distance with an FDR of 1e-03 submat1 <- nucleotideSubstitutionMatrix(match = 0, mismatch = -1, baseOnly = TRUE) quantile(randomScores1, seq(0.99, 1, by = 0.001)) ex6Aligns1 <- pairwiseAlignment(ex6Strings, adapter, substitutionMatrix = submat1, gapOpening = 0, gapExtension = -1) table(score(ex6Aligns1) > quantile(randomScores1, 0.999), experiment[["width"]]) ## Method 2: Use consecutive matches anywhere in string with an FDR of 1e-03 submat2 <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf, baseOnly = TRUE) quantile(randomScores2, seq(0.99, 1, by = 0.001)) ex6Aligns2 <- pairwiseAlignment(ex6Strings, adapter, substitutionMatrix = submat2, type = "local", gapOpening = 0, gapExtension = -Inf) table(score(ex6Aligns2) > quantile(randomScores2, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(start(pattern(ex6Aligns2)) > 37 - end(pattern(ex6Aligns2)), experiment[["side"]]) ## Method 3: Use consecutive matches on the ends with an FDR of 1e-03 submat3 <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf, baseOnly = TRUE) ex6Aligns3 <- pairwiseAlignment(ex6Strings, adapter, substitutionMatrix = submat3, type = "overlap", gapOpening = 0, gapExtension = -Inf) table(score(ex6Aligns3) > quantile(randomScores3, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(end(pattern(ex6Aligns3)) == 36, experiment[["side"]]) ## Method 4: Allow mismatches and indels on the ends with an FDR of 1e-03 quantile(randomScores4, seq(0.99, 1, by = 0.001)) ex6Aligns4 <- pairwiseAlignment(ex6Strings, adapter, type = "overlap") table(score(ex6Aligns4) > quantile(randomScores4, 0.999), experiment[["width"]]) # Determine if the correct end was chosen table(end(pattern(ex6Aligns4)) == 36, experiment[["side"]]) ################################################### ### chunk number 42: ans6b ################################################### simulateReads <- function(N, left, right = left, experiment, substitutionRate = 0.01, gapRate = 0.001) { leftChars <- strsplit(as.character(left), "")[[1]] rightChars <- strsplit(as.character(right), "")[[1]] if (length(leftChars) != length(rightChars)) stop("left and right adapters must have the same number of characters") nChars <- length(leftChars) sapply(seq_len(N), function(i) { width <- experiment[["width"]][i] side <- experiment[["side"]][i] randomLetters <- function(n) sample(DNA_ALPHABET[1:4], n, replace = TRUE) randomLettersWithEmpty <- function(n) sample(c("", DNA_ALPHABET[1:4]), n, replace = TRUE, prob = c(1 - gapRate, rep(gapRate/4, 4))) if (side) { value <- paste(ifelse(rbinom(nChars,1,substitutionRate), randomLetters(nChars), rightChars), randomLettersWithEmpty(nChars), sep = "", collapse = "") value <- paste(c(randomLetters(36 - width), substring(value, 1, width)), sep = "", collapse = "") } else { value <- paste(ifelse(rbinom(nChars,1,substitutionRate), randomLetters(nChars), leftChars), randomLettersWithEmpty(nChars), sep = "", collapse = "") value <- paste(c(substring(value, 37 - width, 36), randomLetters(36 - width)), sep = "", collapse = "") } value }) } leftAdapter <- adapter rightAdapter <- reverseComplement(adapter) ex6LeftRightStrings <- simulateReads(N, leftAdapter, rightAdapter, experiment) ex6LeftAligns4 <- pairwiseAlignment(ex6LeftRightStrings, leftAdapter, type = "overlap") ex6RightAligns4 <- pairwiseAlignment(ex6LeftRightStrings, rightAdapter, type = "overlap") scoreCutoff <- quantile(randomScores4, 0.999) leftAligned <- start(pattern(ex6LeftAligns4)) == 1 & score(ex6LeftAligns4) > pmax(scoreCutoff, score(ex6RightAligns4)) rightAligned <- end(pattern(ex6RightAligns4)) == 36 & score(ex6RightAligns4) > pmax(scoreCutoff, score(ex6LeftAligns4)) table(leftAligned, rightAligned) table(leftAligned | rightAligned, experiment[["width"]]) ################################################### ### chunk number 43: ans7a ################################################### genBankFullAlign <- pairwiseAlignment(srPhiX174, genBankPhage, patternQuality = SolexaQuality(quPhiX174), subjectQuality = SolexaQuality(99L), type = "global-local") summary(genBankFullAlign, weight = wtPhiX174) ################################################### ### chunk number 44: ans7b ################################################### genBankFullCoverage <- coverage(genBankFullAlign, weight = wtPhiX174) plot(as.integer(genBankFullCoverage), xlab = "Position", ylab = "Coverage", type = "l") slice(genBankFullCoverage, lower = 1) ################################################### ### chunk number 45: ans7c ################################################### genBankFullAlignRevComp <- pairwiseAlignment(srPhiX174, reverseComplement(genBankPhage), patternQuality = SolexaQuality(quPhiX174), subjectQuality = SolexaQuality(99L), type = "global-local") table(score(genBankFullAlignRevComp) > score(genBankFullAlign)) ################################################### ### chunk number 46: ans8a ################################################### N <- as.integer(seq(5000, 50000, by = 5000)) newTimings <- rep(0, length(N)) names(newTimings) <- as.character(N) for (i in seq_len(length(N))) { string1 <- DNAString(paste(sample(DNA_ALPHABET[1:4], 35, replace = TRUE), collapse = "")) string2 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) newTimings[i] <- system.time(pairwiseAlignment(string1, string2, type = "global"))[["user.self"]] } newTimings coef(summary(lm(newTimings ~ poly(N, 2)))) plot(N, newTimings, xlab = "Larger String Size", ylab = "Timing (sec.)", type = "l", main = "Global Pairwise Sequence Alignment Timings") ################################################### ### chunk number 47: ans8b ################################################### newScoreOnlyTimings <- rep(0, length(N)) names(newScoreOnlyTimings) <- as.character(N) for (i in seq_len(length(N))) { string1 <- DNAString(paste(sample(DNA_ALPHABET[1:4], 35, replace = TRUE), collapse = "")) string2 <- DNAString(paste(sample(DNA_ALPHABET[1:4], N[i], replace = TRUE), collapse = "")) newScoreOnlyTimings[i] <- system.time(pairwiseAlignment(string1, string2, type = "global", scoreOnly = TRUE))[["user.self"]] } newScoreOnlyTimings round((newTimings - newScoreOnlyTimings) / newTimings, 2) ################################################### ### chunk number 48: sessinfo ################################################### toLatex(sessionInfo())