###################################################
### chunk number 1: 
###################################################
library(nudge)


###################################################
### chunk number 2: 
###################################################
data(like)
#lR is the matrix or vector of log ratios to the base 2
#We get this by subtracting the log of the green intensities for each gene from the log of the red intensities for each gene (the colour assignments can be the other way also)
lR<-log(like[,1],2)-log(like[,2],2)
#lI is the matrix or vector of log intensities to the base 2
#We get this by adding the log of the green intensities for each gene from the log of the red intensities for each gene
lI<-log(like[,1],2)+log(like[,2],2)


###################################################
### chunk number 3: 
###################################################
result<-nudge1(logratio=lR,logintensity=lI)


###################################################
### chunk number 4: 
###################################################
names(result)
#pdiff is the vector of probabilities of differential expression
#lRnorm is the vector of normalized log ratios
#mu and sigma are the EM estimates of the parameters for the group of non-differentially expressed normalized log ratios
#mixprob is the mixing parameter estimate (or the prior probability for a gene not being differentially expressed)
#a is the min and b is the max of the normalized log ratios (this gives the range for the uniform mixture component)
#loglike is the converged log likelihood value for the fitted model


###################################################
### chunk number 5: 
###################################################
s<-sort(result$pdiff,decreasing=T,index.return=T)
#Look at the row number and the probabilities of the genes with the 20 highest probabilities of differential expression
names(lR)<-c(1:length(lR))
cbind(names(lR)[s$ix[1:20]],round(s$x[1:20],2))


###################################################
### chunk number 6: 
###################################################
# Set the threshold value
thresh<-0.5
sum(result$pdiff>=thresh)
#Can also look at the row numbers or names of these genes
names(lR)[result$pdiff>=thresh]


###################################################
### chunk number 7: 
###################################################
#lRnorm is the mean and variance normalized log ratios for the genes
lRnorm<-result$lRnorm


###################################################
### chunk number 8:  eval=FALSE
###################################################
## #plot the two different log ratios versus log intensities side by side
## par(mfrow=c(1,2))
## #put them both on the same scale
## yl<-range(lR,lRnorm)
## plot(lI,lR,pch=".",main="Log ratios versus log intensities",xlab="log intensities",ylab="log ratios",ylim=yl)
## #We can also add a loess fitted mean line to the plot/
## l<-loess(lR~lI,span=0.6)
## slI<-sort(lI,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## plot(lI,lRnorm,pch=".",
## main="Normalized log ratios versus log intensities", xlab="average log intensities",ylab="normalized log ratios",ylim=yl)
## #We can also add a loess fitted mean line to the plot
## l<-loess(lRnorm~lI,span=0.6)
## slI<-sort(lI,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## 


###################################################
### chunk number 9: 
###################################################
data(hiv)
#lR is the matrix of log ratios to the base 2
#We get this by subtracting the logs of the intensities for sample 1 for each gene from the logs of the corresponding intensities for sample 2 for each gene (by corresponding intensities we mean those paired to the same slide)
lR<-log(hiv[,1:4],2)-log(hiv[,5:8],2)
#lI is the matrix of log intensities to the base 2
#We get this by adding the logs of the intensities for sample 1 for each gene from the logs of the corresponding intensities for sample 2 for each gene (by corresponding intensities we mean those paired to the same slide)
lI<-log(hiv[,1:4],2)+log(hiv[,5:8],2)


###################################################
### chunk number 10: 
###################################################
result<-nudge1(logratio=lR,logintensity=lI,dye.swap=T)


###################################################
### chunk number 11: 
###################################################
names(result)
#pdiff is the vector of probabilities of differential expression
#lRnorm is the vector of normalized log ratios
#mu and sigma are the EM estimates of the parameters for the group of non-differentially expressed log ratios
#mixprob is the mixing parameter estimate (or the prior probability for a gene not being differentially expressed)
#a is the min and b is the max of the normalized log ratios (this gives the range for the uniform mixture component)
#loglike is the converged log likelihood value for the fitted model


###################################################
### chunk number 12: 
###################################################
s<-sort(result$pdiff,decreasing=T,index.return=T)
#Look at the row number and the probabilities of the genes with the 20 highest probabilities of differential expression
rownames(lR)<-c(1:nrow(lR))
cbind(rownames(lR)[s$ix[1:20]],round(s$x[1:20],2))


###################################################
### chunk number 13: 
###################################################
# Set the threshold value
thresh<-0.5
sum(result$pdiff>=thresh)
#Can also look at the row numbers or names of these genes
rownames(lR)[result$pdiff>=thresh]


###################################################
### chunk number 14: 
###################################################
#lRbar is the average (across replicates) log ratios for the genes
lRbar<-apply(lR,1,mean)
#lIbar is the average (across replicates) log intensities for the genes 
lIbar<-apply(lI,1,mean)
#lRnorm is the mean and variance normalized average log ratios for the genes
lRnorm<-result$lRnorm


###################################################
### chunk number 15:  eval=FALSE
###################################################
## #plot the two different average log ratios versus average log intensities side by side
## par(mfrow=c(1,2))
## #put them both on the same scale
## yl<-range(lRbar,lRnorm)
## 


###################################################
### chunk number 16:  eval=FALSE
###################################################
## plot(lIbar,lRbar,pch=".",main="Average log ratios
##  versus 
##  average log  intensities",xlab="average log intensities",ylab="average log ratios", ylim=yl)
## 


###################################################
### chunk number 17:  eval=FALSE
###################################################
## #We can also add a loess fitted mean line to the plot
## l<-loess(lRbar~lIbar,span=0.6)
## slI<-sort(lIbar,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## 


###################################################
### chunk number 18:  eval=FALSE
###################################################
## plot(lIbar,lRnorm,pch=".",main="Normalized average log ratios
##  versus 
##  average log intensities",xlab="average log intensities",ylab="normalized average log ratios", ylim=yl)
## 


###################################################
### chunk number 19:  eval=FALSE
###################################################
## #We can also add a loess fitted mean line to the plot
## l<-loess(lRnorm~lIbar,span=0.6)
## slI<-sort(lIbar,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## 


###################################################
### chunk number 20: 
###################################################
#diff is a vector of indicator variables with 1 indicating a gene is differentially expressed, 0 not
#ndiff is a vector of indicator variables with 1 indicating a gene is not differentially expressed, 0 is
diff<-round(result$pdiff)
ndiff<-1-diff
sum((diff==pos.contr)&pos.contr)
#we can also calculate the percentage of positive controls found
sum((diff==pos.contr)&pos.contr)/sum(pos.contr)*100
sum((ndiff==neg.contr)&neg.contr)
#we can also calculate the percentage of negative controls found
sum((ndiff==neg.contr)&neg.contr)/sum(neg.contr)*100


###################################################
### chunk number 21:  eval=FALSE
###################################################
## x<-seq((result$a-1),(result$b+1),0.0001)
## #d is the fitted density at each point in x
## d<-(1-result$mixprob)*dunif(x,result$a,result$b)+ result$mixprob*dnorm(x,result$mu,result$sigma)
## par(mfrow=c(1,2))


###################################################
### chunk number 22:  eval=FALSE
###################################################
## hist(result$lRnorm,freq=F,main="Histogram of 
## average normalized log ratios", xlab="average normalized log ratios",breaks=25)
## 


###################################################
### chunk number 23:  eval=FALSE
###################################################
## lines(cbind(x,d),lty=2)
## #We can also take a closer look at the right-hand tail (where the positive controls are)
## 


###################################################
### chunk number 24:  eval=FALSE
###################################################
## hist(result$lRnorm,freq=F,main="Right-side of the histogram of 
## average normalized log ratios",xlab="average normalized log ratios",xlim=c(1.5, result$b),ylim=c(0,0.1),breaks=25)
## 


###################################################
### chunk number 25:  eval=FALSE
###################################################
## lines(cbind(x,d),lty=2)
## 


###################################################
### chunk number 26: 
###################################################
apo<-read.csv(file="http://www.stat.berkeley.edu/users/terry/zarray/Data/ApoA1/rg_a1ko_morph.txt", header=T)


###################################################
### chunk number 27: 
###################################################
rownames(apo)<-apo[,1]
apo<-apo[,-1]
colnames(apo)


###################################################
### chunk number 28: 
###################################################
#Because of zero entries we add 1 to all entries to allow us to take  logs
apo<-apo+1
#lRctl is the matrix of control log ratios to the base 2
lRctl<-log(apo[,c(seq(2,16,2))],2)-log(apo[,c(seq(1,15,2))],2)
#lRtxt is the matrix of treatment log ratios to the base 2
lRtxt<-log(apo[,c(seq(18,32,2))],2)-log(apo[,c(seq(17,31,2))],2)
#lIctl is the matrix of control log intensities to the base 2
lIctl<-log(apo[,c(seq(2,16,2))],2)+log(apo[,c(seq(1,15,2))],2)
#lItxt is the matrix of treatment log intensities to the base 2
lItxt<-log(apo[,c(seq(18,32,2))],2)+log(apo[,c(seq(17,31,2))],2)


###################################################
### chunk number 29: 
###################################################
result<-nudge2(control.logratio=lRctl,txt.logratio=lRtxt, control.logintensity=lIctl,txt.logintensity=lItxt)


###################################################
### chunk number 30: 
###################################################
names(result)
#pdiff is the vector of probabilities of differential expression
#lRnorm is the vector of normalized log ratio differences
#mu and sigma are the EM estimates of the parameters for the group of non-differentially expressed log ratios
#mixprob is the mixing parameter estimate (or the prior probability for a gene not being differentially expressed)
#a is the min and b is the max of the normalized log ratios (this gives the range for the uniform mixture component)
#loglike is the converged log likelihood value for the fitted model


###################################################
### chunk number 31: 
###################################################
s<-sort(result$pdiff,decreasing=T,index.return=T)
#Look at the row number and the probabilities of the genes with the 20 highest probabilities of differential expression
rownames(lRtxt)<-rownames(lRctl)<-c(1:nrow(lRtxt))
cbind(rownames(lRtxt)[s$ix[1:20]],round(s$x[1:20],2))


###################################################
### chunk number 32: 
###################################################
# Set the threshold value
thresh<-0.5
sum(result$pdiff>=thresh)
#Can also look at the row numbers or names of these genes
rownames(lRtxt)[result$pdiff>=thresh]


###################################################
### chunk number 33: 
###################################################
#lRbar is the average log ratio differences for the genes
lRbar<-apply(lRtxt,1,mean)-apply(lRctl,1,mean)
#lIbar is the average log intensities for the genes
lIbar<-apply(cbind(lItxt,lIctl),1,mean)
#lRnorm is the mean and variance normalized average log ratio differences for the genes
lRnorm<-result$lRnorm


###################################################
### chunk number 34:  eval=FALSE
###################################################
## #plot the two different average log ratio differences versus average log intensities side by side
## par(mfrow=c(1,2))
## # put them both on the same scale
## yl<-range(lRbar,lRnorm)
## 


###################################################
### chunk number 35:  eval=FALSE
###################################################
## plot(lIbar,lRbar,pch=".",main="Average log ratio differences
##  versus 
##  average log intensities",xlab="average log intensities",ylab="average log ratio differences", ylim=yl)
## 


###################################################
### chunk number 36:  eval=FALSE
###################################################
## #We can also add a loess fitted mean line to the plot
## l<-loess(lRbar~lIbar,span=0.6)
## slI<-sort(lIbar,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## 


###################################################
### chunk number 37:  eval=FALSE
###################################################
## plot(lIbar,lRnorm,pch=".",main="Normalized average log ratio differences
##  versus 
##  average log intensities",xlab="average log intensities",ylab="normalized average log ratio differences",ylim=yl)
## 


###################################################
### chunk number 38:  eval=FALSE
###################################################
## #We can also add a loess fitted mean line to the plot
## l<-loess(lRnorm~lIbar,span=0.6)
## slI<-sort(lIbar,index.return=T)
## lines(cbind(slI$x,l$fitted[slI$ix]))
## 


###################################################
### chunk number 39: 
###################################################
data(like)
#lR is the matrix or vector of log ratios to the base 2
#We get this by subtracting the log of the green intensities for each gene from the log of the red intensities for each gene (the colour assignments can be the other way also)
lR<-log(like[,1],2)-log(like[,2],2)
#lI is the matrix or vector of log intensities to the base 2
#We get this by adding the log of the green intensities for each gene from the log of the red intensities for each gene
lI<-log(like[,1],2)+log(like[,2],2)
#p is our believed value of the prior probability
p<-10/nrow(like)
#Generate a random set of labels for being differentially expressed using p
temp<-rbinom(nrow(like),1,p)
#Create a matrix of labels and use it in the nudge1 function
z<-matrix(c(1-temp,temp),nrow(like),2,byrow=F)
result<-nudge1(logratio=lR,logintensity=lI,z=z)


###################################################
### chunk number 40: 
###################################################
data(hiv)
#lR is the matrix of log ratios to the base 2
#We get this by subtracting the logs of the intensities for sample 1 for each gene from the logs of the corresponding intensities for sample 2 for each gene (by corresponding intensities we mean those paired to the same slide)
lR<-log(hiv[,1:4],2)-log(hiv[,5:8],2)
#lI is the matrix of log intensities to the base 2
#We get this by adding the logs of the intensities for sample 1 for each gene from the logs of the corresponding intensities for sample 2 for each gene (by corresponding intensities we mean those paired to the same slide)
lI<-log(hiv[,1:4],2)+log(hiv[,5:8],2)
#First we generate a random matrix with low probability of differential expression
p<-0.0001
temp<-rbinom(nrow(hiv),1,p)
z<-matrix(c(1-temp,temp),nrow(hiv),2,byrow=F)
# Now we incorporate the information
z[c(3,5,7,771,773,775,1539,1541,1543,2307,2309,2311,3859),1]<-0
z[c(3,5,7,771,773,775,1539,1541,1543,2307,2309,2311,3859),2]<-1
z[c(2348,3186,1943),1]<-0.05
z[c(2348,3186,1943),2]<-0.95
z[c(3075,3077,3079,3843,3845),1]<-1
z[c(3075,3077,3079,3843,3845),2]<-0
result<-nudge1(logratio=lR,logintensity=lI,z=z)


###################################################
### chunk number 41: 
###################################################
rm(list=ls())
data(like)
ls()
dim(like)


###################################################
### chunk number 42: 
###################################################
rm(list=ls())
data(hiv)
ls()
dim(hiv)
length(pos.contr)
length(neg.contr)


###################################################
### chunk number 43: 
###################################################
rm(list=ls())


###################################################
### chunk number 44: 
###################################################
apo<-read.csv("http://www.stat.berkeley.edu/users/terry/zarray/Data/ApoA1/rg_a1ko_morph.txt",header=T)


###################################################
### chunk number 45: 
###################################################
dim(apo)


###################################################
### chunk number 46: 
###################################################
apo[,-1]<-apo[,-1]+1