library(lme4)
## Loading required package: Matrix
library(car)
## Loading required package: carData
Given a high-throughtput screening experiment results, we want to find the promising strains for next round of screening.
library(readxl)
data = read_excel("C:/Users/wwhla/Downloads/job/data scientist/interview/Amyris_coding_challenge/Sample_T1_data.xlsx",1)
dim(data)
## [1] 5760 7
par(mfrow=c(2,3))
barplot(table(data$plate),main="plate")
barplot(table(data$strain),main="strain")
barplot(table(data$well_type),main="well_type")
barplot(table(data$robot),main="robot")
barplot(table(data$stacker_position),main="stacker_pos")
We can find that there are 3 robot : “Bender”, “C3P0” and “Terminator”.
For each robot there are 20 plates. For each plate there are 96 wells.
88 of them are Standard wells, 4 are Parent wells and 4 are control
wells. So totally we have 5280 Standard wells. For Parent well and
control well, since their strain_id are the same, so apparently means
same strain are checked repeatedly. Therefore I choose to use data from
control strain to estimate the bias incorporated from plate and
robot.
boxplot(data$value[which(data$robot=="Bender" & data$strain=="Y0002")],data$value[which(data$robot=="Bender" & data$strain=="Y0001")],data$value[which(data$robot=="C3P0" & data$strain=="Y0002")],data$value[which(data$robot=="C3P0" & data$strain=="Y0001")],data$value[which(data$robot=="Terminator" & data$strain=="Y0002")],data$value[which(data$robot=="Terminator" & data$strain=="Y0001")],names = c("Bender & PS", "Bender & PC", "C3P0 & PS", "C3P0 & PC", "Terminator & PS", "Terminator & PC"), main = "boxplot of values from Control and Parent strain")
## Check the plate difference
PC_Bender = matrix(0,nrow=4,ncol=20)
PC_C3P0 = matrix(0,nrow=4,ncol=20)
PC_Terminator = matrix(0,nrow=4,ncol=20)
extract_value_id<-function(dataset,id,robot_id,nplate, nrid){ #extract value by id. dataset is the input data; id is the id of strain we want to extract such as "Y0001"; robot_id is the platform id such as "Bender"; nplate is the number of plates for the platform such as "20"; nrid is the number of repeats for the id
res = matrix(0, nrow=nrid, ncol=nplate )
for(i in 1:nplate){
res[, i] = dataset$value[which(dataset$stacker_position==i & dataset$robot==robot_id & data$strain==id)]
}
return(res)
}
PC_Bender = extract_value_id(data,"Y0001","Bender",20,4)
PC_C3P0 = extract_value_id(data,"Y0001","C3P0",20,4)
PC_Terminator = extract_value_id(data,"Y0001","Terminator",20,4)
boxplot(cbind(PC_Bender,PC_C3P0,PC_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Control Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))
We can find that there are system bias across different plates. We need
to correct for the bias for a fair comparison across different robots
and plates.
PS_Bender = extract_value_id(data,"Y0002","Bender",20,4)
PS_C3P0 = extract_value_id(data,"Y0002","C3P0",20,4)
PS_Terminator = extract_value_id(data,"Y0002","Terminator",20,4)
boxplot(cbind(PS_Bender,PS_C3P0,PS_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Parent Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))
We can find that the bias across plates are similar in the Parent
Strain. Which indicate that plate bias are maintained across different
strain. We can fully capture the information based on data from control
strain.
extract_value_pos<-function(dataset,robot_id,nplate,npos){#extract value by position. dataset is the input data; robot_id is the platform id, nplate is the number of plates, npos is the of standard well in a plate
res = matrix(0,nrow=npos,ncol=nplate)
for(i in 1:nplate){
res[,i]=data$value[which(dataset$stacker_position==i & dataset$robot==robot_id)[1:npos]]
}
return(res)
}
SW_Bender = extract_value_pos(data,"Bender",20,88)
SW_C3P0 = extract_value_pos(data,"C3P0",20,88)
SW_Terminator = extract_value_pos(data,"Terminator",20,88)
boxplot(cbind(SW_Bender,SW_C3P0,SW_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Standard Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))
The SW across plates shows the same pattern of system bias comparing to
PC and PS. There are relative higher standard variation because SW are
from different strains while PC and PS are the same strain. This figure
again indicate that system bias maintained in SW, PC and PS. We can use
PC to extract this pattern. Making use of this information we can
correct the system bias in SW and PS to make them comparable across
different robots and plates.
Make use of linear mix model to estimate the contribution from plate and robot
#combine the PC information together
PC = matrix(0,nrow = 20*3*4, ncol = 3)
for (i in 1:20){
PC[((i-1)*4+1):(i*4),1]=PC_Bender[,i]
PC[((i-1)*4+1):(i*4),2]=i
PC[((i-1)*4+1):(i*4),3]="Bender"
}
for (i in 1:20){
PC[(80+(i-1)*4+1):(80+i*4),1]=PC_C3P0[,i]
PC[(80+(i-1)*4+1):(80+i*4),2]=i
PC[(80+(i-1)*4+1):(80+i*4),3]="C3P0"
}
for (i in 1:20){
PC[(160+(i-1)*4+1):(160+i*4),1]=PC_Terminator[,i]
PC[(160+(i-1)*4+1):(160+i*4),2]=i
PC[(160+(i-1)*4+1):(160+i*4),3]="Terminator"
}
PC<-data.frame(PC)
colnames(PC) = c("value","Plate","Robot")
PC$value=as.numeric(PC$value)
PC$Plate=as.factor(PC$Plate)
PC$Robot=as.factor(PC$Robot)
## check normal distribution A QQ plot to make sure the value from PC
are not far frome normal distribution which is the basic assumption of
linear mixed model
qqp(PC$value,"norm")
## [1] 83 153
modulate the contribution from plate and Robot based on PC
information
m1 <- lmer(value ~ (1|Plate)+(1|Robot),PC,REML=FALSE)
Estimate PC with model m1 which will include basic value + plate
contribution + Robot contribution
PC_Bender_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
PC_Bender_est[i] = predict(m1, data.frame(value = PC_Bender[1,i], Plate = i ,Robot="Bender"))
}
PC_C3P0_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
PC_C3P0_est[i] = predict(m1, data.frame(value = PC_C3P0[1,i], Plate = i ,Robot="C3P0"))
}
PC_Terminator_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
PC_Terminator_est[i] = predict(m1, data.frame(value = PC_Terminator[1,i], Plate = i ,Robot="Terminator"))
}
Normalize the value of PS and SW based on estimated PC
PS_Bender_norm = PS_Bender - matrix(1,nrow=4,ncol=1)%*%PC_Bender_est
PS_C3P0_norm = PS_C3P0 - matrix(1,nrow=4,ncol=1)%*%PC_C3P0_est
PS_Terminator_norm = PS_Terminator - matrix(1,nrow=4,ncol=1)%*%PC_Terminator_est
SW_Bender_norm = SW_Bender - matrix(1,nrow=88,ncol=1)%*%PC_Bender_est
SW_C3P0_norm = SW_C3P0 - matrix(1,nrow=88,ncol=1)%*%PC_C3P0_est
SW_Terminator_norm = SW_Terminator - matrix(1,nrow=88,ncol=1)%*%PC_Terminator_est
Test how SW improved from PS. Since PS is measured repeatedly we would
expect its distribution is Normal.
qqp(cbind(PS_Bender_norm,PS_C3P0_norm,PS_Terminator_norm),"norm")
## [1] 228 17
We do find that the distribution of normalized PS is normal which is in
accordance with our expectation. We can use normal distribution to find
the level of significance of SW’s improvement. We can also use the
empirical distribution to do this.
PS_norm_mean = mean(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm))
PS_norm_sd = sd(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm))
SW_Bender_pvalue = matrix(sapply(SW_Bender_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_Bender_norm))
SW_C3P0_pvalue = matrix(sapply(SW_C3P0_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_C3P0_norm))
SW_Terminator_pvalue = matrix(sapply(SW_Terminator_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_Terminator_norm))
#adjust pvlaue
SW_pvalue = cbind(SW_Bender_pvalue,SW_C3P0_pvalue,SW_Terminator_pvalue)
SW_padj = matrix(p.adjust(cbind(SW_Bender_pvalue,SW_C3P0_pvalue,SW_Terminator_pvalue),'fdr'),ncol =60)
par(mfrow=c(1,2))
hist(SW_padj,main="Histgram of adjusted pValue")
hist(SW_padj[which(SW_padj<0.9)],main="Histgram of adjusted pValue(<0.9)")
Adjusted pvalue <0.1 is a reasonable cutoff.
indsel = which(SW_padj<0.1)
hist(cbind(PS_Bender_norm,PS_C3P0_norm,PS_Terminator_norm),breaks=25,xlim=c(600,3500),freq = FALSE,main="Histgram of corrected value from PS", xlab = "Corrected Value")
points(seq(from = 500, to= 3500, by =100),dnorm(seq(from = 500, to= 3500, by =100),mean = PS_norm_mean,sd=PS_norm_sd), type = 'l', col ="red")
points(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm)[indsel],rep(0.0002, 20), type = "p", pch = 21,col="blue")
legend("topright", legend = c("Esimated normal distribuiton","Significantly improved SW value"), col = c("red", "blue"), lty = c(1, NA), pch = c(NA, 21))
We can find that the location of significantly improved SW value located
far right in the normal distribution of PS value. The strain names are
as following:
paste("X",indsel,sep="")
## [1] "X66" "X358" "X894" "X912" "X1030" "X1770" "X2325" "X2413" "X3292"
## [10] "X3659" "X3788" "X3795" "X3796" "X3827" "X3866" "X4076" "X4239" "X4382"
## [19] "X4789" "X5100"
For the next round of screen, I would suggest screen these 20 strains
with repeats to learn more about the biological variations of each
strain which will help us to make statistically robust selection.