library(lme4)
## Loading required package: Matrix
library(car)
## Loading required package: carData

Instruction

Given a high-throughtput screening experiment results, we want to find the promising strains for next round of screening.

load the data

library(readxl)
data = read_excel("C:/Users/wwhla/Downloads/job/data scientist/interview/Amyris_coding_challenge/Sample_T1_data.xlsx",1)
dim(data)
## [1] 5760    7

Check the distribution of different characteristics

par(mfrow=c(2,3))
barplot(table(data$plate),main="plate")
barplot(table(data$strain),main="strain")
barplot(table(data$well_type),main="well_type")
barplot(table(data$robot),main="robot")
barplot(table(data$stacker_position),main="stacker_pos")


We can find that there are 3 robot : “Bender”, “C3P0” and “Terminator”. For each robot there are 20 plates. For each plate there are 96 wells. 88 of them are Standard wells, 4 are Parent wells and 4 are control wells. So totally we have 5280 Standard wells. For Parent well and control well, since their strain_id are the same, so apparently means same strain are checked repeatedly. Therefore I choose to use data from control strain to estimate the bias incorporated from plate and robot.

boxplot(data$value[which(data$robot=="Bender" &  data$strain=="Y0002")],data$value[which(data$robot=="Bender" &  data$strain=="Y0001")],data$value[which(data$robot=="C3P0" &  data$strain=="Y0002")],data$value[which(data$robot=="C3P0" &  data$strain=="Y0001")],data$value[which(data$robot=="Terminator" &  data$strain=="Y0002")],data$value[which(data$robot=="Terminator" & data$strain=="Y0001")],names =  c("Bender & PS", "Bender & PC", "C3P0 & PS", "C3P0 & PC", "Terminator & PS", "Terminator & PC"), main =  "boxplot of values from Control and Parent strain")


## Check the plate difference

PC_Bender = matrix(0,nrow=4,ncol=20)
PC_C3P0 = matrix(0,nrow=4,ncol=20)
PC_Terminator = matrix(0,nrow=4,ncol=20)

extract_value_id<-function(dataset,id,robot_id,nplate, nrid){ #extract value by id. dataset is the input data; id is the id of strain we want to extract such as "Y0001"; robot_id is the platform id such as "Bender"; nplate is the number of plates for the platform such as "20"; nrid is the number of repeats for the id
  res = matrix(0, nrow=nrid, ncol=nplate )
  for(i in 1:nplate){
    res[, i] = dataset$value[which(dataset$stacker_position==i & dataset$robot==robot_id & data$strain==id)]
  }
  return(res)
}
PC_Bender = extract_value_id(data,"Y0001","Bender",20,4)
PC_C3P0 = extract_value_id(data,"Y0001","C3P0",20,4)
PC_Terminator = extract_value_id(data,"Y0001","Terminator",20,4)
boxplot(cbind(PC_Bender,PC_C3P0,PC_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Control Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))


We can find that there are system bias across different plates. We need to correct for the bias for a fair comparison across different robots and plates.

PS_Bender = extract_value_id(data,"Y0002","Bender",20,4)
PS_C3P0 = extract_value_id(data,"Y0002","C3P0",20,4)
PS_Terminator = extract_value_id(data,"Y0002","Terminator",20,4)
boxplot(cbind(PS_Bender,PS_C3P0,PS_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Parent Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))


We can find that the bias across plates are similar in the Parent Strain. Which indicate that plate bias are maintained across different strain. We can fully capture the information based on data from control strain.

extract_value_pos<-function(dataset,robot_id,nplate,npos){#extract value by position. dataset is the input data; robot_id is the platform id, nplate is the number of plates, npos is the of standard well in a plate
  res = matrix(0,nrow=npos,ncol=nplate)
  for(i in 1:nplate){
    res[,i]=data$value[which(dataset$stacker_position==i & dataset$robot==robot_id)[1:npos]]
  }
  return(res)
}
SW_Bender = extract_value_pos(data,"Bender",20,88)
SW_C3P0 = extract_value_pos(data,"C3P0",20,88)
SW_Terminator = extract_value_pos(data,"Terminator",20,88)
boxplot(cbind(SW_Bender,SW_C3P0,SW_Terminator),col=c(rep("red",20),rep("blue",20),rep("green",20)),main="Standard Strain different platform")
legend("topright",legend = c("Bender","C3P0","Terminator"),fill = c("red","blue","green"))


The SW across plates shows the same pattern of system bias comparing to PC and PS. There are relative higher standard variation because SW are from different strains while PC and PS are the same strain. This figure again indicate that system bias maintained in SW, PC and PS. We can use PC to extract this pattern. Making use of this information we can correct the system bias in SW and PS to make them comparable across different robots and plates.

Make use of linear mix model to estimate the contribution from plate and robot

#combine the PC information together
PC = matrix(0,nrow = 20*3*4, ncol = 3)
for (i in 1:20){
  PC[((i-1)*4+1):(i*4),1]=PC_Bender[,i]
  PC[((i-1)*4+1):(i*4),2]=i
  PC[((i-1)*4+1):(i*4),3]="Bender"
}

for (i in 1:20){
  PC[(80+(i-1)*4+1):(80+i*4),1]=PC_C3P0[,i]
  PC[(80+(i-1)*4+1):(80+i*4),2]=i
  PC[(80+(i-1)*4+1):(80+i*4),3]="C3P0"
}
for (i in 1:20){
  PC[(160+(i-1)*4+1):(160+i*4),1]=PC_Terminator[,i]
  PC[(160+(i-1)*4+1):(160+i*4),2]=i
  PC[(160+(i-1)*4+1):(160+i*4),3]="Terminator"
}
PC<-data.frame(PC)
colnames(PC) = c("value","Plate","Robot")
PC$value=as.numeric(PC$value)
PC$Plate=as.factor(PC$Plate)
PC$Robot=as.factor(PC$Robot)


## check normal distribution A QQ plot to make sure the value from PC are not far frome normal distribution which is the basic assumption of linear mixed model

qqp(PC$value,"norm")

## [1]  83 153


modulate the contribution from plate and Robot based on PC information

m1 <- lmer(value ~ (1|Plate)+(1|Robot),PC,REML=FALSE)


Estimate PC with model m1 which will include basic value + plate contribution + Robot contribution

PC_Bender_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
  PC_Bender_est[i] = predict(m1, data.frame(value = PC_Bender[1,i], Plate = i ,Robot="Bender"))
}
PC_C3P0_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
  PC_C3P0_est[i] = predict(m1, data.frame(value = PC_C3P0[1,i], Plate = i ,Robot="C3P0"))
}
PC_Terminator_est = matrix(0, nrow = 1, ncol = 20)
for (i in 1:20){
  PC_Terminator_est[i] = predict(m1, data.frame(value = PC_Terminator[1,i], Plate = i ,Robot="Terminator"))
}

Normalize the value of PS and SW based on estimated PC

PS_Bender_norm = PS_Bender - matrix(1,nrow=4,ncol=1)%*%PC_Bender_est
PS_C3P0_norm = PS_C3P0 - matrix(1,nrow=4,ncol=1)%*%PC_C3P0_est
PS_Terminator_norm = PS_Terminator - matrix(1,nrow=4,ncol=1)%*%PC_Terminator_est

SW_Bender_norm = SW_Bender - matrix(1,nrow=88,ncol=1)%*%PC_Bender_est
SW_C3P0_norm = SW_C3P0 - matrix(1,nrow=88,ncol=1)%*%PC_C3P0_est
SW_Terminator_norm = SW_Terminator - matrix(1,nrow=88,ncol=1)%*%PC_Terminator_est


Test how SW improved from PS. Since PS is measured repeatedly we would expect its distribution is Normal.

qqp(cbind(PS_Bender_norm,PS_C3P0_norm,PS_Terminator_norm),"norm")

## [1] 228  17


We do find that the distribution of normalized PS is normal which is in accordance with our expectation. We can use normal distribution to find the level of significance of SW’s improvement. We can also use the empirical distribution to do this.

PS_norm_mean = mean(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm))
PS_norm_sd = sd(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm))
SW_Bender_pvalue = matrix(sapply(SW_Bender_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_Bender_norm))
SW_C3P0_pvalue = matrix(sapply(SW_C3P0_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_C3P0_norm))
SW_Terminator_pvalue = matrix(sapply(SW_Terminator_norm, function(z) 1-pnorm(z,PS_norm_mean,PS_norm_sd)), nrow = nrow(SW_Terminator_norm))
#adjust pvlaue 
SW_pvalue = cbind(SW_Bender_pvalue,SW_C3P0_pvalue,SW_Terminator_pvalue)
SW_padj = matrix(p.adjust(cbind(SW_Bender_pvalue,SW_C3P0_pvalue,SW_Terminator_pvalue),'fdr'),ncol =60)
par(mfrow=c(1,2))
hist(SW_padj,main="Histgram of adjusted pValue")
hist(SW_padj[which(SW_padj<0.9)],main="Histgram of adjusted pValue(<0.9)")


Adjusted pvalue <0.1 is a reasonable cutoff.

indsel = which(SW_padj<0.1)
hist(cbind(PS_Bender_norm,PS_C3P0_norm,PS_Terminator_norm),breaks=25,xlim=c(600,3500),freq = FALSE,main="Histgram of corrected value from PS", xlab = "Corrected Value")
points(seq(from = 500, to= 3500, by =100),dnorm(seq(from = 500, to= 3500, by =100),mean = PS_norm_mean,sd=PS_norm_sd), type = 'l', col ="red")
points(cbind(SW_Bender_norm,SW_C3P0_norm,SW_Terminator_norm)[indsel],rep(0.0002, 20), type = "p", pch = 21,col="blue")
legend("topright", legend = c("Esimated normal distribuiton","Significantly improved SW value"), col = c("red", "blue"), lty = c(1, NA), pch = c(NA, 21))


We can find that the location of significantly improved SW value located far right in the normal distribution of PS value. The strain names are as following:

paste("X",indsel,sep="")
##  [1] "X66"   "X358"  "X894"  "X912"  "X1030" "X1770" "X2325" "X2413" "X3292"
## [10] "X3659" "X3788" "X3795" "X3796" "X3827" "X3866" "X4076" "X4239" "X4382"
## [19] "X4789" "X5100"


For the next round of screen, I would suggest screen these 20 strains with repeats to learn more about the biological variations of each strain which will help us to make statistically robust selection.