NDDT_LibFM

# Fetch the NDDT response sheet from S3 and build the per-question score table.
# NOTE(review): the working directory is hard-coded to one machine; adjust it
# before running anywhere else.
setwd("/Users/adarsa/ilimi/github/Data-Science/explore/adhoc/NDDT")
s3_loc <- "s3://lpdev-ekstep/Data-Sciences/explore/adhoc/NDDT/"
nddt_file_name <- "NDDT_Responses.csv"
nddt_file_s3 <- paste0(s3_loc, nddt_file_name)

# system() returns the command's exit status (0 on success). Surface a failed
# download instead of silently reading a possibly stale local copy.
cmd <- system(paste("aws s3 cp ", nddt_file_s3, nddt_file_name, sep = " "))
if (cmd != 0) {
  warning(
    "aws s3 cp exited with status ", cmd,
    "; falling back to any existing local copy of ", nddt_file_name,
    call. = FALSE
  )
}

df <- read.csv(nddt_file_name)

# Drop the first eight columns; the remaining columns are used as the
# per-question scores (their names become the question ids in `qid`).
score.df <- df[, -c(1:8)]
score.df <- round(score.df)
qid <- colnames(score.df)
sid <- unique(df$Child.ID)
score.df$student <- df$Child.ID

# Reshape to long form: one row per (student, qid) pair with its score.
# NA scores are kept (na.rm = FALSE).
test.df <- melt(
  score.df,
  id.vars = "student",
  value.name = "score",
  variable.name = "qid",
  na.rm = FALSE
)

# Recode scores below 0.5 to -1. Assign into the column vector directly:
# the row-subset form `test.df[idx, ]$score <- -1` errors when `idx` is empty
# (a length-1 replacement for a zero-row data frame). which() skips NA scores.
test.df$score[which(test.df$score < 0.5)] <- -1

# Rows with a non-missing score are the attempted <qid, student> pairs; they
# form the training set.
train_index <- which(!is.na(test.df$score))
train.df <- test.df[train_index, ]

# Missing scores in the full (test) table are replaced by 0. Handled by libFM.
test.df$score <- replace(test.df$score, is.na(test.df$score), 0)

# One-hot encode student and qid; the training features are the same rows
# that were selected for train.df.
test_features_df <- dummy.data.frame(
  as.data.frame(test.df),
  names = c("student", "qid"),
  sep = "_"
)
train_features_df <- test_features_df[train_index, ]

# Write both long-format tables as CSV (with header, no row names).
write.csv(x = train.df, file = "libfm_train", row.names = FALSE)
write.csv(x = test.df, file = "libfm_test", row.names = FALSE)

# Convert the two CSV files with the triple_format_to_libfm.pl script.
system(command = "./triple_format_to_libfm.pl --in libfm_train,libfm_test --header 1 --target_column 2 --separator \",\" ")

# Run libFM with ALS on the converted files (classification task, 1000
# iterations, fixed seed); predictions are written to 'libfm.output_als' and
# the model to 'fm.model_als'.
system(command = " ./libFM -train libfm_train.libfm -test libfm_test.libfm -dim '1,1,10' -init_stdev 0.01 -iter 1000 -method 'als' -task c -regular '1,10,10' -rlog 'rlogout' -out 'libfm.output_als' -learn_rate 0.1  -seed 1234 -save_model 'fm.model_als' ")

# Read the libFM output file (no header row) and keep the rows selected by
# train_index, i.e. the pairs with an observed score.
# NOTE(review): presumably libFM writes one prediction per test row, in the
# same order as test.df - confirm.
libfm_output <- read.csv(file = "libfm.output_als", header = FALSE)
libfm_out_pred <- libfm_output[train_index, ]

# ROCR prediction object: libFM outputs against the observed scores.
pred <- prediction(predictions = libfm_out_pred, labels = train.df$score)
 # Other ROCR measure names that can be passed to performance(): "ppv", "sens", "rpp", "acc", "rec", "tpr"

# Recall as the measure (y axis of the performance object) against precision
# as the x measure, evaluated over the prediction cutoffs.
perf <- ROCR::performance(pred, measure = "rec", x.measure = "prec")

# Plot data for the precision/recall curve: the first point of every series is
# dropped, and the list elements are named after the axis labels stored in the
# performance object.
EvalParams1 <- setNames(
  list(
    as.data.frame(perf@x.values)[-1, ],
    as.data.frame(perf@y.values)[-1, ],
    as.data.frame(perf@alpha.values)[-1, ]
  ),
  c(perf@x.name, perf@y.name, perf@alpha.name)
)
plot_ly(data = EvalParams1, x = Recall, y = Precision, text = paste("threshold:", Cutoff))

# ROC curve: true positive rate (y) against false positive rate (x).
perf <- ROCR::performance(pred, "tpr", "fpr")
# Same cutoffs, with accuracy as the x measure. In this object the accuracy is
# in @x.values; @y.values holds the true positive rate again.
perf1 <- ROCR::performance(pred, "tpr", "acc")

# Plot data: the first point of every series is dropped. Accuracy is taken
# from perf1@x.values - reading perf1@y.values here would silently duplicate
# the TPR column under the name "Accuracy".
EvalParams2 <- list(
  (as.data.frame(perf@x.values))[-1, ],
  (as.data.frame(perf@y.values))[-1, ],
  (as.data.frame(perf@alpha.values))[-1, ],
  (as.data.frame(perf1@x.values))[-1, ]
)
names(EvalParams2) <- c("FalsePositiveRate", "TruePositiveRate", "Cutoff", "Accuracy")
# NOTE(review): bare column names in plot_ly() are the pre-4.0 plotly syntax;
# newer plotly versions expect formulas such as x = ~FalsePositiveRate - confirm
# against the installed version.
plot_ly(data = EvalParams2, x = FalsePositiveRate, y = TruePositiveRate, text = paste("threshold:", Cutoff, "Accuracy:", Accuracy))

##       l1q1       l1q2       l1q3       l1q4       l1q5       l1q6 
## 0.48339844 0.48144531 0.29199219 0.48828125 0.39941406 0.27636719 
##       l1q7       l1q8       l1q9      l1q10      l1q11      l1q12 
## 0.51660156 0.53710938 0.42089844 0.32324219 0.50000000 0.51855469 
##       l2q1       l2q2       l2q3       l2q4       l2q5       l2q6 
## 0.59765625 0.22753906 0.37402344 0.24121094 0.61328125 0.50000000 
##       l2q7       l2q8       l2q9      l2q10      l2q11      l2q12 
## 0.50585938 0.40039062 0.32031250 0.57128906 0.64453125 0.38281250 
##       l3q1       l3q2       l3q3       l3q4       l3q5       l3q6 
## 0.10839844 0.42871094 0.28417969 0.41992188 0.17089844 0.16796875 
##       l3q7       l3q8       l3q9      l3q10      l3q11      l3q12 
## 0.21386719 0.14062500 0.21777344 0.34179688 0.11230469 0.48925781 
##       l4q1       l4q2       l4q3       l4q4       l4q5       l4q6 
## 0.18457031 0.05078125 0.09863281 0.25390625 0.09082031 0.10156250 
##       l4q7       l4q8       l4q9      l4q10      l4q11      l4q12 
## 0.21875000 0.26660156 0.27929688 0.07617188 0.27636719 0.32324219 
##       l5q1       l5q2       l5q3       l5q4       l5q5       l5q6 
## 0.17675781 0.04101562 0.01855469 0.08398438 0.03027344 0.10937500 
##       l5q7       l5q8       l5q9      l5q10      l5q11      l5q12 
## 0.08593750 0.04492188 0.02929688 0.08105469 0.03710938 0.03125000

threshold <- 0.64635 # corresponding to fpr of 5%

# Hard class per libFM output row: 1 above the threshold, -1 otherwise.
libfm_out_hardclass <- matrix(0, nrow = dim(libfm_output)[1], ncol = 1)
libfm_out_hardclass[which(libfm_output > threshold)] <- 1
libfm_out_hardclass[which(libfm_output <= threshold)] <- -1
train_pred <- libfm_out_hardclass[train_index]

# Per-question confusion-matrix rates, one row per entry of
# question_perf$question (defined outside this chunk).
tpr <- matrix(0, nrow = length(question_perf$question), ncol = 1)
fpr <- matrix(0, nrow = length(question_perf$question), ncol = 1)
tnr <- matrix(0, nrow = length(question_perf$question), ncol = 1)
fnr <- matrix(0, nrow = length(question_perf$question), ncol = 1)
i <- 1
for (question_id in question_perf$question) {
  # Predictions and observed scores of the students who answered this question.
  stud_answered_pred <- train_pred[which(train.df$qid == question_id)]
  stud_answered_actual <- train.df$score[which(train.df$qid == question_id)]

  # Confusion-matrix counts. A false positive is predicted 1 but actually -1;
  # a false negative is predicted -1 but actually 1. (These two were swapped
  # before, so fpr and fnr used the wrong numerator for their denominator.)
  TruePositive <- length(which(stud_answered_pred[which(stud_answered_actual == 1)] == 1))
  FalsePositive <- length(which(stud_answered_pred[which(stud_answered_actual == -1)] == 1))
  TrueNegetive <- length(which(stud_answered_pred[which(stud_answered_actual == -1)] == -1))
  FalseNegetive <- length(which(stud_answered_pred[which(stud_answered_actual == 1)] == -1))

  # Rates are NaN for a question with no actual positives (or negatives).
  tpr[i] <- TruePositive / length(which(stud_answered_actual == 1))
  fpr[i] <- FalsePositive / length(which(stud_answered_actual == -1))
  tnr[i] <- TrueNegetive / length(which(stud_answered_actual == -1))
  fnr[i] <- FalseNegetive / length(which(stud_answered_actual == 1))
  i <- i + 1
}
# True positive rate and true negative rate per question index.
plot(tpr, type = "l", col = 1, lty = 2)
lines(tnr, type = "l", col = 3, lty = 4)
legend(50, 0.8, c("tpr", "tnr"), col = c(1, 3), text.col = "green4", lty = c(2, 4), bg = "gray90")

# False positive rate and false negative rate per question index.
plot(fpr, type = "l", col = 1, lty = 2)
lines(fnr, type = "l", col = 10, lty = 5)
# Legend line types must match the drawn lines: fnr is drawn with lty = 5.
legend(5, .5, c("fpr", "fnr"), col = c(1, 10), text.col = "green4", lty = c(2, 5), bg = "gray90")

#print(TruePositive/length(which(stud_answered_actual==1)))
#print(FalsePositive/length(which(stud_answered_actual==-1)))
#print(FalseNegetive/length(which(stud_answered_actual==1)))
#print(TrueNegetive/length(which(stud_answered_actual==-1)))

# Pairwise distance between question coefficients, as a full square matrix.
# Despite the variable name, the metric requested here is Manhattan, not
# cosine; `p` is the Minkowski power and is not used by method = "manhattan".
cos_dist <- as.matrix(
  dist(
    ques_unary_inter_Wj.als$question_coef,
    method = "manhattan",
    diag = TRUE,
    upper = FALSE,
    p = 2
  )
)
#cos_dist<-cos_dist/norm(as.matrix(ques_unary_inter_Wj.als$question_coef))

# Heatmap of the distance matrix, both axes labelled by question number.
plot_ly(
  x = ques_unary_inter_Wj.als$quest_num,
  y = ques_unary_inter_Wj.als$quest_num,
  z = cos_dist,
  type = "heatmap"
)

#cos_dist<-((unary_inter_Wj.als)%*% t(unary_inter_Wj.als))/norm(as.matrix(unary_inter_Wj.als)) 

#library(heatmaply)
#heatmaply(as.data.frame(cos_dist), k_col = 2, k_row = 3,na.value = "grey50",row_dend_left=TRUE, return_ppxpy = TRUE) %>% layout(margin = list(l = 130, b = 40))

# Node table for the network: one node per question id.
Nodes <- data.frame(id = c(qid))
Nodes$label <- c(qid)
# NOTE(review): `level` is defined outside this chunk; the fixed slice
# 1015:1074 assumes exactly 60 questions - confirm it matches length(qid).
Nodes$group <- level[1015:1074]
Nodes$title <- qid

# Edge table: one edge per strictly-lower-triangle entry of the distance
# matrix, with the distance as the edge value.
Links <- data.frame(value = cos_dist[lower.tri(cos_dist, diag = FALSE)])
elements <- which(lower.tri(cos_dist, diag = FALSE))

# Convert column-major linear indices back to (row, column) positions using
# the actual matrix size instead of a hard-coded 60.
n_questions <- nrow(cos_dist)
row_val <- ((elements - 1) %% n_questions) + 1
Links$from <- qid[row_val]
col_val <- ((elements - 1) %/% n_questions) + 1
Links$to <- qid[col_val]

# Keep only the links whose distance value exceeds 1.4.
sub_Links <- Links[Links$value > 1.4, ]

# Interactive network: nodes selectable by group or id, neighbours within two
# hops highlighted (also on hover), with a legend.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
gr <- visNetwork(nodes = Nodes, edges = sub_Links) %>%
  visNodes(shadow = TRUE) %>%
  visOptions(
    selectedBy = "group",
    highlightNearest = list(enabled = TRUE, degree = 2, hover = TRUE),
    nodesIdSelection = TRUE
  ) %>%
  visLegend()

# Save the widget as a standalone HTML file and embed it in the document.
gr %>% visSave(file = "LibFMCoeffGraph.html")
htmltools::includeHTML("LibFMCoeffGraph.html")

NDDT_LibFM

Adarsa

19 September 2016