Using the TagMe api, we retrieve keywords for Maths class 6 textbook from NROER repo. The following scores are obtained for each keywords:
Each keyword is then tagged as belonging/not belonging to category-Mathematics. A classifier is build on the curated dataset using logistic regression model.
Bootstarp to get accuracy
accuracy_list<-c()
for (iter in 1:500) {
train_size=floor(dim(MyData)[1]*0.8)
train_index=sample(1:dim(MyData)[1], train_size)
test_index=c(1:dim(MyData)[1])[!c(1:dim(MyData)[1]) %in% train_index]
train_data<-MyData[train_index,]
test_data<-MyData[test_index,]
model <- glm(target ~link_probability+rho+relatedness,family=binomial(link='logit'),data=train_data)
fitted.results <- predict(model,newdata=test_data,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
#test_data$target
misClasificError <- mean(fitted.results != test_data$target)
accuracy_list[iter]=(1-misClasificError)
}
print (paste("Accuracy obtained:",mean(accuracy_list[1:500])))
## [1] "Accuracy obtained: 0.928850715746421"
Model and obtain FPR-TPR curve
train_size=floor(dim(MyData)[1]*0.8)
train_index=sample(1:dim(MyData)[1], train_size)
test_index=c(1:dim(MyData)[1])[!c(1:dim(MyData)[1]) %in% train_index]
train_data<-MyData[train_index,]
test_data<-MyData[test_index,]
model <- glm(target ~link_probability+rho+relatedness,family=binomial(link='logit'),data=train_data)
summary(model)
##
## Call:
## glm(formula = target ~ link_probability + rho + relatedness,
## family = binomial(link = "logit"), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5611 -0.2203 -0.0791 -0.0410 3.4849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.1978 0.3786 -19.01 <2e-16 ***
## link_probability -10.4965 1.0295 -10.20 <2e-16 ***
## rho 28.5130 2.0486 13.92 <2e-16 ***
## relatedness 8.6204 0.7280 11.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1744.97 on 1951 degrees of freedom
## Residual deviance: 641.22 on 1948 degrees of freedom
## AIC: 649.22
##
## Number of Fisher Scoring iterations: 7
#print anova(model, test="Chisq")
p <- predict(model, newdata=test_data, type="response")
pr <- prediction(p, test_data$target)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9543681