TagMe api provides a rho value and link probability to each keyword. Here we use these params of keywords extracted for Maths class 6 textbook from NROER repo. In addition to rho and and link probability we also use the relatedness value of these keywords w.r.t to Mathematics. Each Content is then tagged as belonging/not belonging to category-Mathematics. A classifier is build on the curated dataset using logistic regression model.
Bootstarp to get accuracy
accuracy_list<-c()
for (iter in 1:500) {
train_size=floor(dim(MyData)[1]*0.8)
train_index=sample(1:dim(MyData)[1], train_size)
test_index=c(1:dim(MyData)[1])[!c(1:dim(MyData)[1]) %in% train_index]
train_data<-MyData[train_index,]
test_data<-MyData[test_index,]
model <- glm(target ~link_probability+rho+relatedness,family=binomial(link='logit'),data=train_data)
fitted.results <- predict(model,newdata=test_data,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
#test_data$target
misClasificError <- mean(fitted.results != test_data$target)
accuracy_list[iter]=(1-misClasificError)
}
print ("Accuracy obtained:")
## [1] "Accuracy obtained:"
print(mean(accuracy_list[1:500]))
## [1] 0.928638
train_size=floor(dim(MyData)[1]*0.8)
train_index=sample(1:dim(MyData)[1], train_size)
test_index=c(1:dim(MyData)[1])[!c(1:dim(MyData)[1]) %in% train_index]
train_data<-MyData[train_index,]
test_data<-MyData[test_index,]
model <- glm(target ~link_probability+rho+relatedness,family=binomial(link='logit'),data=train_data)
summary(model)
##
## Call:
## glm(formula = target ~ link_probability + rho + relatedness,
## family = binomial(link = "logit"), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5458 -0.2222 -0.0815 -0.0435 3.4542
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.0677 0.3696 -19.12 <2e-16 ***
## link_probability -10.0622 1.0000 -10.06 <2e-16 ***
## rho 27.4880 1.9959 13.77 <2e-16 ***
## relatedness 8.7435 0.7272 12.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1738.46 on 1951 degrees of freedom
## Residual deviance: 650.07 on 1948 degrees of freedom
## AIC: 658.07
##
## Number of Fisher Scoring iterations: 7
#print anova(model, test="Chisq")
p <- predict(model, newdata=test_data, type="response")
pr <- prediction(p, test_data$target)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9573683