Classification of user’s knowledge into different levels
ATTRIBUTES :
STG (The degree of study time for goal object materials), (input value)
SCG (The degree of repetition number of user for goal object materials) (input value)
STR (The degree of study time of user for related objects with goal object) (input value)
LPR (The exam performance of user for related objects with goal object) (input value)
PEG (The exam performance of user for goal objects) (input value)
UNS (The knowledge level of user) (target value)
library(readxl)
# Read the second sheet of the workbook and hold it as a plain data frame.
training_set <- as.data.frame(
  read_excel(
    path = "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls",
    sheet = 2
  )
)
# Inspect column types first, then per-column summary statistics.
str(training_set)
summary(training_set)
## 'data.frame': 258 obs. of 9 variables:
## $ STG : num 0 0.08 0.06 0.1 0.08 0.09 0.1 0.15 0.2 0 ...
## $ SCG : num 0 0.08 0.06 0.1 0.08 0.15 0.1 0.02 0.14 0 ...
## $ STR : num 0 0.1 0.05 0.15 0.08 0.4 0.43 0.34 0.35 0.5 ...
## $ LPR : num 0 0.24 0.25 0.65 0.98 0.1 0.29 0.4 0.72 0.2 ...
## $ PEG : num 0 0.9 0.33 0.3 0.24 0.66 0.56 0.01 0.25 0.85 ...
## $ UNS : chr "very_low" "High" "Low" "Middle" ...
## $ ...7 : logi NA NA NA NA NA NA ...
## $ ...8 : logi NA NA NA NA NA NA ...
## $ Attribute Information:: chr "STG (The degree of study time for goal object materails)," "SCG (The degree of repetition number of user for goal object materails)" "STR (The degree of study time of user for related objects with goal object)" "LPR (The exam performance of user for related objects with goal object)" ...
## STG SCG STR LPR
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2407 1st Qu.:0.2100 1st Qu.:0.2913 1st Qu.:0.2500
## Median :0.3270 Median :0.3025 Median :0.4900 Median :0.3300
## Mean :0.3711 Mean :0.3557 Mean :0.4680 Mean :0.4327
## 3rd Qu.:0.4950 3rd Qu.:0.4975 3rd Qu.:0.6900 3rd Qu.:0.6475
## Max. :0.9900 Max. :0.9000 Max. :0.9500 Max. :0.9900
## PEG UNS ...7 ...8
## Min. :0.0000 Length:258 Mode:logical Mode:logical
## 1st Qu.:0.2500 Class :character NA's:258 NA's:258
## Median :0.5000 Mode :character
## Mean :0.4585
## 3rd Qu.:0.6600
## Max. :0.9300
## Attribute Information:
## Length:258
## Class :character
## Mode :character
##
##
##
We can see that, after the sixth column, the .xls sheet contains only explanatory notes, so those extra columns are removed. UNS is the target variable and will be encoded as a factor. The summary shows no NA values in the six data columns, so the training set needs no further cleaning.
# Keep only the first six columns (STG, SCG, STR, LPR, PEG, UNS).
training_set <- training_set[seq_len(6)]
# Check the structure again to confirm only the required attributes remain.
str(training_set)
## 'data.frame': 258 obs. of 6 variables:
## $ STG: num 0 0.08 0.06 0.1 0.08 0.09 0.1 0.15 0.2 0 ...
## $ SCG: num 0 0.08 0.06 0.1 0.08 0.15 0.1 0.02 0.14 0 ...
## $ STR: num 0 0.1 0.05 0.15 0.08 0.4 0.43 0.34 0.35 0.5 ...
## $ LPR: num 0 0.24 0.25 0.65 0.98 0.1 0.29 0.4 0.72 0.2 ...
## $ PEG: num 0 0.9 0.33 0.3 0.24 0.66 0.56 0.01 0.25 0.85 ...
## $ UNS: chr "very_low" "High" "Low" "Middle" ...
# Turn the target into a factor whose levels run from lowest to highest
# knowledge level.
training_set$UNS <- factor(
  training_set$UNS,
  levels = c("very_low", "Low", "Middle", "High")
)
# Show the levels of UNS to verify the encoding.
levels(training_set$UNS)
## [1] "very_low" "Low" "Middle" "High"
#View(training_set)
Looking at the dataset with the View() command, we can conclude that
UNS is the target variable and the remaining five attributes are the
independent variables.
CLASSIFICATION algorithms
library(ggplot2)
# Density of each numeric attribute, split by knowledge level (UNS).
# Each panel's x-axis label names the attribute that panel actually plots
# (previously every panel was labelled "STG").
d1 <- ggplot(training_set, aes(x = STG, fill = UNS)) +
  xlab("STG") + ylab("Density") +
  geom_density(alpha = 0.4)
d2 <- ggplot(training_set, aes(x = SCG, fill = UNS)) +
  xlab("SCG") + ylab("Density") +
  geom_density(alpha = 0.4)
d3 <- ggplot(training_set, aes(x = STR, fill = UNS)) +
  xlab("STR") + ylab("Density") +
  geom_density(alpha = 0.4)
d4 <- ggplot(training_set, aes(x = LPR, fill = UNS)) +
  xlab("LPR") + ylab("Density") +
  geom_density(alpha = 0.4)
d5 <- ggplot(training_set, aes(x = PEG, fill = UNS)) +
  xlab("PEG") + ylab("Density") +
  geom_density(alpha = 0.4) + ggtitle("20MID0144")
text <- "DENSITY PLOTS OF ALL THE NUMERIC ATTRIBUTES WRT ALL UNS LEVELS"
# Create a text grob that serves as the figure heading
library(ggpubr)
tgrob <- text_grob(text, size = 12)
# Wrap the grob as a ggplot so it can be placed in the grid
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 0, 0, 7, "cm"))
# First row: heading plus an empty cell; the five panels fill the rows below.
ggarrange(plot_0, NULL, d1, d2, d3, d4, d5,
          ncol = 2, nrow = 4)
#2) hist
# Histogram (20 bins) of each numeric attribute.
# Each panel's x-axis label names the attribute that panel actually plots
# (previously every panel was labelled "STG").
h1 <- ggplot(training_set, aes(x = STG)) +
  geom_histogram(fill = "orange", alpha = 1, position = "identity", bins = 20) +
  xlab("STG") + ylab("Frequency")
h2 <- ggplot(training_set, aes(x = SCG)) +
  geom_histogram(fill = "orange", alpha = 1, position = "identity", bins = 20) +
  xlab("SCG") + ylab("Frequency")
h3 <- ggplot(training_set, aes(x = STR)) +
  geom_histogram(fill = "orange", alpha = 1, position = "identity", bins = 20) +
  xlab("STR") + ylab("Frequency")
h4 <- ggplot(training_set, aes(x = LPR)) +
  geom_histogram(fill = "orange", alpha = 1, position = "identity", bins = 20) +
  xlab("LPR") + ylab("Frequency")
h5 <- ggplot(training_set, aes(x = PEG)) +
  geom_histogram(fill = "orange", alpha = 1, position = "identity", bins = 20) +
  ggtitle("20MID0144") + xlab("PEG") + ylab("Frequency")
text <- "HISTOGRAMS OF THE NUMERIC ATTRIBUTES"
# Create a text grob that serves as the figure heading
tgrob <- text_grob(text, size = 15)
# Wrap the grob as a ggplot so it can be placed in the grid
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 3, 0, 6, "cm"))
# First row: heading plus an empty cell; the five panels fill the rows below.
ggarrange(plot_0, NULL, h1, h2, h3, h4, h5,
          ncol = 2, nrow = 4)
From the above histograms (drawn with bins = 20), we can see the counts in each value range, and we also see that the distribution of the PEG column is highly irregular.
# Scatter-plot matrix of all six columns; each point is filled with the
# colour assigned to its UNS level (the integer level code indexes the palette).
pairs(
  training_set[, 1:6],
  main = "Scatter plot-20MID0144",
  pch = 21,
  bg = c("#1b9e77", "#d95f02", "#7570b3", "yellow")[as.integer(training_set$UNS)]
)
From the above scatter-plot matrix, we can see the pairwise association between the variables.
# Pearson correlation matrix of the five numeric attributes (UNS, column 6,
# is excluded), rounded to two decimals for display.
cormat <- round(cor(training_set[-6]), 2)
#cormat
library(reshape2)
# Long format: one row per (Var1, Var2, value) cell of the matrix.
melted_cormat <- melt(cormat)
ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  # `limits` is spelled out in full rather than relying on partial matching
  # of `limit`.
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  # The labels inherit the same x/y mapping as the tiles, so each number is
  # drawn on the tile it belongs to.
  geom_text(aes(label = value), color = "black", size = 4) +
  guides(fill = guide_colorbar(barwidth = 7, barheight = 8,
                               title.position = "top", title.hjust = 0.5)) +
  ggtitle("HEAT MAP FOR CORRELATION COEFFICIENTS", subtitle = "20MID0144")
print(ggheatmap)
# Number of training rows in each UNS level.
ggplot(training_set, aes(x = UNS)) +
  geom_bar(fill = "orange") +
  labs(
    x = "UNS LEVELS",
    y = "Frequency",
    title = "BARPLOT FOR DIFFERENT UNS Levels",
    subtitle = "20MID0144"
  )
From the bar plot, we can see that the Low and Middle levels occur more frequently than the other two levels.
#6) area plot
# Binned (20 bins) area plot of each numeric attribute, filled by UNS level.
a1 <- ggplot(training_set, aes(x = STG, fill = UNS)) +
  labs(x = "STG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")
a2 <- ggplot(training_set, aes(x = SCG, fill = UNS)) +
  labs(x = "SCG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")
a3 <- ggplot(training_set, aes(x = STR, fill = UNS)) +
  labs(x = "STR", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")
a4 <- ggplot(training_set, aes(x = LPR, fill = UNS)) +
  labs(x = "LPR", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")
a5 <- ggplot(training_set, aes(x = PEG, fill = UNS)) +
  labs(x = "PEG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2") +
  ggtitle("20MID0144")
text <- "AREA PLOTS OF ALL THE NUMERIC ATTRIBUTES WRT TO THE UNS LEVELS"
# Heading for the figure, built as a text grob
tgrob <- text_grob(text, size = 12)
# Wrap the grob as a ggplot so it can sit in the grid
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 0, 0, 7, "cm"))
# Heading and an empty cell on the first row, then the five panels.
ggarrange(
  plot_0, NULL, a1, a2, a3, a4, a5,
  ncol = 2, nrow = 4
)
# Load the third sheet of the workbook as the test set and keep the same six
# columns that were kept for the training data.
test_set <- as.data.frame(
  read_excel("Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls", sheet = 3)
)
test_set <- test_set[1:6]

# Standardise the five predictors. The test set is transformed with the
# TRAINING set's centre and spread: scaling it with its own mean/sd would put
# the two sets on different scales, so thresholds and coefficients learned on
# the training data would not line up with the test values.
train_scaled <- scale(training_set[-6])
test_set[-6] <- scale(
  test_set[-6],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
training_set[-6] <- train_scaled

# Encode UNS as "0".."3" in both sets. Labels are normalised first (trimmed,
# lower-cased, spaces/underscores unified) because the str() output recorded
# for the test set shows NA in UNS, i.e. at least one class is spelled
# differently there than in the training sheet (presumably "Very Low" instead
# of "very_low" -- confirm against the workbook). Without this, those rows
# become NA and are silently left out of every confusion matrix.
# NOTE: the outputs recorded further down in this document were produced
# before this correction and will change when the document is re-run.
uns_levels <- c("very_low", "Low", "Middle", "High")
normalise_uns <- function(x) {
  key <- tolower(gsub("[[:space:]_]+", "_", trimws(as.character(x))))
  factor(key, levels = tolower(uns_levels), labels = c(0, 1, 2, 3))
}
training_set$UNS <- normalise_uns(training_set$UNS)
test_set$UNS <- normalise_uns(test_set$UNS)

# Fail loudly rather than silently if a label is still unrecognised.
if (anyNA(test_set$UNS)) {
  warning(sum(is.na(test_set$UNS)),
          " test rows have an unrecognised UNS label and will be ignored",
          call. = FALSE)
}
str(test_set)
## 'data.frame': 145 obs. of 6 variables:
## $ STG: num -1.514 -1.278 -1.137 -0.571 -0.477 ...
## $ SCG: num -1.153 -1.377 -0.793 -0.703 -0.613 ...
## $ STR: num 0.245 0.447 0.77 0.972 1.86 ...
## $ LPR: num -0.616 0.624 0.624 0.879 -0.47 ...
## $ PEG: num -1.4 -1.09 1.38 1.38 1.56 ...
## $ UNS: Factor w/ 4 levels "0","1","2","3": NA 2 4 4 4 2 3 2 4 NA ...
naiveBayes() command
#fitting a model
library(e1071)
# Fit a naive Bayes classifier: the five predictors are the inputs and UNS
# is the class label.
classifier <- naiveBayes(
  training_set[-6],
  training_set$UNS
)
summary(classifier)
## Length Class Mode
## apriori 4 table numeric
## tables 5 -none- list
## levels 4 -none- character
## isnumeric 5 -none- logical
## call 3 -none- call
# Predict the class of every test row, then cross-tabulate the actual class
# (rows) against the predicted class (columns).
y_pred <- predict(classifier, newdata = test_set)
cm <- table(test_set[[6]], y_pred)
print(cm)
## y_pred
## 0 1 2 3
## 0 0 0 0 0
## 1 0 38 8 0
## 2 0 9 25 0
## 3 0 0 0 39
# Accuracy in percent: diagonal (correct) count over all tabulated cases.
sum(diag(cm)) / sum(cm) * 100
## [1] 85.71429
we can see that, the above model gives an accuracy of ~ 85.71%
tidyverse, caret, and nnet
library(tidyverse)
library(caret)
library(nnet)
multinom() command in nnet package
model <- nnet::multinom(UNS ~., data = training_set)
## # weights: 28 (18 variable)
## initial value 357.663945
## iter 10 value 98.169778
## iter 20 value 35.130686
## iter 30 value 24.474775
## iter 40 value 23.469519
## iter 50 value 22.993293
## iter 60 value 22.920808
## iter 70 value 22.848065
## iter 80 value 22.343796
## iter 90 value 22.325331
## iter 100 value 22.299564
## final value 22.299564
## stopped after 100 iterations
# Print the fitted multinomial model: the call, per-class coefficients and
# their standard errors, the residual deviance and the AIC.
summary(model)
## Call:
## nnet::multinom(formula = UNS ~ ., data = training_set)
##
## Coefficients:
## (Intercept) STG SCG STR LPR PEG
## 1 77.41062 -1.983256 6.202435 5.196318 17.84152 63.45894
## 2 90.43958 -1.271745 7.319614 6.038471 22.97499 87.97788
## 3 82.03361 -1.715060 7.172355 7.311405 28.06319 99.33910
##
## Std. Errors:
## (Intercept) STG SCG STR LPR PEG
## 1 53.12772 2.924104 4.202697 3.985085 12.17329 43.90764
## 2 53.61327 3.006502 4.249308 4.050227 12.33034 45.33717
## 3 53.66317 3.132257 4.322763 4.157583 12.45438 45.45024
##
## Residual Deviance: 44.59913
## AIC: 80.59913
# Predict the class of each test row with the multinomial model.
predicted.classes <- predict(model, test_set)
# Actual class (rows) against predicted class (columns).
cm <- table(test_set[["UNS"]], predicted.classes)
# Accuracy in percent: diagonal (correct) count over all tabulated cases.
sum(diag(cm)) / sum(cm) * 100
## [1] 90.7563
we can see that the above model gives an accuracy of ~ 90.76%
rpart() command
library(rpart)
# Grow a classification tree for UNS from all other columns.
fit <- rpart(UNS ~ ., data = training_set, method = "class")
# Node-by-node report: complexity table, variable importance and splits.
summary(fit)
## Call:
## rpart(formula = UNS ~ ., data = training_set, method = "class")
## n= 258
##
## CP nsplit rel error xerror xstd
## 1 0.41764706 0 1.0000000 1.0647059 0.04323405
## 2 0.30000000 1 0.5823529 0.5941176 0.04611601
## 3 0.07647059 2 0.2823529 0.2941176 0.03734719
## 4 0.05882353 3 0.2058824 0.2941176 0.03734719
## 5 0.02941176 4 0.1470588 0.1823529 0.03072098
## 6 0.01764706 5 0.1176471 0.1705882 0.02984406
## 7 0.01000000 6 0.1000000 0.1529412 0.02844277
##
## Variable importance
## PEG LPR SCG STG STR
## 58 24 7 7 3
##
## Node number 1: 258 observations, complexity param=0.4176471
## predicted class=2 expected loss=0.6589147 P(node) =1
## class counts: 24 83 88 63
## probabilities: 0.093 0.322 0.341 0.244
## left son=2 (115 obs) right son=3 (143 obs)
## Primary splits:
## PEG < -0.4781879 to the left, improve=58.925770, (0 missing)
## LPR < -0.4744433 to the right, improve= 9.581884, (0 missing)
## STG < 2.110858 to the left, improve= 5.754316, (0 missing)
## STR < -1.394666 to the left, improve= 3.552632, (0 missing)
## SCG < 1.978302 to the left, improve= 3.174797, (0 missing)
## Surrogate splits:
## LPR < -0.111698 to the right, agree=0.678, adj=0.278, (0 split)
## STG < -0.4429864 to the left, agree=0.612, adj=0.130, (0 split)
## STR < -0.4188175 to the left, agree=0.589, adj=0.078, (0 split)
## SCG < -1.159048 to the left, agree=0.581, adj=0.061, (0 split)
##
## Node number 2: 115 observations, complexity param=0.07647059
## predicted class=1 expected loss=0.2956522 P(node) =0.4457364
## class counts: 24 81 10 0
## probabilities: 0.209 0.704 0.087 0.000
## left son=4 (27 obs) right son=5 (88 obs)
## Primary splits:
## PEG < -1.306914 to the left, improve=17.2446500, (0 missing)
## LPR < 1.440045 to the left, improve= 4.5642740, (0 missing)
## SCG < -0.6164983 to the left, improve= 4.2460360, (0 missing)
## STR < 1.004295 to the left, improve= 1.7821750, (0 missing)
## STG < -0.1861752 to the left, improve= 0.5858356, (0 missing)
## Surrogate splits:
## STG < -1.503522 to the left, agree=0.800, adj=0.148, (0 split)
## SCG < -1.324171 to the left, agree=0.791, adj=0.111, (0 split)
## LPR < -1.643289 to the left, agree=0.783, adj=0.074, (0 split)
##
## Node number 3: 143 observations, complexity param=0.3
## predicted class=2 expected loss=0.4545455 P(node) =0.5542636
## class counts: 0 2 78 63
## probabilities: 0.000 0.014 0.545 0.441
## left son=6 (90 obs) right son=7 (53 obs)
## Primary splits:
## PEG < 0.8481662 to the left, improve=47.975730, (0 missing)
## LPR < -0.4744433 to the left, improve=13.154490, (0 missing)
## STG < 2.110858 to the left, improve= 4.494858, (0 missing)
## SCG < 2.072659 to the left, improve= 2.471749, (0 missing)
## STR < -1.394666 to the right, improve= 1.413753, (0 missing)
## Surrogate splits:
## STG < 2.229752 to the left, agree=0.671, adj=0.113, (0 split)
## SCG < 2.143426 to the left, agree=0.657, adj=0.075, (0 split)
## LPR < -0.4744433 to the left, agree=0.657, adj=0.075, (0 split)
## STR < -1.394666 to the right, agree=0.650, adj=0.057, (0 split)
##
## Node number 4: 27 observations, complexity param=0.02941176
## predicted class=0 expected loss=0.2592593 P(node) =0.1046512
## class counts: 20 7 0 0
## probabilities: 0.741 0.259 0.000 0.000
## left son=8 (18 obs) right son=9 (9 obs)
## Primary splits:
## LPR < 0.75486 to the left, improve=7.2592590, (0 missing)
## SCG < -0.3334292 to the left, improve=1.8409590, (0 missing)
## STG < -1.277623 to the right, improve=1.3177390, (0 missing)
## PEG < -1.424464 to the left, improve=0.5571836, (0 missing)
## STR < -0.70344 to the left, improve=0.2560847, (0 missing)
## Surrogate splits:
## SCG < 0.4450111 to the left, agree=0.815, adj=0.444, (0 split)
##
## Node number 5: 88 observations, complexity param=0.01764706
## predicted class=1 expected loss=0.1590909 P(node) =0.3410853
## class counts: 4 74 10 0
## probabilities: 0.045 0.841 0.114 0.000
## left son=10 (81 obs) right son=11 (7 obs)
## Primary splits:
## LPR < 1.440045 to the left, improve=5.1035750, (0 missing)
## PEG < -0.8367149 to the left, improve=0.9955754, (0 missing)
## SCG < 0.7516694 to the left, improve=0.8542403, (0 missing)
## STR < -1.597968 to the left, improve=0.6591310, (0 missing)
## STG < 1.112148 to the left, improve=0.3628347, (0 missing)
##
## Node number 6: 90 observations, complexity param=0.05882353
## predicted class=2 expected loss=0.1444444 P(node) =0.3488372
## class counts: 0 2 77 11
## probabilities: 0.000 0.022 0.856 0.122
## left son=12 (80 obs) right son=13 (10 obs)
## Primary splits:
## LPR < 1.681876 to the left, improve=16.9083300, (0 missing)
## SCG < 1.364986 to the left, improve= 1.9301370, (0 missing)
## STG < -0.3526269 to the left, improve= 0.9333333, (0 missing)
## STR < 0.8009934 to the left, improve= 0.7213141, (0 missing)
## PEG < 0.005725619 to the left, improve= 0.5583333, (0 missing)
## Surrogate splits:
## SCG < 2.072659 to the left, agree=0.911, adj=0.2, (0 split)
##
## Node number 7: 53 observations
## predicted class=3 expected loss=0.01886792 P(node) =0.2054264
## class counts: 0 0 1 52
## probabilities: 0.000 0.000 0.019 0.981
##
## Node number 8: 18 observations
## predicted class=0 expected loss=0 P(node) =0.06976744
## class counts: 18 0 0 0
## probabilities: 1.000 0.000 0.000 0.000
##
## Node number 9: 9 observations
## predicted class=1 expected loss=0.2222222 P(node) =0.03488372
## class counts: 2 7 0 0
## probabilities: 0.222 0.778 0.000 0.000
##
## Node number 10: 81 observations
## predicted class=1 expected loss=0.1111111 P(node) =0.3139535
## class counts: 4 72 5 0
## probabilities: 0.049 0.889 0.062 0.000
##
## Node number 11: 7 observations
## predicted class=2 expected loss=0.2857143 P(node) =0.02713178
## class counts: 0 2 5 0
## probabilities: 0.000 0.286 0.714 0.000
##
## Node number 12: 80 observations
## predicted class=2 expected loss=0.0375 P(node) =0.3100775
## class counts: 0 2 77 1
## probabilities: 0.000 0.025 0.962 0.012
##
## Node number 13: 10 observations
## predicted class=3 expected loss=0 P(node) =0.03875969
## class counts: 0 0 0 10
## probabilities: 0.000 0.000 0.000 1.000
the summary of the fit explains how the decision tree is split at every node based on the mathematical calculations made at every step.
rpart.plot() command
library(rpart.plot)
# Draw the fitted tree, then add a title and a legend box to the same plot.
rpart.plot(fit)
title(main = "DECISION TREE GENERATED")
legend(0.2, 0.92, legend = "20MID0144")
# Class predictions of the tree for the test rows, cross-tabulated against
# the actual class (rows = actual, columns = predicted).
predict_unseen <- predict(fit, newdata = test_set, type = "class")
cm <- table(test_set[["UNS"]], predict_unseen)
print(cm)
## predict_unseen
## 0 1 2 3
## 0 0 0 0 0
## 1 0 27 19 0
## 2 0 2 32 0
## 3 0 0 1 38
# Accuracy in percent: diagonal (correct) count over all tabulated cases.
sum(diag(cm)) / sum(cm) * 100
## [1] 81.51261
we can see that the above model gives an accuracy of ~ 81.51%
The MULTINOMIAL LOGISTIC REGRESSION algorithm (multinom) gives the best
results (90.76%) in this case.