Aim : To develop and create a suitable machine learning model for the dataset - “User Knowledge Modeling Data Set”

Dataset Information :

Classification of user’s knowledge into different levels

ATTRIBUTES :

STG (The degree of study time for goal object materials), (input value)

SCG (The degree of repetition number of user for goal object materials) (input value)

STR (The degree of study time of user for related objects with goal object) (input value)

LPR (The exam performance of user for related objects with goal object) (input value)

PEG (The exam performance of user for goal objects) (input value)

UNS (The knowledge level of user) (target value)

Loading The Data Set

library(readxl)

# Read sheet 2 of the workbook and store it as a plain data.frame
training_set <- as.data.frame(
  read_excel(
    path = "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls",
    sheet = 2
  )
)

Looking at the structure and summary of the dataset to find the data types and statistics of all the attributes

# Column types and a preview of the values, followed by per-column statistics
utils::str(training_set)
summary(object = training_set)
## 'data.frame':    258 obs. of  9 variables:
##  $ STG                   : num  0 0.08 0.06 0.1 0.08 0.09 0.1 0.15 0.2 0 ...
##  $ SCG                   : num  0 0.08 0.06 0.1 0.08 0.15 0.1 0.02 0.14 0 ...
##  $ STR                   : num  0 0.1 0.05 0.15 0.08 0.4 0.43 0.34 0.35 0.5 ...
##  $ LPR                   : num  0 0.24 0.25 0.65 0.98 0.1 0.29 0.4 0.72 0.2 ...
##  $ PEG                   : num  0 0.9 0.33 0.3 0.24 0.66 0.56 0.01 0.25 0.85 ...
##  $ UNS                   : chr  "very_low" "High" "Low" "Middle" ...
##  $ ...7                  : logi  NA NA NA NA NA NA ...
##  $ ...8                  : logi  NA NA NA NA NA NA ...
##  $ Attribute Information:: chr  "STG (The degree of study time for goal object materails)," "SCG (The degree of repetition number of user for goal object materails)" "STR (The degree of study time of user for related objects with goal object)" "LPR (The exam performance of user for related objects with goal object)" ...
##       STG              SCG              STR              LPR        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2407   1st Qu.:0.2100   1st Qu.:0.2913   1st Qu.:0.2500  
##  Median :0.3270   Median :0.3025   Median :0.4900   Median :0.3300  
##  Mean   :0.3711   Mean   :0.3557   Mean   :0.4680   Mean   :0.4327  
##  3rd Qu.:0.4950   3rd Qu.:0.4975   3rd Qu.:0.6900   3rd Qu.:0.6475  
##  Max.   :0.9900   Max.   :0.9000   Max.   :0.9500   Max.   :0.9900  
##       PEG             UNS              ...7           ...8        
##  Min.   :0.0000   Length:258         Mode:logical   Mode:logical  
##  1st Qu.:0.2500   Class :character   NA's:258       NA's:258      
##  Median :0.5000   Mode  :character                                
##  Mean   :0.4585                                                   
##  3rd Qu.:0.6600                                                   
##  Max.   :0.9300                                                   
##  Attribute Information:
##  Length:258            
##  Class :character      
##  Mode  :character      
##                        
##                        
## 

We can see that the columns after the sixth one only hold explanatory notes from the .xls file, so we remove them. We then encode the UNS attribute, which is the target variable. The summary shows no NA values in the six retained columns, so the training set can be used as it is.

# Keep only the first six columns; the remaining ones are dropped
training_set <- training_set[seq_len(6)]
# Re-inspect the structure to confirm that only the required attributes remain
str(training_set)
## 'data.frame':    258 obs. of  6 variables:
##  $ STG: num  0 0.08 0.06 0.1 0.08 0.09 0.1 0.15 0.2 0 ...
##  $ SCG: num  0 0.08 0.06 0.1 0.08 0.15 0.1 0.02 0.14 0 ...
##  $ STR: num  0 0.1 0.05 0.15 0.08 0.4 0.43 0.34 0.35 0.5 ...
##  $ LPR: num  0 0.24 0.25 0.65 0.98 0.1 0.29 0.4 0.72 0.2 ...
##  $ PEG: num  0 0.9 0.33 0.3 0.24 0.66 0.56 0.01 0.25 0.85 ...
##  $ UNS: chr  "very_low" "High" "Low" "Middle" ...
# Turn UNS into a factor with an explicit level order (lowest to highest)
training_set$UNS <- factor(
  x = training_set$UNS,
  levels = c("very_low", "Low", "Middle", "High")
)
# Print the levels of UNS to verify the factor conversion
levels(training_set$UNS)
## [1] "very_low" "Low"      "Middle"   "High"
#View(training_set)

Looking at the dataset using the View() command, we can conclude that UNS is the target variable and that the remaining attributes are the independent variables.

Hence, to predict the UNS levels, we can use different CLASSIFICATION algorithms

Exploratory Data Analysis :

Density Plots

library(ggplot2)
library(ggpubr)

# Build the density plot of one numeric attribute, filled by UNS level.
#
# `attribute` is the column name as a string. It is used both to select the
# column and as the x-axis label, so every panel is labelled with the variable
# it actually shows (previously all five panels were labelled "STG").
density_by_uns <- function(attribute) {
  ggplot(training_set, aes(x = .data[[attribute]], fill = UNS)) +
    xlab(attribute) +
    ylab("Density") +
    geom_density(alpha = 0.4)
}

d1 <- density_by_uns("STG")
d2 <- density_by_uns("SCG")
d3 <- density_by_uns("STR")
d4 <- density_by_uns("LPR")
d5 <- density_by_uns("PEG") + ggtitle("20MID0144")

text <- "DENSITY PLOTS OF ALL THE NUMERIC ATTRIBUTES WRT ALL UNS LEVELS"

# Create a text grob that serves as the heading of the plot grid
tgrob <- text_grob(text, size = 12)

# Wrap the grob as a ggplot object so it can be placed in the grid
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 0, 0, 7, "cm"))

# First row: heading plus an empty cell; the five density plots follow
ggarrange(plot_0, NULL, d1, d2, d3, d4, d5,
          ncol = 2, nrow = 4)

Histograms

#2) hist
# Build the 20-bin histogram of one numeric attribute.
#
# `attribute` is the column name as a string. It is used both to select the
# column and as the x-axis label, so every panel is labelled with the variable
# it actually shows (previously all five panels were labelled "STG").
histogram_of <- function(attribute) {
  ggplot(training_set, aes(x = .data[[attribute]])) +
    geom_histogram(fill = "orange", alpha = 1, position = "identity",
                   bins = 20) +
    xlab(attribute) +
    ylab("Frequency")
}

h1 <- histogram_of("STG")
h2 <- histogram_of("SCG")
h3 <- histogram_of("STR")
h4 <- histogram_of("LPR")
h5 <- histogram_of("PEG") + ggtitle("20MID0144")

text <- "HISTOGRAMS OF THE NUMERIC ATTRIBUTES"

# Create a text grob that serves as the heading of the plot grid
tgrob <- text_grob(text, size = 15)

# Wrap the grob as a ggplot object so it can be placed in the grid
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 3, 0, 6, "cm"))

# First row: heading plus an empty cell; the five histograms follow
ggarrange(plot_0, NULL, h1, h2, h3, h4, h5,
          ncol = 2, nrow = 4)

From the above histograms, we can see the counts for the different value ranges with bins = 20, and we can also see that the distribution of the PEG column is highly irregular.

Scatterplot

# Scatterplot matrix of all six columns; point fill colour is picked from the
# four-colour palette by the integer code of each row's UNS level
pairs(
  x = training_set[1:6],
  main = "Scatter plot-20MID0144",
  pch = 21,
  bg = c("#1b9e77", "#d95f02", "#7570b3", "yellow")[as.integer(training_set$UNS)]
)

from the above scatterplot , we can see the association between variables

Heat Map

# Correlation matrix of every column except the 6th (UNS), rounded to two
# decimals so the values can be printed inside the tiles
cormat <- round(cor(training_set[-6]), 2)
#cormat
library(reshape2)
# Long format: one row per cell of the matrix (Var1, Var2, value)
melted_cormat <- melt(cormat)

ggheatmap <- ggplot(melted_cormat, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  # `limits` is spelled out in full; the former `limit` was only accepted
  # through partial argument matching
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  # The labels inherit x/y from the plot-level mapping so that each value is
  # always drawn on its own tile (the old layer swapped Var1 and Var2)
  geom_text(aes(label = value), color = "black", size = 4) +
  guides(fill = guide_colorbar(barwidth = 7, barheight = 8,
                               title.position = "top", title.hjust = 0.5)) +
  ggtitle("HEAT MAP FOR CORRELATION COEFFICIENTS", subtitle = "20MID0144")
print(ggheatmap)

Barplot

# Bar chart of the number of training rows per UNS level
ggplot(training_set, mapping = aes(x = UNS)) +
  labs(x = "UNS LEVELS", y = "Frequency") +
  geom_bar(fill = "orange") +
  ggtitle("BARPLOT FOR DIFFERENT UNS Levels", subtitle = "20MID0144")

From the bar plot, we can see that the Low and Middle levels have a higher frequency than the other two levels.

Area Plots

#6) area plot
# One binned (20 bins) area plot per numeric attribute, filled by UNS level
a1 <- ggplot(training_set, aes(x = STG, fill = UNS)) +
  labs(x = "STG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")

a2 <- ggplot(training_set, aes(x = SCG, fill = UNS)) +
  labs(x = "SCG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")

a3 <- ggplot(training_set, aes(x = STR, fill = UNS)) +
  labs(x = "STR", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")

a4 <- ggplot(training_set, aes(x = LPR, fill = UNS)) +
  labs(x = "LPR", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2")

a5 <- ggplot(training_set, aes(x = PEG, fill = UNS)) +
  labs(x = "PEG", y = "Area") +
  geom_area(stat = "bin", alpha = 0.6, bins = 20) +
  scale_fill_brewer(palette = "Dark2") +
  ggtitle("20MID0144")

text <- "AREA PLOTS OF ALL THE NUMERIC ATTRIBUTES WRT TO THE UNS LEVELS"

# Heading for the grid, built as a text grob
tgrob <- text_grob(text, size = 12)

# Wrap the grob as a ggplot object so it can be arranged with the panels
plot_0 <- as_ggplot(tgrob) + theme(plot.margin = margin(0, 0, 0, 7, "cm"))

# Heading and an empty cell in the first row, then the five area plots
ggarrange(plot_0, NULL, a1, a2, a3, a4, a5,
          ncol = 2, nrow = 4)

Reading Test data

# Read sheet 3 of the same workbook as a plain data.frame
test_set <- as.data.frame(
  read_excel(
    path = "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls",
    sheet = 3
  )
)
# Keep only the first six columns, as was done for the training set
test_set <- test_set[seq_len(6)]

Preprocessing

Feature Scaling

# Standardise the predictors (every column except the 6th, UNS).
#
# The test set is transformed with the TRAINING set's column means and
# standard deviations. Scaling each set with its own statistics (as was done
# before) puts the two sets on different scales, so models fitted on the
# training data would see shifted test inputs.
#
# NOTE: the outputs recorded further down in this report were produced with
# the previous per-set scaling; re-run the analysis to refresh them.
train_scaled <- scale(training_set[-6])
test_set[-6] <- scale(
  test_set[-6],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
training_set[-6] <- train_scaled

Encoding nominal attribute ‘UNS’

# Encode UNS as a factor with labels 0-3 (very low, low, middle, high).
#
# The recorded str(test_set) output shows NA among the encoded test labels,
# which means at least one label in the test sheet does not match the level
# strings exactly (presumably a spelling such as "Very Low" instead of
# "very_low" - confirm against the workbook). Such rows were silently dropped
# from every confusion matrix. Labels are therefore normalised first: trimmed,
# lower-cased, and with spaces/underscores collapsed to a single underscore.
#
# NOTE: the outputs recorded further down in this report were produced before
# this fix; re-run the analysis to refresh them.
normalise_uns <- function(labels) {
  gsub("[ _]+", "_", tolower(trimws(as.character(labels))))
}
uns_levels <- c("very_low", "low", "middle", "high")

training_set$UNS <- factor(normalise_uns(training_set$UNS),
                           levels = uns_levels,
                           labels = c(0, 1, 2, 3))
test_set$UNS <- factor(normalise_uns(test_set$UNS),
                       levels = uns_levels,
                       labels = c(0, 1, 2, 3))

# Fail loudly instead of silently losing rows if a label is still unmatched
if (anyNA(training_set$UNS) || anyNA(test_set$UNS)) {
  warning("Some UNS labels could not be encoded and are NA", call. = FALSE)
}
str(test_set)
## 'data.frame':    145 obs. of  6 variables:
##  $ STG: num  -1.514 -1.278 -1.137 -0.571 -0.477 ...
##  $ SCG: num  -1.153 -1.377 -0.793 -0.703 -0.613 ...
##  $ STR: num  0.245 0.447 0.77 0.972 1.86 ...
##  $ LPR: num  -0.616 0.624 0.624 0.879 -0.47 ...
##  $ PEG: num  -1.4 -1.09 1.38 1.38 1.56 ...
##  $ UNS: Factor w/ 4 levels "0","1","2","3": NA 2 4 4 4 2 3 2 4 NA ...

MODEL 1: NAIVE BAYES CLASSIFIER

Fitting a model using naiveBayes() command

#fitting a model

library(e1071)

# Fit naive Bayes: predictors are all columns except the 6th, response is UNS
classifier <- naiveBayes(
  x = training_set[-6],
  y = training_set$UNS
)

summary of the classifier

# Overview of the components stored in the fitted naive Bayes object
summary(object = classifier)
##           Length Class  Mode     
## apriori   4      table  numeric  
## tables    5      -none- list     
## levels    4      -none- character
## isnumeric 5      -none- logical  
## call      3      -none- call

Prediction for test data

# Predicted UNS class for every row of the test set
y_pred <- predict(classifier, newdata = test_set)

Confusion Matrix

# Cross-tabulate actual labels (rows, 6th column of the test set) against the
# naive Bayes predictions (columns), then print the table
cm <- table(test_set[[6]], y_pred)
cm
##    y_pred
##      0  1  2  3
##   0  0  0  0  0
##   1  0 38  8  0
##   2  0  9 25  0
##   3  0  0  0 39

Calculating Accuracy

# Accuracy in percent: correctly classified (diagonal) over all tabulated rows
sum(diag(cm)) / sum(cm) * 100
## [1] 85.71429

we can see that, the above model gives an accuracy of ~ 85.71%

MODEL 2 : MULTI-VARIATE (MULTINOMIAL) LOGISTIC REGRESSION

Loading the required packages - tidyverse , caret, and nnet

library(tidyverse)
library(caret)
library(nnet)

Fitting a model using multinom() command in nnet package

# Fit UNS on all remaining columns of the training set.
# NOTE(review): the recorded log ends with "stopped after 100 iterations",
# i.e. the fit stopped at the iteration cap - consider raising `maxit`.
model <- nnet::multinom(
  formula = UNS ~ .,
  data = training_set
)
## # weights:  28 (18 variable)
## initial  value 357.663945 
## iter  10 value 98.169778
## iter  20 value 35.130686
## iter  30 value 24.474775
## iter  40 value 23.469519
## iter  50 value 22.993293
## iter  60 value 22.920808
## iter  70 value 22.848065
## iter  80 value 22.343796
## iter  90 value 22.325331
## iter 100 value 22.299564
## final  value 22.299564 
## stopped after 100 iterations

summary of the classifier

# Coefficients, standard errors, residual deviance and AIC of the fit
summary(object = model)
## Call:
## nnet::multinom(formula = UNS ~ ., data = training_set)
## 
## Coefficients:
##   (Intercept)       STG      SCG      STR      LPR      PEG
## 1    77.41062 -1.983256 6.202435 5.196318 17.84152 63.45894
## 2    90.43958 -1.271745 7.319614 6.038471 22.97499 87.97788
## 3    82.03361 -1.715060 7.172355 7.311405 28.06319 99.33910
## 
## Std. Errors:
##   (Intercept)      STG      SCG      STR      LPR      PEG
## 1    53.12772 2.924104 4.202697 3.985085 12.17329 43.90764
## 2    53.61327 3.006502 4.249308 4.050227 12.33034 45.33717
## 3    53.66317 3.132257 4.322763 4.157583 12.45438 45.45024
## 
## Residual Deviance: 44.59913 
## AIC: 80.59913

Prediction for test data

# Predicted class for every row of the test set
predicted.classes <- predict(model, test_set)

Generating confusion matrix, and calculating accuracy

# Actual labels (rows) against the multinom predictions (columns)
cm <- table(test_set[["UNS"]], predicted.classes)
# Accuracy in percent: diagonal of the table over its total
sum(diag(cm)) / sum(cm) * 100
## [1] 90.7563

we can see that the above model gives an accuracy of ~ 90.76%

MODEL 3 : DECISION TREE CLASSIFIER

fitting a model using rpart package’s rpart() command

library(rpart)
# Grow a classification tree for UNS from all remaining training columns
fit <- rpart(formula = UNS ~ ., data = training_set, method = "class")

Summary of the fit

# Detailed per-node report of the fitted tree
summary(object = fit)
## Call:
## rpart(formula = UNS ~ ., data = training_set, method = "class")
##   n= 258 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.41764706      0 1.0000000 1.0647059 0.04323405
## 2 0.30000000      1 0.5823529 0.5941176 0.04611601
## 3 0.07647059      2 0.2823529 0.2941176 0.03734719
## 4 0.05882353      3 0.2058824 0.2941176 0.03734719
## 5 0.02941176      4 0.1470588 0.1823529 0.03072098
## 6 0.01764706      5 0.1176471 0.1705882 0.02984406
## 7 0.01000000      6 0.1000000 0.1529412 0.02844277
## 
## Variable importance
## PEG LPR SCG STG STR 
##  58  24   7   7   3 
## 
## Node number 1: 258 observations,    complexity param=0.4176471
##   predicted class=2  expected loss=0.6589147  P(node) =1
##     class counts:    24    83    88    63
##    probabilities: 0.093 0.322 0.341 0.244 
##   left son=2 (115 obs) right son=3 (143 obs)
##   Primary splits:
##       PEG < -0.4781879  to the left,  improve=58.925770, (0 missing)
##       LPR < -0.4744433  to the right, improve= 9.581884, (0 missing)
##       STG < 2.110858    to the left,  improve= 5.754316, (0 missing)
##       STR < -1.394666   to the left,  improve= 3.552632, (0 missing)
##       SCG < 1.978302    to the left,  improve= 3.174797, (0 missing)
##   Surrogate splits:
##       LPR < -0.111698   to the right, agree=0.678, adj=0.278, (0 split)
##       STG < -0.4429864  to the left,  agree=0.612, adj=0.130, (0 split)
##       STR < -0.4188175  to the left,  agree=0.589, adj=0.078, (0 split)
##       SCG < -1.159048   to the left,  agree=0.581, adj=0.061, (0 split)
## 
## Node number 2: 115 observations,    complexity param=0.07647059
##   predicted class=1  expected loss=0.2956522  P(node) =0.4457364
##     class counts:    24    81    10     0
##    probabilities: 0.209 0.704 0.087 0.000 
##   left son=4 (27 obs) right son=5 (88 obs)
##   Primary splits:
##       PEG < -1.306914   to the left,  improve=17.2446500, (0 missing)
##       LPR < 1.440045    to the left,  improve= 4.5642740, (0 missing)
##       SCG < -0.6164983  to the left,  improve= 4.2460360, (0 missing)
##       STR < 1.004295    to the left,  improve= 1.7821750, (0 missing)
##       STG < -0.1861752  to the left,  improve= 0.5858356, (0 missing)
##   Surrogate splits:
##       STG < -1.503522   to the left,  agree=0.800, adj=0.148, (0 split)
##       SCG < -1.324171   to the left,  agree=0.791, adj=0.111, (0 split)
##       LPR < -1.643289   to the left,  agree=0.783, adj=0.074, (0 split)
## 
## Node number 3: 143 observations,    complexity param=0.3
##   predicted class=2  expected loss=0.4545455  P(node) =0.5542636
##     class counts:     0     2    78    63
##    probabilities: 0.000 0.014 0.545 0.441 
##   left son=6 (90 obs) right son=7 (53 obs)
##   Primary splits:
##       PEG < 0.8481662   to the left,  improve=47.975730, (0 missing)
##       LPR < -0.4744433  to the left,  improve=13.154490, (0 missing)
##       STG < 2.110858    to the left,  improve= 4.494858, (0 missing)
##       SCG < 2.072659    to the left,  improve= 2.471749, (0 missing)
##       STR < -1.394666   to the right, improve= 1.413753, (0 missing)
##   Surrogate splits:
##       STG < 2.229752    to the left,  agree=0.671, adj=0.113, (0 split)
##       SCG < 2.143426    to the left,  agree=0.657, adj=0.075, (0 split)
##       LPR < -0.4744433  to the left,  agree=0.657, adj=0.075, (0 split)
##       STR < -1.394666   to the right, agree=0.650, adj=0.057, (0 split)
## 
## Node number 4: 27 observations,    complexity param=0.02941176
##   predicted class=0  expected loss=0.2592593  P(node) =0.1046512
##     class counts:    20     7     0     0
##    probabilities: 0.741 0.259 0.000 0.000 
##   left son=8 (18 obs) right son=9 (9 obs)
##   Primary splits:
##       LPR < 0.75486     to the left,  improve=7.2592590, (0 missing)
##       SCG < -0.3334292  to the left,  improve=1.8409590, (0 missing)
##       STG < -1.277623   to the right, improve=1.3177390, (0 missing)
##       PEG < -1.424464   to the left,  improve=0.5571836, (0 missing)
##       STR < -0.70344    to the left,  improve=0.2560847, (0 missing)
##   Surrogate splits:
##       SCG < 0.4450111   to the left,  agree=0.815, adj=0.444, (0 split)
## 
## Node number 5: 88 observations,    complexity param=0.01764706
##   predicted class=1  expected loss=0.1590909  P(node) =0.3410853
##     class counts:     4    74    10     0
##    probabilities: 0.045 0.841 0.114 0.000 
##   left son=10 (81 obs) right son=11 (7 obs)
##   Primary splits:
##       LPR < 1.440045    to the left,  improve=5.1035750, (0 missing)
##       PEG < -0.8367149  to the left,  improve=0.9955754, (0 missing)
##       SCG < 0.7516694   to the left,  improve=0.8542403, (0 missing)
##       STR < -1.597968   to the left,  improve=0.6591310, (0 missing)
##       STG < 1.112148    to the left,  improve=0.3628347, (0 missing)
## 
## Node number 6: 90 observations,    complexity param=0.05882353
##   predicted class=2  expected loss=0.1444444  P(node) =0.3488372
##     class counts:     0     2    77    11
##    probabilities: 0.000 0.022 0.856 0.122 
##   left son=12 (80 obs) right son=13 (10 obs)
##   Primary splits:
##       LPR < 1.681876    to the left,  improve=16.9083300, (0 missing)
##       SCG < 1.364986    to the left,  improve= 1.9301370, (0 missing)
##       STG < -0.3526269  to the left,  improve= 0.9333333, (0 missing)
##       STR < 0.8009934   to the left,  improve= 0.7213141, (0 missing)
##       PEG < 0.005725619 to the left,  improve= 0.5583333, (0 missing)
##   Surrogate splits:
##       SCG < 2.072659    to the left,  agree=0.911, adj=0.2, (0 split)
## 
## Node number 7: 53 observations
##   predicted class=3  expected loss=0.01886792  P(node) =0.2054264
##     class counts:     0     0     1    52
##    probabilities: 0.000 0.000 0.019 0.981 
## 
## Node number 8: 18 observations
##   predicted class=0  expected loss=0  P(node) =0.06976744
##     class counts:    18     0     0     0
##    probabilities: 1.000 0.000 0.000 0.000 
## 
## Node number 9: 9 observations
##   predicted class=1  expected loss=0.2222222  P(node) =0.03488372
##     class counts:     2     7     0     0
##    probabilities: 0.222 0.778 0.000 0.000 
## 
## Node number 10: 81 observations
##   predicted class=1  expected loss=0.1111111  P(node) =0.3139535
##     class counts:     4    72     5     0
##    probabilities: 0.049 0.889 0.062 0.000 
## 
## Node number 11: 7 observations
##   predicted class=2  expected loss=0.2857143  P(node) =0.02713178
##     class counts:     0     2     5     0
##    probabilities: 0.000 0.286 0.714 0.000 
## 
## Node number 12: 80 observations
##   predicted class=2  expected loss=0.0375  P(node) =0.3100775
##     class counts:     0     2    77     1
##    probabilities: 0.000 0.025 0.962 0.012 
## 
## Node number 13: 10 observations
##   predicted class=3  expected loss=0  P(node) =0.03875969
##     class counts:     0     0     0    10
##    probabilities: 0.000 0.000 0.000 1.000

the summary of the fit explains how the decision tree is split at every node based on the mathematical calculations made at every step.

Plotting the decision tree using rpart.plot() command

library(rpart.plot)
# Draw the fitted tree, then add a title and a one-entry legend on top of it
rpart.plot(x = fit)
title(main = "DECISION TREE GENERATED")
legend(x = 0.2, y = 0.92, legend = "20MID0144")

Prediction for test data

# Predicted class label (not class probabilities) for every test row
predict_unseen <- predict(fit, test_set, type = "class")

Creating confusion matrix, and calculating the accuracy

# Actual labels (rows) against the decision-tree predictions (columns)
cm <- table(test_set[["UNS"]], predict_unseen)
cm
##    predict_unseen
##      0  1  2  3
##   0  0  0  0  0
##   1  0 27 19  0
##   2  0  2 32  0
##   3  0  0  1 38
# Accuracy in percent: diagonal of the confusion matrix over its total
sum(diag(cm)) / sum(cm) * 100
## [1] 81.51261

we can see that the above model gives an accuracy of ~ 81.51%

Result :

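Thus, from the accuracy values obtained from the different models, we can conclude that the model based on the MULTI-VARIATE LOGISTIC REGRESSION algorithm gives the best results (90.76%) in this case. Note that the confusion matrices above total 119 of the 145 test observations, because test rows whose UNS label was encoded as NA are left out of the tables; the reported accuracies therefore cover only the Low, Middle and High classes.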