Exploration of the data.
### variables names
names(student)
## [1] "Name" "Age" "Gender"
## [4] "Admission_Test_Score" "High_School_Percentage" "City"
## [7] "Admission_Status"
### top 6 rows
head(student)
## # A tibble: 6 × 7
## Name Age Gender Admission_Test_Score High_School_Percentage City
## <fct> <dbl> <fct> <dbl> <dbl> <fct>
## 1 Shehroz 24 Female 50 68.9 Quetta
## 2 Waqar 21 Female 99 60.7 Karachi
## 3 Bushra 17 Male 89 NA Islamabad
## 4 Aliya 17 Male 55 85.3 Karachi
## 5 Bilal 20 Male 65 61.1 Lahore
## 6 Murtaza 23 Female NA NA Islamabad
## # ℹ 1 more variable: Admission_Status <fct>
### describing the data
describe(student)
## vars n mean sd median trimmed mad min max
## Name* 1 147 27.67 18.08 26.00 26.98 22.24 1 64.0
## Age 2 147 19.68 4.54 20.00 20.24 2.97 -1 24.0
## Gender* 3 147 1.44 0.50 1.00 1.42 0.00 1 2.0
## Admission_Test_Score 4 146 77.66 16.86 79.00 78.36 15.57 -5 150.0
## High_School_Percentage 5 146 75.68 17.37 77.54 76.72 17.41 -10 110.5
## City* 6 147 3.61 2.05 3.00 3.51 2.97 1 7.0
## Admission_Status* 7 147 1.48 0.50 1.00 1.48 0.00 1 2.0
## range skew kurtosis se
## Name* 63.0 0.26 -1.06 1.49
## Age 25.0 -3.08 11.80 0.37
## Gender* 1.0 0.26 -1.95 0.04
## Admission_Test_Score 155.0 -0.46 4.37 1.39
## High_School_Percentage 120.5 -1.32 4.54 1.44
## City* 6.0 0.22 -1.32 0.17
## Admission_Status* 1.0 0.07 -2.01 0.04
glimpse(student)
## Rows: 157
## Columns: 7
## $ Name <fct> Shehroz, Waqar, Bushra, Aliya, Bilal, Murtaza, …
## $ Age <dbl> 24, 21, 17, 17, 20, 23, 18, 20, 17, 18, 17, 18,…
## $ Gender <fct> Female, Female, Male, Male, Male, Female, Male,…
## $ Admission_Test_Score <dbl> 50, 99, 89, 55, 65, NA, NA, 82, 64, 53, 78, 89,…
## $ High_School_Percentage <dbl> 68.90, 60.73, NA, 85.29, 61.13, NA, 97.31, 55.6…
## $ City <fct> Quetta, Karachi, Islamabad, Karachi, Lahore, Is…
## $ Admission_Status <fct> Rejected, NA, Accepted, Rejected, NA, Accepted,…
summary(student)
## Name Age Gender Admission_Test_Score
## Shehroz: 6 Min. :-1.00 Female:83 Min. : -5.00
## Aliya : 5 1st Qu.:18.00 Male :64 1st Qu.: 68.25
## Asad : 5 Median :20.00 NA's :10 Median : 79.00
## Rohail : 5 Mean :19.68 Mean : 77.66
## Maryam : 5 3rd Qu.:22.00 3rd Qu.: 89.00
## (Other):121 Max. :24.00 Max. :150.00
## NA's : 10 NA's :10 NA's :11
## High_School_Percentage City Admission_Status
## Min. :-10.00 Quetta :30 Rejected:76
## 1st Qu.: 65.05 Karachi :28 Accepted:71
## Median : 77.55 Multan :21 NA's :10
## Mean : 75.68 Peshawar :18
## 3rd Qu.: 88.31 Islamabad:17
## Max. :110.50 (Other) :33
## NA's :11 NA's :10
student %>% keep(is.factor) %>% summary()
## Name Gender City Admission_Status
## Shehroz: 6 Female:83 Quetta :30 Rejected:76
## Aliya : 5 Male :64 Karachi :28 Accepted:71
## Asad : 5 NA's :10 Multan :21 NA's :10
## Rohail : 5 Peshawar :18
## Maryam : 5 Islamabad:17
## (Other):121 (Other) :33
## NA's : 10 NA's :10
student %>% keep(is.numeric) %>% summary()
## Age Admission_Test_Score High_School_Percentage
## Min. :-1.00 Min. : -5.00 Min. :-10.00
## 1st Qu.:18.00 1st Qu.: 68.25 1st Qu.: 65.05
## Median :20.00 Median : 79.00 Median : 77.55
## Mean :19.68 Mean : 77.66 Mean : 75.68
## 3rd Qu.:22.00 3rd Qu.: 89.00 3rd Qu.: 88.31
## Max. :24.00 Max. :150.00 Max. :110.50
## NA's :10 NA's :11 NA's :11
### structure of the data
str(student)
## spc_tbl_ [157 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Name : Factor w/ 64 levels "Shehroz","Waqar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : num [1:157] 24 21 17 17 20 23 18 20 17 18 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 1 2 1 2 2 ...
## $ Admission_Test_Score : num [1:157] 50 99 89 55 65 NA NA 82 64 53 ...
## $ High_School_Percentage: num [1:157] 68.9 60.7 NA 85.3 61.1 ...
## $ City : Factor w/ 7 levels "Quetta","Karachi",..: 1 2 3 2 4 3 5 4 2 5 ...
## $ Admission_Status : Factor w/ 2 levels "Rejected","Accepted": 1 NA 2 1 NA 2 2 2 2 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Name = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Age = col_number(),
## .. Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Admission_Test_Score = col_number(),
## .. High_School_Percentage = col_number(),
## .. City = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Admission_Status = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
## .. )
## - attr(*, "problems")=<externalptr>
### missing data according to variables
sapply(student, function(x) sum(is.na(x)))
## Name Age Gender
## 10 10 10
## Admission_Test_Score High_School_Percentage City
## 11 11 10
## Admission_Status
## 10
## total number of missing data
sum(is.na(student))
## [1] 72
### dropping NAs from one variable
student <- student %>% drop_na(Admission_Status)
### converting to a dataframe
student <- as.data.frame(student)
### Replacing NAs with mean, median and UNK
student <- student %>% mutate(Age = ifelse(is.na(Age), mean(Age, na.rm = TRUE), Age)) %>%
mutate(Admission_Test_Score = ifelse(is.na(Admission_Test_Score), median(Admission_Test_Score, na.rm = TRUE), Admission_Test_Score)) %>%
mutate(High_School_Percentage = ifelse(is.na(High_School_Percentage), mean(High_School_Percentage, na.rm = TRUE),High_School_Percentage)) %>%
mutate(Gender = ifelse(is.na(Gender), "UNK", Gender)) %>% mutate( City = ifelse(is.na(City), "UNK", City))
student$Age <- ifelse(student$Age < 0, mean(student$Age, na.rm = TRUE), student$Age)
student$High_School_Percentage <- ifelse(student$High_School_Percentage > 100, mean(student$High_School_Percentage, na.rm = TRUE), student$High_School_Percentage)
summary(student)
## Name Age Gender Admission_Test_Score
## Shehroz: 5 Min. :17.00 Length:147 Min. : -5.0
## Asad : 5 1st Qu.:18.50 Class :character 1st Qu.: 70.5
## Maryam : 5 Median :20.00 Mode :character Median : 79.0
## Aliya : 4 Mean :20.35 Mean : 77.5
## Rohail : 4 3rd Qu.:22.00 3rd Qu.: 89.0
## (Other):116 Max. :24.00 Max. :150.0
## NA's : 8
## High_School_Percentage City Admission_Status
## Min. :-10.00 Length:147 Rejected:76
## 1st Qu.: 67.37 Class :character Accepted:71
## Median : 76.70 Mode :character
## Mean : 76.47
## 3rd Qu.: 87.03
## Max. : 99.80
##
### checking for missing data after replacing them
sapply(student, function(x) sum(is.na(x)))
## Name Age Gender
## 8 0 0
## Admission_Test_Score High_School_Percentage City
## 0 0 0
## Admission_Status
## 0
## converting to male and female
student$Gender <- fct_recode(student$Gender, "Male" = "2", "Female" = "1")
### bar chart for gender
student %>% ggplot(aes(Gender, fill = Gender)) + geom_bar()

table(student$Gender)
##
## Female Male UNK
## 81 57 9
### boxplot for age and gender
student %>% ggplot(aes(Age, fill = Gender)) + geom_boxplot() + coord_flip()

student %>% ggplot(aes(City, Age, fill = Gender)) + geom_boxplot()

###Admission_Test_Score vs gender bar chart
student %>% ggplot(aes(Admission_Test_Score, fill = Gender)) + geom_bar( )

### histogram for Admission_Test_Score
student %>% ggplot(aes(Admission_Test_Score, colour = Gender)) + geom_freqpoly(bin = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### scatter plot for Admission_Test_Score and High_School_Percentage
student %>% ggplot(aes(Admission_Test_Score, High_School_Percentage, fill = Gender)) + geom_point( )

###### applying logistic regression
### removing the name column
student <- student %>% select(-Name)
## converting to 1 and 2 for female and male
student$Gender <- fct_recode(student$Gender, "2" = "Male", "1" = "Female")
### splitting the data
set.seed(1234)
sample_index <- sample(nrow(student), round(nrow(student) * .75), replace = FALSE)
student_train <- student[sample_index, ]
student_test <- student[-sample_index, ]
### Checking for class imbalance
round(prop.table(table(select(student, Admission_Status), exclude = NULL)), 4) * 100
## Admission_Status
## Rejected Accepted
## 51.7 48.3
round(prop.table(table(select(student_train, Admission_Status), exclude = NULL)), 4) * 100
## Admission_Status
## Rejected Accepted
## 48.18 51.82
round(prop.table(table(select(student_test, Admission_Status), exclude = NULL)), 4) * 100
## Admission_Status
## Rejected Accepted
## 62.16 37.84
### Building the model
student_mod <- glm(data = student_train, family = "binomial", formula = Admission_Status ~ . )
summary(student_mod)
##
## Call:
## glm(formula = Admission_Status ~ ., family = "binomial", data = student_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.866443 2.181445 -1.314 0.1888
## Age 0.072819 0.094419 0.771 0.4406
## Gender2 -0.252830 0.475087 -0.532 0.5946
## GenderUNK 0.084068 0.769832 0.109 0.9130
## Admission_Test_Score 0.001663 0.012513 0.133 0.8943
## High_School_Percentage 0.015365 0.015597 0.985 0.3246
## City2 -0.437965 0.681423 -0.643 0.5204
## City3 0.148759 0.752925 0.198 0.8434
## City4 1.868690 0.919234 2.033 0.0421 *
## City5 0.510918 0.748648 0.682 0.4950
## City6 -0.154829 0.794032 -0.195 0.8454
## City7 0.754033 0.819784 0.920 0.3577
## CityUNK 0.470278 0.911283 0.516 0.6058
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 152.35 on 109 degrees of freedom
## Residual deviance: 141.09 on 97 degrees of freedom
## AIC: 167.09
##
## Number of Fisher Scoring iterations: 4
### computing McFadden's R2
pscl::pR2(student_mod)["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.07387505
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
varImp(student_mod)
## Overall
## Age 0.7712309
## Gender2 0.5321752
## GenderUNK 0.1092032
## Admission_Test_Score 0.1328828
## High_School_Percentage 0.9851054
## City2 0.6427214
## City3 0.1975743
## City4 2.0328764
## City5 0.6824545
## City6 0.1949912
## City7 0.9197943
## CityUNK 0.5160614
vif(student_mod)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.158588 1 1.076377
## Gender 1.370164 2 1.081915
## Admission_Test_Score 1.104348 1 1.050880
## High_School_Percentage 1.268615 1 1.126328
## City 1.481036 7 1.028450
test_student_pred <- predict(student_mod, student_test, type = "response")
### applying cutoff for the model
test_student_pred <- ifelse(test_student_pred >= 0.5, 1, 0)
test_student_table <- table(student_test$Admission_Status, test_student_pred)
### checking for model accuracy
sum(diag(test_student_table))/ nrow(student_test)
## [1] 0.4864865
### Applying K-NN algorithm to the same data
library(fastDummies)
summary(student)
## Age Gender Admission_Test_Score High_School_Percentage
## Min. :17.00 1 :81 Min. : -5.0 Min. :-10.00
## 1st Qu.:18.50 2 :57 1st Qu.: 70.5 1st Qu.: 67.37
## Median :20.00 UNK: 9 Median : 79.0 Median : 76.70
## Mean :20.35 Mean : 77.5 Mean : 76.47
## 3rd Qu.:22.00 3rd Qu.: 89.0 3rd Qu.: 87.03
## Max. :24.00 Max. :150.0 Max. : 99.80
## City Admission_Status
## Length:147 Rejected:76
## Class :character Accepted:71
## Mode :character
##
##
##
### normalizing the numeric variable
normalize <- function(x){
return((x - min(x))/ (max(x) - min(x)))
}
student$Age <- normalize(student$Age)
student$Admission_Test_Score <- normalize(student$Admission_Test_Score)
student$High_School_Percentage <- normalize(student$High_School_Percentage)
## changing to dataframe
student <- as.data.frame(student)
### selecting the response variable to label
student_label <- student %>% select(Admission_Status)
### building the model without the response variable
student <- student %>% select(-Admission_Status)
### creating dummies variable
student <- dummy_cols(student)
### removing duplicate dummies
student <- student %>% select(-City, -Gender)
### checking the dimension
dim(student)
## [1] 147 14
names(student)
## [1] "Age" "Admission_Test_Score" "High_School_Percentage"
## [4] "Gender_1" "Gender_2" "Gender_UNK"
## [7] "City_1" "City_2" "City_3"
## [10] "City_4" "City_5" "City_6"
## [13] "City_7" "City_UNK"
### Splitting the data
set.seed(1234)
sample_stud <- sample(nrow(student), round(nrow(student)* .75), replace = FALSE)
stud_train <- student[sample_stud, ]
stud_test <- student[-sample_stud, ]
### selecting the response variables
student_train_label <- as.factor(student_label[sample_stud, ])
student_test_label <- as.factor(student_label[-sample_stud, ])
### Building the model with k = 11
library(class)
addmin_pred <- knn(train =stud_train, test = stud_test, cl = student_train_label, k = 7)
confusionMatrix(addmin_pred, student_test_label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 15 7
## Accepted 8 7
##
## Accuracy : 0.5946
## 95% CI : (0.421, 0.7525)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.6977
##
## Kappa : 0.1501
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.6522
## Specificity : 0.5000
## Pos Pred Value : 0.6818
## Neg Pred Value : 0.4667
## Prevalence : 0.6216
## Detection Rate : 0.4054
## Detection Prevalence : 0.5946
## Balanced Accuracy : 0.5761
##
## 'Positive' Class : Rejected
##
## To evaluate the model performance
#install.packages("gmodels")
library(gmodels)
CrossTable(x= student_test_label, y =addmin_pred , prop.chisq = FALSE )
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 15 | 8 | 23 |
## | 0.652 | 0.348 | 0.622 |
## | 0.682 | 0.533 | |
## | 0.405 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 7 | 7 | 14 |
## | 0.500 | 0.500 | 0.378 |
## | 0.318 | 0.467 | |
## | 0.189 | 0.189 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 22 | 15 | 37 |
## | 0.595 | 0.405 | |
## -------------------|-----------|-----------|-----------|
##
##
#### testing for various values of k
k_values <- c(1, 3, 5, 7, 9, 11, 12, 15, 25, 26, 27, 29)
for( k_val in k_values){
addmin_pred2 <- knn(train =stud_train, test = stud_test, cl = student_train_label, k = k_val)
CrossTable(x= student_test_label, y =addmin_pred2, prop.chisq = FALSE)
}
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 10 | 13 | 23 |
## | 0.435 | 0.565 | 0.622 |
## | 0.625 | 0.619 | |
## | 0.270 | 0.351 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 6 | 8 | 14 |
## | 0.429 | 0.571 | 0.378 |
## | 0.375 | 0.381 | |
## | 0.162 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 16 | 21 | 37 |
## | 0.432 | 0.568 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 11 | 12 | 23 |
## | 0.478 | 0.522 | 0.622 |
## | 0.611 | 0.632 | |
## | 0.297 | 0.324 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 7 | 7 | 14 |
## | 0.500 | 0.500 | 0.378 |
## | 0.389 | 0.368 | |
## | 0.189 | 0.189 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 18 | 19 | 37 |
## | 0.486 | 0.514 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 10 | 13 | 23 |
## | 0.435 | 0.565 | 0.622 |
## | 0.588 | 0.650 | |
## | 0.270 | 0.351 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 7 | 7 | 14 |
## | 0.500 | 0.500 | 0.378 |
## | 0.412 | 0.350 | |
## | 0.189 | 0.189 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 17 | 20 | 37 |
## | 0.459 | 0.541 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 15 | 8 | 23 |
## | 0.652 | 0.348 | 0.622 |
## | 0.682 | 0.533 | |
## | 0.405 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 7 | 7 | 14 |
## | 0.500 | 0.500 | 0.378 |
## | 0.318 | 0.467 | |
## | 0.189 | 0.189 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 22 | 15 | 37 |
## | 0.595 | 0.405 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 12 | 11 | 23 |
## | 0.522 | 0.478 | 0.622 |
## | 0.667 | 0.579 | |
## | 0.324 | 0.297 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 6 | 8 | 14 |
## | 0.429 | 0.571 | 0.378 |
## | 0.333 | 0.421 | |
## | 0.162 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 18 | 19 | 37 |
## | 0.486 | 0.514 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 12 | 11 | 23 |
## | 0.522 | 0.478 | 0.622 |
## | 0.706 | 0.550 | |
## | 0.324 | 0.297 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 5 | 9 | 14 |
## | 0.357 | 0.643 | 0.378 |
## | 0.294 | 0.450 | |
## | 0.135 | 0.243 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 17 | 20 | 37 |
## | 0.459 | 0.541 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 10 | 13 | 23 |
## | 0.435 | 0.565 | 0.622 |
## | 0.625 | 0.619 | |
## | 0.270 | 0.351 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 6 | 8 | 14 |
## | 0.429 | 0.571 | 0.378 |
## | 0.375 | 0.381 | |
## | 0.162 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 16 | 21 | 37 |
## | 0.432 | 0.568 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 8 | 15 | 23 |
## | 0.348 | 0.652 | 0.622 |
## | 0.667 | 0.600 | |
## | 0.216 | 0.405 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 4 | 10 | 14 |
## | 0.286 | 0.714 | 0.378 |
## | 0.333 | 0.400 | |
## | 0.108 | 0.270 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 12 | 25 | 37 |
## | 0.324 | 0.676 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 4 | 19 | 23 |
## | 0.174 | 0.826 | 0.622 |
## | 0.444 | 0.679 | |
## | 0.108 | 0.514 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 5 | 9 | 14 |
## | 0.357 | 0.643 | 0.378 |
## | 0.556 | 0.321 | |
## | 0.135 | 0.243 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 9 | 28 | 37 |
## | 0.243 | 0.757 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 6 | 17 | 23 |
## | 0.261 | 0.739 | 0.622 |
## | 0.400 | 0.773 | |
## | 0.162 | 0.459 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 9 | 5 | 14 |
## | 0.643 | 0.357 | 0.378 |
## | 0.600 | 0.227 | |
## | 0.243 | 0.135 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 15 | 22 | 37 |
## | 0.405 | 0.595 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 5 | 18 | 23 |
## | 0.217 | 0.783 | 0.622 |
## | 0.455 | 0.692 | |
## | 0.135 | 0.486 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 6 | 8 | 14 |
## | 0.429 | 0.571 | 0.378 |
## | 0.545 | 0.308 | |
## | 0.162 | 0.216 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 11 | 26 | 37 |
## | 0.297 | 0.703 | |
## -------------------|-----------|-----------|-----------|
##
##
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 37
##
##
## | addmin_pred2
## student_test_label | Rejected | Accepted | Row Total |
## -------------------|-----------|-----------|-----------|
## Rejected | 4 | 19 | 23 |
## | 0.174 | 0.826 | 0.622 |
## | 0.364 | 0.731 | |
## | 0.108 | 0.514 | |
## -------------------|-----------|-----------|-----------|
## Accepted | 7 | 7 | 14 |
## | 0.500 | 0.500 | 0.378 |
## | 0.636 | 0.269 | |
## | 0.189 | 0.189 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 11 | 26 | 37 |
## | 0.297 | 0.703 | |
## -------------------|-----------|-----------|-----------|
##
##
### A graphical display of various values of k
#### Defining a range of k values
k_selection <- c(1, 3, 5, 7, 9, 11, 12, 15, 25, 26, 27, 29)
# Initialize a list to store confusion matrices
k_results <- list()
### Loop through k values and store the results
for (k in k_selection) {
addmin_pred3 <- knn(train = stud_train, test = stud_test, cl = student_train_label, k = k)
k_results[[paste0("k = ", k)]] <- confusionMatrix(addmin_pred3, student_test_label)
}
# Print the results
k_results
## $`k = 1`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 10 6
## Accepted 13 8
##
## Accuracy : 0.4865
## 95% CI : (0.3192, 0.656)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.9673
##
## Kappa : 0.0057
##
## Mcnemar's Test P-Value : 0.1687
##
## Sensitivity : 0.4348
## Specificity : 0.5714
## Pos Pred Value : 0.6250
## Neg Pred Value : 0.3810
## Prevalence : 0.6216
## Detection Rate : 0.2703
## Detection Prevalence : 0.4324
## Balanced Accuracy : 0.5031
##
## 'Positive' Class : Rejected
##
##
## $`k = 3`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 11 7
## Accepted 12 7
##
## Accuracy : 0.4865
## 95% CI : (0.3192, 0.656)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.9673
##
## Kappa : -0.0203
##
## Mcnemar's Test P-Value : 0.3588
##
## Sensitivity : 0.4783
## Specificity : 0.5000
## Pos Pred Value : 0.6111
## Neg Pred Value : 0.3684
## Prevalence : 0.6216
## Detection Rate : 0.2973
## Detection Prevalence : 0.4865
## Balanced Accuracy : 0.4891
##
## 'Positive' Class : Rejected
##
##
## $`k = 5`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 10 7
## Accepted 13 7
##
## Accuracy : 0.4595
## 95% CI : (0.2949, 0.6308)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.9850
##
## Kappa : -0.0602
##
## Mcnemar's Test P-Value : 0.2636
##
## Sensitivity : 0.4348
## Specificity : 0.5000
## Pos Pred Value : 0.5882
## Neg Pred Value : 0.3500
## Prevalence : 0.6216
## Detection Rate : 0.2703
## Detection Prevalence : 0.4595
## Balanced Accuracy : 0.4674
##
## 'Positive' Class : Rejected
##
##
## $`k = 7`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 15 7
## Accepted 8 7
##
## Accuracy : 0.5946
## 95% CI : (0.421, 0.7525)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.6977
##
## Kappa : 0.1501
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.6522
## Specificity : 0.5000
## Pos Pred Value : 0.6818
## Neg Pred Value : 0.4667
## Prevalence : 0.6216
## Detection Rate : 0.4054
## Detection Prevalence : 0.5946
## Balanced Accuracy : 0.5761
##
## 'Positive' Class : Rejected
##
##
## $`k = 9`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 12 6
## Accepted 11 8
##
## Accuracy : 0.5405
## 95% CI : (0.3692, 0.7051)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.8815
##
## Kappa : 0.0871
##
## Mcnemar's Test P-Value : 0.3320
##
## Sensitivity : 0.5217
## Specificity : 0.5714
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.4211
## Prevalence : 0.6216
## Detection Rate : 0.3243
## Detection Prevalence : 0.4865
## Balanced Accuracy : 0.5466
##
## 'Positive' Class : Rejected
##
##
## $`k = 11`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 12 4
## Accepted 11 10
##
## Accuracy : 0.5946
## 95% CI : (0.421, 0.7525)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.6977
##
## Kappa : 0.215
##
## Mcnemar's Test P-Value : 0.1213
##
## Sensitivity : 0.5217
## Specificity : 0.7143
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.4762
## Prevalence : 0.6216
## Detection Rate : 0.3243
## Detection Prevalence : 0.4324
## Balanced Accuracy : 0.6180
##
## 'Positive' Class : Rejected
##
##
## $`k = 12`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 11 5
## Accepted 12 9
##
## Accuracy : 0.5405
## 95% CI : (0.3692, 0.7051)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.8815
##
## Kappa : 0.1103
##
## Mcnemar's Test P-Value : 0.1456
##
## Sensitivity : 0.4783
## Specificity : 0.6429
## Pos Pred Value : 0.6875
## Neg Pred Value : 0.4286
## Prevalence : 0.6216
## Detection Rate : 0.2973
## Detection Prevalence : 0.4324
## Balanced Accuracy : 0.5606
##
## 'Positive' Class : Rejected
##
##
## $`k = 15`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 8 4
## Accepted 15 10
##
## Accuracy : 0.4865
## 95% CI : (0.3192, 0.656)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.96725
##
## Kappa : 0.0538
##
## Mcnemar's Test P-Value : 0.02178
##
## Sensitivity : 0.3478
## Specificity : 0.7143
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.4000
## Prevalence : 0.6216
## Detection Rate : 0.2162
## Detection Prevalence : 0.3243
## Balanced Accuracy : 0.5311
##
## 'Positive' Class : Rejected
##
##
## $`k = 25`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 4 5
## Accepted 19 9
##
## Accuracy : 0.3514
## 95% CI : (0.2021, 0.5254)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.999763
##
## Kappa : -0.1532
##
## Mcnemar's Test P-Value : 0.007963
##
## Sensitivity : 0.1739
## Specificity : 0.6429
## Pos Pred Value : 0.4444
## Neg Pred Value : 0.3214
## Prevalence : 0.6216
## Detection Rate : 0.1081
## Detection Prevalence : 0.2432
## Balanced Accuracy : 0.4084
##
## 'Positive' Class : Rejected
##
##
## $`k = 26`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 5 6
## Accepted 18 8
##
## Accuracy : 0.3514
## 95% CI : (0.2021, 0.5254)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.99976
##
## Kappa : -0.1809
##
## Mcnemar's Test P-Value : 0.02474
##
## Sensitivity : 0.2174
## Specificity : 0.5714
## Pos Pred Value : 0.4545
## Neg Pred Value : 0.3077
## Prevalence : 0.6216
## Detection Rate : 0.1351
## Detection Prevalence : 0.2973
## Balanced Accuracy : 0.3944
##
## 'Positive' Class : Rejected
##
##
## $`k = 27`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 5 6
## Accepted 18 8
##
## Accuracy : 0.3514
## 95% CI : (0.2021, 0.5254)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.99976
##
## Kappa : -0.1809
##
## Mcnemar's Test P-Value : 0.02474
##
## Sensitivity : 0.2174
## Specificity : 0.5714
## Pos Pred Value : 0.4545
## Neg Pred Value : 0.3077
## Prevalence : 0.6216
## Detection Rate : 0.1351
## Detection Prevalence : 0.2973
## Balanced Accuracy : 0.3944
##
## 'Positive' Class : Rejected
##
##
## $`k = 29`
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rejected Accepted
## Rejected 4 7
## Accepted 19 7
##
## Accuracy : 0.2973
## 95% CI : (0.1587, 0.4698)
## No Information Rate : 0.6216
## P-Value [Acc > NIR] : 0.99998
##
## Kappa : -0.2793
##
## Mcnemar's Test P-Value : 0.03098
##
## Sensitivity : 0.1739
## Specificity : 0.5000
## Pos Pred Value : 0.3636
## Neg Pred Value : 0.2692
## Prevalence : 0.6216
## Detection Rate : 0.1081
## Detection Prevalence : 0.2973
## Balanced Accuracy : 0.3370
##
## 'Positive' Class : Rejected
##
### Extract accuracy for each k
k_accuracy <- sapply(k_results, function(x) x$overall['Accuracy'])
### Create a data frame for plotting
accuracy_dataframe <- data.frame(k = k_selection, Accuracy = k_accuracy)
### Plot the k_accuracy
ggplot(accuracy_dataframe, aes(x = k, y = Accuracy)) +
geom_line(colour = "blue") +
geom_point(color = "purple", size = 5) +
labs(title = "K-NN Accuracy vs. k selections", x = "k selections", y = "Accuracy") +
theme_grey()
