1.1 Import Library
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
1.2 Load Dataset
data <- read_csv("C:/Users/asus/Downloads/Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1.3 Preprocessing Data
Mengubah variabel target menjadi faktor
data$`Sleep Disorder` <- as.factor(data$`Sleep Disorder`)
1.4 EDA
table(data$`Sleep Disorder`)
##
## Insomnia None Sleep Apnea
## 77 219 78
ggplot(data, aes(x = `Sleep Disorder`, fill = `Sleep Disorder`)) +
geom_bar() +
labs(title = "Distribusi Sleep Disorder")
1.5 Regresi Logistik Multinomial
Pisahkan data training dan testing
set.seed(123)
train_idx <- createDataPartition(data$`Sleep Disorder`, p = 0.8, list = FALSE)
train_data <- data[train_idx, ]
test_data <- data[-train_idx, ]
Model Multinomial Logistic Regression :
model <- multinom(`Sleep Disorder` ~ Age + `BMI Category` + Gender +
`Physical Activity Level` + `Stress Level` + Occupation,
data = train_data)
## # weights: 57 (36 variable)
## initial value 330.682299
## iter 10 value 128.444667
## iter 20 value 95.749593
## iter 30 value 93.483759
## iter 40 value 93.406055
## iter 50 value 93.359488
## iter 60 value 93.340710
## iter 70 value 93.339300
## final value 93.339269
## converged
summary(model)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = `Sleep Disorder` ~ Age + `BMI Category` +
## Gender + `Physical Activity Level` + `Stress Level` + Occupation,
## data = train_data)
##
## Coefficients:
## (Intercept) Age `BMI Category`Normal Weight
## None 21.27266 -0.3206465 50.44546
## Sleep Apnea -16.73306 -0.2733248 47.93954
## `BMI Category`Obese `BMI Category`Overweight GenderMale
## None -32.547977 -2.189095 -1.2929854
## Sleep Apnea 1.437238 -2.582941 -0.4590417
## `Physical Activity Level` `Stress Level` OccupationDoctor
## None -0.02260746 -1.2576253 4.103668
## Sleep Apnea -0.01329249 -0.7969681 32.832670
## OccupationEngineer OccupationLawyer OccupationManager
## None 4.015599 3.993413 31.553304
## Sleep Apnea 33.082660 33.022250 -1.400954
## OccupationNurse OccupationSales Representative
## None 8.530745 -1.465919
## Sleep Apnea 43.826985 43.409017
## OccupationSalesperson OccupationScientist
## None 2.75096 13.92826
## Sleep Apnea 34.61717 46.55502
## OccupationSoftware Engineer OccupationTeacher
## None 15.19135 -0.4899114
## Sleep Apnea -16.99439 32.1725023
##
## Std. Errors:
## (Intercept) Age `BMI Category`Normal Weight
## None 7.162474 0.1515284 1.1021
## Sleep Apnea 7.389806 0.1497668 1.1021
## `BMI Category`Obese `BMI Category`Overweight GenderMale
## None 4.452753e-06 1.43082 2.184271
## Sleep Apnea 2.053363e+00 2.08964 2.196946
## `Physical Activity Level` `Stress Level` OccupationDoctor
## None 0.02762704 0.5116781 2.844714
## Sleep Apnea 0.03181008 0.5031277 2.265397
## OccupationEngineer OccupationLawyer OccupationManager
## None 2.639748 2.521204 5.205416e-11
## Sleep Apnea 1.984160 1.735338 8.474467e-28
## OccupationNurse OccupationSales Representative
## None 3.680151 NaN
## Sleep Apnea 3.604048 0.0001378813
## OccupationSalesperson OccupationScientist
## None 3.243630 1.159588
## Sleep Apnea 2.286402 1.159571
## OccupationSoftware Engineer OccupationTeacher
## None 7.462517e-06 1.684860
## Sleep Apnea 2.483138e-19 1.433694
##
## Residual Deviance: 186.6785
## AIC: 258.6785
1.6 Evaluasi dan Visualisasi
Prediksi data test
pred <- predict(model, newdata = test_data)
Confusion matrix
confusionMatrix(pred, test_data$`Sleep Disorder`)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insomnia None Sleep Apnea
## Insomnia 11 3 2
## None 3 39 1
## Sleep Apnea 1 1 12
##
## Overall Statistics
##
## Accuracy : 0.8493
## 95% CI : (0.7464, 0.9223)
## No Information Rate : 0.589
## P-Value [Acc > NIR] : 1.488e-06
##
## Kappa : 0.735
##
## Mcnemar's Test P-Value : 0.9536
##
## Statistics by Class:
##
## Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity 0.7333 0.9070 0.8000
## Specificity 0.9138 0.8667 0.9655
## Pos Pred Value 0.6875 0.9070 0.8571
## Neg Pred Value 0.9298 0.8667 0.9492
## Prevalence 0.2055 0.5890 0.2055
## Detection Rate 0.1507 0.5342 0.1644
## Detection Prevalence 0.2192 0.5890 0.1918
## Balanced Accuracy 0.8236 0.8868 0.8828
print(confusionMatrix)
## function (data, ...)
## {
## UseMethod("confusionMatrix")
## }
## <bytecode: 0x0000025f79fb72d8>
## <environment: namespace:caret>
Hitung Akurasi
accuracy <- mean(pred == test_data$`Sleep Disorder`)
paste("Akurasi Model:", round(accuracy * 100, 2), "%")
## [1] "Akurasi Model: 84.93 %"
Hasil Prediksi dan Aktual
hasil <- data.frame(Aktual = test_data$`Sleep Disorder`, Prediksi = pred)
Visualisasi Perbandingan
ggplot(hasil, aes(x = Aktual, fill = Prediksi)) +
geom_bar(position = "dodge") +
labs(title = "Perbandingan Sleep Disorder: Aktual vs Prediksi",
x = "Kelas Aktual", y = "Jumlah")
2.1 Import Library
library(readr)
library(dplyr)
library(ggplot2)
library(nnet)
library(caret)
library(MASS)
2.2 Load dataset
data <- read.csv("C:/Users/asus/Downloads/StudentsPerformance.csv")
2.3 Preprocessing Data
Membuat kategori dari math score
data$math_level <- cut(data$math.score,
breaks = c(-Inf, 60, 80, Inf),
labels = c("Low", "Medium", "High"),
right = TRUE,
ordered_result = TRUE)
2.4 EDA
library(ggplot2)
ggplot(data, aes(x = math_level)) +
geom_bar(fill = "steelblue") +
labs(title = "Distribusi Kategori Nilai Matematika",
x = "Kategori Nilai",
y = "Jumlah Siswa") +
theme_minimal()
2.5 Regresi Logistik Ordinal
data$gender <- as.factor(data$gender)
data$race.ethnicity <- as.factor(data$race.ethnicity)
data$parental.level.of.education <- as.factor(data$parental.level.of.education)
data$test.preparation.course <- as.factor(data$test.preparation.course)
model <- polr(math_level ~ gender + race.ethnicity + parental.level.of.education + test.preparation.course,
data = data,
Hess = TRUE)
summary(model)
## Call:
## polr(formula = math_level ~ gender + race.ethnicity + parental.level.of.education +
## test.preparation.course, data = data, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## gendermale 0.63813 0.1241 5.1440
## race.ethnicitygroup B 0.43154 0.2507 1.7217
## race.ethnicitygroup C 0.52424 0.2344 2.2368
## race.ethnicitygroup D 0.75433 0.2382 3.1674
## race.ethnicitygroup E 1.60809 0.2690 5.9777
## parental.level.of.educationbachelor's degree 0.26472 0.2194 1.2066
## parental.level.of.educationhigh school -0.55944 0.1924 -2.9076
## parental.level.of.educationmaster's degree 0.35442 0.2879 1.2312
## parental.level.of.educationsome college -0.05064 0.1831 -0.2766
## parental.level.of.educationsome high school -0.36654 0.1956 -1.8741
## test.preparation.coursenone -0.56106 0.1287 -4.3588
##
## Intercepts:
## Value Std. Error t value
## Low|Medium -0.2660 0.2678 -0.9933
## Medium|High 2.1656 0.2774 7.8075
##
## Residual Deviance: 1928.024
## AIC: 1954.024
2.6 Uji Signifikansi
Menghitung p-value
ctable <- coef(summary(model))
p_values <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p_values)
print(ctable)
## Value Std. Error t value
## gendermale 0.63812770 0.1240537 5.1439644
## race.ethnicitygroup B 0.43154373 0.2506542 1.7216699
## race.ethnicitygroup C 0.52423532 0.2343723 2.2367627
## race.ethnicitygroup D 0.75433076 0.2381546 3.1673991
## race.ethnicitygroup E 1.60809229 0.2690140 5.9777261
## parental.level.of.educationbachelor's degree 0.26472400 0.2194000 1.2065814
## parental.level.of.educationhigh school -0.55943525 0.1924076 -2.9075531
## parental.level.of.educationmaster's degree 0.35441899 0.2878712 1.2311721
## parental.level.of.educationsome college -0.05063872 0.1831013 -0.2765612
## parental.level.of.educationsome high school -0.36654003 0.1955800 -1.8741178
## test.preparation.coursenone -0.56106403 0.1287189 -4.3588312
## Low|Medium -0.26597226 0.2677642 -0.9933077
## Medium|High 2.16558686 0.2773738 7.8074664
## p value
## gendermale 2.690006e-07
## race.ethnicitygroup B 8.512934e-02
## race.ethnicitygroup C 2.530185e-02
## race.ethnicitygroup D 1.538091e-03
## race.ethnicitygroup E 2.262737e-09
## parental.level.of.educationbachelor's degree 2.275934e-01
## parental.level.of.educationhigh school 3.642685e-03
## parental.level.of.educationmaster's degree 2.182585e-01
## parental.level.of.educationsome college 7.821170e-01
## parental.level.of.educationsome high school 6.091419e-02
## test.preparation.coursenone 1.307589e-05
## Low|Medium 3.205600e-01
## Medium|High 5.834903e-15
2.7 Evaluasi Akurasi Model dan Visualisasi
Prediksi nilai kategori
pred <- predict(model, newdata = data)
mask <- !is.na(pred)
pred_clean <- factor(pred[mask], levels = levels(data$math_level))
actual_clean <- factor(as.character(data$math_level[mask]), levels = levels(data$math_level))
Confusion matrix
confusion <- confusionMatrix(pred_clean, actual_clean)
print(confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Medium High
## Low 112 87 13
## Medium 225 388 150
## High 2 10 13
##
## Overall Statistics
##
## Accuracy : 0.513
## 95% CI : (0.4815, 0.5444)
## No Information Rate : 0.485
## P-Value [Acc > NIR] : 0.04096
##
## Kappa : 0.1204
##
## Mcnemar's Test P-Value : < 2e-16
##
## Statistics by Class:
##
## Class: Low Class: Medium Class: High
## Sensitivity 0.3304 0.8000 0.07386
## Specificity 0.8487 0.2718 0.98544
## Pos Pred Value 0.5283 0.5085 0.52000
## Neg Pred Value 0.7119 0.5907 0.83282
## Prevalence 0.3390 0.4850 0.17600
## Detection Rate 0.1120 0.3880 0.01300
## Detection Prevalence 0.2120 0.7630 0.02500
## Balanced Accuracy 0.5895 0.5359 0.52965
Menghitung Akurasi
accuracy <- mean(pred_clean == actual_clean)
cat("Akurasi Model:", round(accuracy * 100, 2), "%\n")
## Akurasi Model: 51.3 %
Hasil Prediksi dan Aktual
hasil <- data.frame(Aktual = actual_clean, Prediksi = pred_clean)
Visualisasi Perbandingan
ggplot(hasil, aes(x = Aktual, fill = Prediksi)) +
geom_bar(position = "dodge") +
labs(title = "Perbandingan Prediksi vs Aktual: Kategori Nilai Matematika",
x = "Kategori Nilai Aktual",
y = "Jumlah Siswa",
fill = "Prediksi") +
theme_minimal()