Anggota Kelompok 7

  1. Destia Rika (23031554052)
  2. Nadira Zahra Ramadhani (23031554099)
  3. Aghnia Alya Amarilla (23031554102)
  4. Tarisa Dwita Abadi (23031554134)

1. Multinomial Logistic Regression - Analisis Sleep Disorder

1.1 Import Library

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(nnet)   
## Warning: package 'nnet' was built under R version 4.4.3
library(caret) 
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select

1.2 Load Dataset

data <- read_csv("C:/Users/asus/Downloads/Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1.3 Preprocessing Data

Mengubah variabel target menjadi faktor

data$`Sleep Disorder` <- as.factor(data$`Sleep Disorder`)

1.4 EDA

table(data$`Sleep Disorder`)
## 
##    Insomnia        None Sleep Apnea 
##          77         219          78
ggplot(data, aes(x = `Sleep Disorder`, fill = `Sleep Disorder`)) +
  geom_bar() +
  labs(title = "Distribusi Sleep Disorder")

1.5 Regresi Logistik Multinomial

Pisahkan data training dan testing

set.seed(123)
train_idx <- createDataPartition(data$`Sleep Disorder`, p = 0.8, list = FALSE)
train_data <- data[train_idx, ]
test_data <- data[-train_idx, ]

Model Multinomial Logistic Regression :

model <- multinom(`Sleep Disorder` ~ Age + `BMI Category` + Gender +
                    `Physical Activity Level` + `Stress Level` + Occupation,
                  data = train_data)
## # weights:  57 (36 variable)
## initial  value 330.682299 
## iter  10 value 128.444667
## iter  20 value 95.749593
## iter  30 value 93.483759
## iter  40 value 93.406055
## iter  50 value 93.359488
## iter  60 value 93.340710
## iter  70 value 93.339300
## final  value 93.339269 
## converged
summary(model)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = `Sleep Disorder` ~ Age + `BMI Category` + 
##     Gender + `Physical Activity Level` + `Stress Level` + Occupation, 
##     data = train_data)
## 
## Coefficients:
##             (Intercept)        Age `BMI Category`Normal Weight
## None           21.27266 -0.3206465                    50.44546
## Sleep Apnea   -16.73306 -0.2733248                    47.93954
##             `BMI Category`Obese `BMI Category`Overweight GenderMale
## None                 -32.547977                -2.189095 -1.2929854
## Sleep Apnea            1.437238                -2.582941 -0.4590417
##             `Physical Activity Level` `Stress Level` OccupationDoctor
## None                      -0.02260746     -1.2576253         4.103668
## Sleep Apnea               -0.01329249     -0.7969681        32.832670
##             OccupationEngineer OccupationLawyer OccupationManager
## None                  4.015599         3.993413         31.553304
## Sleep Apnea          33.082660        33.022250         -1.400954
##             OccupationNurse OccupationSales Representative
## None               8.530745                      -1.465919
## Sleep Apnea       43.826985                      43.409017
##             OccupationSalesperson OccupationScientist
## None                      2.75096            13.92826
## Sleep Apnea              34.61717            46.55502
##             OccupationSoftware Engineer OccupationTeacher
## None                           15.19135        -0.4899114
## Sleep Apnea                   -16.99439        32.1725023
## 
## Std. Errors:
##             (Intercept)       Age `BMI Category`Normal Weight
## None           7.162474 0.1515284                      1.1021
## Sleep Apnea    7.389806 0.1497668                      1.1021
##             `BMI Category`Obese `BMI Category`Overweight GenderMale
## None               4.452753e-06                  1.43082   2.184271
## Sleep Apnea        2.053363e+00                  2.08964   2.196946
##             `Physical Activity Level` `Stress Level` OccupationDoctor
## None                       0.02762704      0.5116781         2.844714
## Sleep Apnea                0.03181008      0.5031277         2.265397
##             OccupationEngineer OccupationLawyer OccupationManager
## None                  2.639748         2.521204      5.205416e-11
## Sleep Apnea           1.984160         1.735338      8.474467e-28
##             OccupationNurse OccupationSales Representative
## None               3.680151                            NaN
## Sleep Apnea        3.604048                   0.0001378813
##             OccupationSalesperson OccupationScientist
## None                     3.243630            1.159588
## Sleep Apnea              2.286402            1.159571
##             OccupationSoftware Engineer OccupationTeacher
## None                       7.462517e-06          1.684860
## Sleep Apnea                2.483138e-19          1.433694
## 
## Residual Deviance: 186.6785 
## AIC: 258.6785

1.6 Evaluasi dan Visualisasi

Prediksi data test

pred <- predict(model, newdata = test_data)

Confusion matrix

confusionMatrix(pred, test_data$`Sleep Disorder`)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Insomnia None Sleep Apnea
##   Insomnia          11    3           2
##   None               3   39           1
##   Sleep Apnea        1    1          12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8493          
##                  95% CI : (0.7464, 0.9223)
##     No Information Rate : 0.589           
##     P-Value [Acc > NIR] : 1.488e-06       
##                                           
##                   Kappa : 0.735           
##                                           
##  Mcnemar's Test P-Value : 0.9536          
## 
## Statistics by Class:
## 
##                      Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity                   0.7333      0.9070             0.8000
## Specificity                   0.9138      0.8667             0.9655
## Pos Pred Value                0.6875      0.9070             0.8571
## Neg Pred Value                0.9298      0.8667             0.9492
## Prevalence                    0.2055      0.5890             0.2055
## Detection Rate                0.1507      0.5342             0.1644
## Detection Prevalence          0.2192      0.5890             0.1918
## Balanced Accuracy             0.8236      0.8868             0.8828
print(confusionMatrix)
## function (data, ...) 
## {
##     UseMethod("confusionMatrix")
## }
## <bytecode: 0x0000025f79fb72d8>
## <environment: namespace:caret>

Hitung Akurasi

accuracy <- mean(pred == test_data$`Sleep Disorder`)
paste("Akurasi Model:", round(accuracy * 100, 2), "%")
## [1] "Akurasi Model: 84.93 %"

Hasil Prediksi dan Aktual

hasil <- data.frame(Aktual = test_data$`Sleep Disorder`, Prediksi = pred)

Visualisasi Perbandingan

ggplot(hasil, aes(x = Aktual, fill = Prediksi)) +
  geom_bar(position = "dodge") +
  labs(title = "Perbandingan Sleep Disorder: Aktual vs Prediksi",
       x = "Kelas Aktual", y = "Jumlah")

2. Ordinal Logistic Regression - Analisis Kategori Nilai Matematika

2.1 Import Library

library(readr)
library(dplyr)
library(ggplot2)
library(nnet)   
library(caret) 
library(MASS)

2.2 Load dataset

data <- read.csv("C:/Users/asus/Downloads/StudentsPerformance.csv")

2.3 Preprocessing Data

Membuat kategori dari math score

data$math_level <- cut(data$math.score,
                       breaks = c(-Inf, 60, 80, Inf),
                       labels = c("Low", "Medium", "High"),
                       right = TRUE,
                       ordered_result = TRUE)

2.4 EDA

library(ggplot2)

ggplot(data, aes(x = math_level)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribusi Kategori Nilai Matematika",
       x = "Kategori Nilai",
       y = "Jumlah Siswa") +
  theme_minimal()

2.5 Regresi Logistik Ordinal

data$gender <- as.factor(data$gender)
data$race.ethnicity <- as.factor(data$race.ethnicity)
data$parental.level.of.education <- as.factor(data$parental.level.of.education)
data$test.preparation.course <- as.factor(data$test.preparation.course)
model <- polr(math_level ~ gender + race.ethnicity + parental.level.of.education + test.preparation.course,
              data = data,
              Hess = TRUE)

summary(model)
## Call:
## polr(formula = math_level ~ gender + race.ethnicity + parental.level.of.education + 
##     test.preparation.course, data = data, Hess = TRUE)
## 
## Coefficients:
##                                                 Value Std. Error t value
## gendermale                                    0.63813     0.1241  5.1440
## race.ethnicitygroup B                         0.43154     0.2507  1.7217
## race.ethnicitygroup C                         0.52424     0.2344  2.2368
## race.ethnicitygroup D                         0.75433     0.2382  3.1674
## race.ethnicitygroup E                         1.60809     0.2690  5.9777
## parental.level.of.educationbachelor's degree  0.26472     0.2194  1.2066
## parental.level.of.educationhigh school       -0.55944     0.1924 -2.9076
## parental.level.of.educationmaster's degree    0.35442     0.2879  1.2312
## parental.level.of.educationsome college      -0.05064     0.1831 -0.2766
## parental.level.of.educationsome high school  -0.36654     0.1956 -1.8741
## test.preparation.coursenone                  -0.56106     0.1287 -4.3588
## 
## Intercepts:
##             Value   Std. Error t value
## Low|Medium  -0.2660  0.2678    -0.9933
## Medium|High  2.1656  0.2774     7.8075
## 
## Residual Deviance: 1928.024 
## AIC: 1954.024

2.6 Uji Signifikansi

Menghitung p-value

ctable <- coef(summary(model))
p_values <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p_values)
print(ctable)
##                                                    Value Std. Error    t value
## gendermale                                    0.63812770  0.1240537  5.1439644
## race.ethnicitygroup B                         0.43154373  0.2506542  1.7216699
## race.ethnicitygroup C                         0.52423532  0.2343723  2.2367627
## race.ethnicitygroup D                         0.75433076  0.2381546  3.1673991
## race.ethnicitygroup E                         1.60809229  0.2690140  5.9777261
## parental.level.of.educationbachelor's degree  0.26472400  0.2194000  1.2065814
## parental.level.of.educationhigh school       -0.55943525  0.1924076 -2.9075531
## parental.level.of.educationmaster's degree    0.35441899  0.2878712  1.2311721
## parental.level.of.educationsome college      -0.05063872  0.1831013 -0.2765612
## parental.level.of.educationsome high school  -0.36654003  0.1955800 -1.8741178
## test.preparation.coursenone                  -0.56106403  0.1287189 -4.3588312
## Low|Medium                                   -0.26597226  0.2677642 -0.9933077
## Medium|High                                   2.16558686  0.2773738  7.8074664
##                                                   p value
## gendermale                                   2.690006e-07
## race.ethnicitygroup B                        8.512934e-02
## race.ethnicitygroup C                        2.530185e-02
## race.ethnicitygroup D                        1.538091e-03
## race.ethnicitygroup E                        2.262737e-09
## parental.level.of.educationbachelor's degree 2.275934e-01
## parental.level.of.educationhigh school       3.642685e-03
## parental.level.of.educationmaster's degree   2.182585e-01
## parental.level.of.educationsome college      7.821170e-01
## parental.level.of.educationsome high school  6.091419e-02
## test.preparation.coursenone                  1.307589e-05
## Low|Medium                                   3.205600e-01
## Medium|High                                  5.834903e-15

2.7 Evaluasi Akurasi Model dan Visualisasi

Prediksi nilai kategori

pred <- predict(model, newdata = data)
mask <- !is.na(pred)
pred_clean <- factor(pred[mask], levels = levels(data$math_level))
actual_clean <- factor(as.character(data$math_level[mask]), levels = levels(data$math_level))

Confusion matrix

confusion <- confusionMatrix(pred_clean, actual_clean)
print(confusion)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Medium High
##     Low    112     87   13
##     Medium 225    388  150
##     High     2     10   13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.513           
##                  95% CI : (0.4815, 0.5444)
##     No Information Rate : 0.485           
##     P-Value [Acc > NIR] : 0.04096         
##                                           
##                   Kappa : 0.1204          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              0.3304        0.8000     0.07386
## Specificity              0.8487        0.2718     0.98544
## Pos Pred Value           0.5283        0.5085     0.52000
## Neg Pred Value           0.7119        0.5907     0.83282
## Prevalence               0.3390        0.4850     0.17600
## Detection Rate           0.1120        0.3880     0.01300
## Detection Prevalence     0.2120        0.7630     0.02500
## Balanced Accuracy        0.5895        0.5359     0.52965

Menghitung Akurasi

accuracy <- mean(pred_clean == actual_clean)
cat("Akurasi Model:", round(accuracy * 100, 2), "%\n")
## Akurasi Model: 51.3 %

Hasil Prediksi dan Aktual

hasil <- data.frame(Aktual = actual_clean, Prediksi = pred_clean)

Visualisasi Perbandingan

ggplot(hasil, aes(x = Aktual, fill = Prediksi)) +
  geom_bar(position = "dodge") +
  labs(title = "Perbandingan Prediksi vs Aktual: Kategori Nilai Matematika",
       x = "Kategori Nilai Aktual",
       y = "Jumlah Siswa",
       fill = "Prediksi") +
  theme_minimal()