Kelompok 9 :


options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages(c("tidyverse", "car", "rstatix", "psych", "MVN"))
## package 'tidyverse' successfully unpacked and MD5 sums checked
## package 'car' successfully unpacked and MD5 sums checked
## package 'rstatix' successfully unpacked and MD5 sums checked
## package 'psych' successfully unpacked and MD5 sums checked
## package 'MVN' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Lenovo\AppData\Local\Temp\RtmpuK7QzT\downloaded_packages
library(tidyverse)
library(car)
library(rstatix)
library(psych)
library(MVN)

Load Dataset

wine_data <- read.csv("WineQT.csv")

Dataset yang digunakan dalam penelitian ini adalah WineQT.csv (https://www.kaggle.com/datasets/yasserh/wine-quality-dataset), yaitu data kualitas wine yang memuat informasi kimia-fisik wine dan skor kualitasnya.

Pre-Processing


Hapus Duplikat

df_clean <- wine_data %>%
  distinct()

Hapus Duplikat

colSums(is.na(df_clean))
##        fixed.acidity     volatile.acidity          citric.acid 
##                    0                    0                    0 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                    0                    0                    0 
## total.sulfur.dioxide              density                   pH 
##                    0                    0                    0 
##            sulphates              alcohol              quality 
##                    0                    0                    0 
##                   Id 
##                    0

Hapus Kolom ID

df_clean <- df_clean %>% select(-Id)

Memfilter Kelas (Quality) Minoritas

df_clean %>% count(quality)
##   quality   n
## 1       3   6
## 2       4  33
## 3       5 483
## 4       6 462
## 5       7 143
## 6       8  16

Dari hasil yang didapatkan maka hanya kelas 5, 6, dan 7 saja yang digunakan dan mengubahnya menjadi faktor, karena kelas 3, 4, 8 itu kelompok minoritas ekstrem (<40 baris)

df_clean <- wine_data %>%
  filter(quality %in% c(5, 6, 7)) %>%
  mutate(quality = as.factor(quality))

Cek Data


Cek Struktur data yang sudah dibersihkan

str(df_clean[, c("quality", "pH", "sulphates", "citric.acid")])
## 'data.frame':    1088 obs. of  4 variables:
##  $ quality    : Factor w/ 3 levels "5","6","7": 1 1 1 2 1 1 1 3 3 1 ...
##  $ pH         : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.28 ...
##  $ sulphates  : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.54 ...
##  $ citric.acid: num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.08 ...
table(df_clean$quality)
## 
##   5   6   7 
## 483 462 143
summary(df_clean)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.900   Min.   :0.120    Min.   :0.0000   Min.   : 0.900  
##  1st Qu.: 7.100   1st Qu.:0.390    1st Qu.:0.1000   1st Qu.: 1.900  
##  Median : 7.900   Median :0.520    Median :0.2500   Median : 2.200  
##  Mean   : 8.318   Mean   :0.526    Mean   :0.2694   Mean   : 2.529  
##  3rd Qu.: 9.100   3rd Qu.:0.640    3rd Qu.:0.4200   3rd Qu.: 2.600  
##  Max.   :15.900   Max.   :1.330    Max.   :0.7900   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9967  
##  Mean   :0.08684   Mean   :15.75       Mean   : 46.44       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9979  
##  Max.   :0.61100   Max.   :68.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol      quality       Id        
##  Min.   :2.860   Min.   :0.3900   Min.   : 8.40   5:483   Min.   :   0.0  
##  1st Qu.:3.200   1st Qu.:0.5500   1st Qu.: 9.50   6:462   1st Qu.: 413.8  
##  Median :3.310   Median :0.6200   Median :10.20   7:143   Median : 795.5  
##  Mean   :3.309   Mean   :0.6573   Mean   :10.43           Mean   : 806.7  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10           3rd Qu.:1208.2  
##  Max.   :4.010   Max.   :1.9500   Max.   :14.90           Max.   :1597.0

EDA (VISUAL & KORELASI)


ggplot(df_clean, aes(x = pH)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(df_clean, aes(x = sulphates)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(df_clean, aes(x = citric.acid, y = pH)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(df_clean, aes(x = citric.acid, y = sulphates)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

cor(df_clean[, c("pH", "sulphates", "citric.acid")])
##                     pH  sulphates citric.acid
## pH           1.0000000 -0.1554168  -0.5328652
## sulphates   -0.1554168  1.0000000   0.3130925
## citric.acid -0.5328652  0.3130925   1.0000000

Cek Outlier

# Outlier Univariat
df_clean %>% group_by(quality) %>% identify_outliers(pH)
## # A tibble: 18 × 15
##    quality fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
##    <fct>           <dbl>            <dbl>       <dbl>          <dbl>     <dbl>
##  1 5                 5              1.04         0.24           1.6      0.05 
##  2 5                10.7            0.43         0.39           2.2      0.106
##  3 5                10.7            0.43         0.39           2.2      0.106
##  4 5                 6.6            0.61         0.01           1.9      0.08 
##  5 5                 5.6            0.915        0              2.1      0.041
##  6 5                10              0.69         0.11           1.4      0.084
##  7 5                 5.6            0.54         0.04           1.7      0.049
##  8 6                14.3            0.31         0.74           1.8      0.075
##  9 6                 5.1            0.47         0.02           1.3      0.034
## 10 6                 8              0.18         0.37           0.9      0.049
## 11 6                 5.2            0.645        0              2.15     0.08 
## 12 6                 5.4            0.74         0              1.2      0.041
## 13 6                 9.1            0.76         0.68           1.7      0.414
## 14 6                 5              0.74         0              1.2      0.041
## 15 7                12              0.5          0.59           1.4      0.073
## 16 7                 4.9            0.42         0              2.1      0.048
## 17 7                 5.4            0.42         0.27           2        0.092
## 18 7                 5.1            0.42         0              1.8      0.044
## # ℹ 9 more variables: free.sulfur.dioxide <dbl>, total.sulfur.dioxide <dbl>,
## #   density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>, Id <int>,
## #   is.outlier <lgl>, is.extreme <lgl>
df_clean %>% group_by(quality) %>% identify_outliers(sulphates)
## # A tibble: 52 × 15
##    quality fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
##    <fct>           <dbl>            <dbl>       <dbl>          <dbl>     <dbl>
##  1 5                 7.8            0.61         0.29            1.6     0.114
##  2 5                 7.9            0.43         0.21            1.6     0.106
##  3 5                 7.3            0.45         0.36            5.9     0.074
##  4 5                 8.1            0.66         0.22            2.2     0.069
##  5 5                 9.3            0.39         0.44            2.1     0.107
##  6 5                 7.8            0.41         0.68            1.7     0.467
##  7 5                 7.8            0.56         0.19            1.8     0.104
##  8 5                 7.8            0.56         0.19            1.8     0.104
##  9 5                 8.9            0.61         0.49            2       0.27 
## 10 5                 8.9            0.635        0.37            1.7     0.263
## # ℹ 42 more rows
## # ℹ 9 more variables: free.sulfur.dioxide <dbl>, total.sulfur.dioxide <dbl>,
## #   density <dbl>, pH <dbl>, sulphates <dbl>, alcohol <dbl>, Id <int>,
## #   is.outlier <lgl>, is.extreme <lgl>
sqrt(df_clean$sulphates)
##    [1] 0.7483315 0.8246211 0.8062258 0.7615773 0.7483315 0.7483315 0.6782330
##    [8] 0.6855655 0.7549834 0.7348469 0.7211103 1.2489996 0.8660254 1.0392305
##   [15] 0.8062258 0.9539392 0.7280110 0.7937254 0.7483315 0.7681146 0.7416198
##   [22] 0.7681146 0.7348469 0.8124038 0.7416198 0.7416198 0.7745967 0.8544004
##   [29] 0.9110434 0.9486833 1.0954451 0.8544004 0.8602325 0.7937254 0.8000000
##   [36] 0.7483315 0.7937254 0.7937254 0.7681146 0.7348469 0.8000000 0.8426150
##   [43] 0.7549834 0.6244998 0.6244998 0.7615773 0.9746794 0.6928203 0.8124038
##   [50] 0.7416198 0.7211103 0.7483315 0.7549834 0.8831761 0.7681146 1.3964240
##   [57] 0.8000000 1.1045361 0.7280110 0.7348469 1.3964240 0.8000000 0.7483315
##   [64] 0.7483315 0.7681146 0.7810250 0.7681146 0.7141428 0.6928203 0.7141428
##   [71] 1.1445523 0.8888194 0.9643651 0.8124038 0.8185353 0.9643651 0.7937254
##   [78] 0.7810250 0.7549834 0.7745967 0.7211103 0.6928203 0.7000000 0.6928203
##   [85] 0.9591663 0.8124038 0.7937254 0.7416198 0.9055385 0.8000000 0.9055385
##   [92] 0.8888194 0.7615773 0.8888194 0.7874008 0.7211103 0.7615773 0.7280110
##   [99] 0.7348469 0.7348469 0.8426150 0.8485281 0.8426150 0.7071068 0.7549834
##  [106] 0.7483315 0.7483315 0.7000000 0.7000000 0.7874008 0.7211103 0.7416198
##  [113] 0.7810250 0.7549834 0.8831761 0.6855655 0.7810250 0.7483315 0.7681146
##  [120] 0.7681146 1.0099505 0.7211103 0.7615773 0.7615773 0.7280110 0.7416198
##  [127] 0.7416198 0.7874008 0.7874008 0.7416198 0.7416198 0.7681146 0.9848858
##  [134] 0.9055385 0.7483315 0.8774964 0.8774964 0.8185353 0.9273618 0.7483315
##  [141] 0.6557439 0.8426150 0.8717798 0.7549834 0.7416198 0.7348469 0.7681146
##  [148] 0.7549834 0.7745967 0.7745967 0.7615773 1.2688578 0.7874008 0.7211103
##  [155] 0.8366600 0.7416198 0.7615773 1.0440307 0.8426150 0.7000000 0.9165151
##  [162] 0.9165151 0.9797959 0.7211103 0.9797959 0.8831761 0.8366600 0.7280110
##  [169] 0.7681146 0.7549834 0.7874008 0.8185353 1.1224972 0.8944272 0.8062258
##  [176] 0.7549834 0.8185353 0.9327379 0.9539392 0.9848858 0.8185353 0.8185353
##  [183] 0.9539392 0.9848858 0.8774964 0.9055385 1.0392305 0.8774964 0.8426150
##  [190] 0.8426150 0.8717798 0.8366600 0.9273618 0.9746794 0.9273618 0.8000000
##  [197] 0.7745967 0.9000000 0.8000000 0.7211103 0.8544004 0.8944272 0.7681146
##  [204] 0.8000000 0.8485281 0.7211103 0.7211103 0.7874008 0.7937254 0.7348469
##  [211] 0.7937254 0.7280110 0.8246211 0.8426150 0.8000000 0.7874008 0.8306624
##  [218] 0.8306624 0.8602325 0.9110434 0.8185353 0.7211103 0.8602325 0.9165151
##  [225] 0.8366600 0.8000000 1.1661904 0.8660254 0.8660254 0.8831761 0.9643651
##  [232] 0.9591663 0.9110434 0.8544004 0.8774964 0.7681146 0.7810250 0.9380832
##  [239] 0.8660254 0.9219544 0.8246211 0.8366600 0.9539392 0.7483315 0.9327379
##  [246] 0.8544004 0.8544004 0.8306624 1.0630146 0.9327379 1.0198039 1.0535654
##  [253] 1.0630146 0.9949874 0.8944272 0.8602325 0.8602325 0.8944272 0.7141428
##  [260] 0.7745967 0.7280110 0.8062258 0.7416198 0.7280110 0.7810250 0.7937254
##  [267] 0.7810250 0.8062258 0.7937254 0.7874008 0.8000000 0.9055385 0.7810250
##  [274] 1.0344080 0.8124038 0.6928203 0.8717798 0.8246211 0.7483315 0.6633250
##  [281] 0.9486833 0.7483315 0.7000000 0.7483315 0.7745967 0.7745967 0.9486833
##  [288] 0.7483315 0.7000000 0.9433981 0.7000000 0.7483315 0.8124038 0.7745967
##  [295] 0.8246211 0.9165151 0.8544004 0.8485281 0.7483315 0.7810250 0.7810250
##  [302] 0.7937254 0.8602325 0.7937254 0.7615773 0.7615773 0.7810250 0.7810250
##  [309] 0.8366600 0.8485281 0.8888194 0.7874008 0.7615773 0.7810250 0.8426150
##  [316] 0.9055385 0.7745967 0.8366600 1.0295630 0.8366600 0.8124038 1.0295630
##  [323] 0.6855655 0.6480741 0.8660254 0.8888194 0.9486833 0.8602325 0.7416198
##  [330] 0.8062258 0.8888194 0.8602325 0.8062258 0.9219544 0.9219544 1.0246951
##  [337] 0.8831761 1.0099505 0.8426150 0.8246211 0.7937254 0.9000000 0.8124038
##  [344] 0.8124038 1.0677078 0.8602325 0.9486833 0.7416198 0.8000000 0.8774964
##  [351] 0.7549834 0.9327379 0.7810250 0.7810250 0.8124038 0.7483315 0.9327379
##  [358] 0.7549834 0.8774964 0.8124038 0.7874008 0.8185353 0.8544004 0.7681146
##  [365] 0.8485281 0.8888194 0.7681146 0.8831761 0.9273618 0.8660254 0.7483315
##  [372] 0.8544004 0.8602325 0.7874008 0.8602325 0.7483315 0.8602325 0.8246211
##  [379] 0.7549834 0.7810250 0.7810250 0.8246211 0.7549834 0.7745967 0.8888194
##  [386] 0.7549834 0.9539392 0.7810250 0.7745967 0.7745967 0.7141428 0.6633250
##  [393] 0.6633250 0.8124038 0.7681146 0.7549834 0.7071068 0.7549834 0.7615773
##  [400] 0.7549834 0.7745967 0.6928203 0.7211103 0.7874008 0.7280110 0.8000000
##  [407] 0.7211103 0.7348469 0.7348469 0.7745967 0.7549834 0.9643651 0.8944272
##  [414] 0.7810250 0.7874008 0.7416198 0.8485281 0.7000000 0.7000000 0.7745967
##  [421] 0.7810250 0.7745967 0.7937254 0.7549834 0.7615773 0.7416198 0.7280110
##  [428] 0.7280110 1.1661904 0.7874008 0.7874008 0.7874008 0.8185353 0.7549834
##  [435] 0.7071068 0.9165151 0.7348469 0.6782330 0.7810250 0.7071068 0.8246211
##  [442] 0.9000000 0.9000000 0.8000000 0.8366600 0.7483315 0.7745967 0.8124038
##  [449] 0.8124038 0.7211103 0.7211103 0.8185353 0.6928203 0.7280110 0.7348469
##  [456] 0.7000000 0.7348469 0.7483315 0.6928203 0.7483315 0.6855655 1.0246951
##  [463] 0.7348469 1.0816654 0.7874008 0.7810250 0.7874008 0.8185353 0.7280110
##  [470] 0.7874008 0.7549834 0.8185353 0.8185353 0.7000000 0.6855655 0.7141428
##  [477] 0.7810250 0.7549834 0.7810250 0.7141428 0.7549834 0.6633250 0.7348469
##  [484] 0.6633250 0.7280110 1.2727922 0.7416198 0.8485281 0.8246211 0.8246211
##  [491] 0.7000000 0.8185353 0.7211103 0.6855655 0.7745967 0.7874008 0.7071068
##  [498] 0.8062258 0.7745967 0.8246211 0.7549834 0.7141428 0.8544004 0.6855655
##  [505] 0.8485281 0.8544004 0.7416198 0.7810250 0.7416198 1.0295630 0.6557439
##  [512] 0.6782330 0.7071068 0.7071068 0.7211103 0.7549834 0.6480741 0.7280110
##  [519] 0.7416198 0.7348469 0.7348469 0.7280110 0.7348469 0.7280110 0.7416198
##  [526] 0.7416198 0.9591663 0.6782330 0.7681146 0.7810250 0.7745967 0.7745967
##  [533] 0.8062258 0.7681146 0.7071068 0.7483315 0.7483315 0.7211103 0.8246211
##  [540] 0.7348469 0.7745967 0.7483315 0.7280110 1.0862780 0.8602325 0.9000000
##  [547] 0.7211103 0.6633250 0.9695360 0.8124038 0.7549834 0.8426150 0.9273618
##  [554] 0.7416198 0.6855655 0.8246211 0.8717798 0.7348469 0.7745967 0.8602325
##  [561] 0.8000000 0.7280110 0.8000000 0.7000000 0.8774964 0.6244998 0.6244998
##  [568] 0.9110434 0.7937254 0.8485281 0.7874008 0.8124038 0.8366600 0.8124038
##  [575] 0.7416198 0.7416198 0.8544004 0.8544004 0.8544004 0.8831761 0.8185353
##  [582] 0.7348469 0.6633250 0.7280110 0.7348469 0.7549834 0.8888194 0.7549834
##  [589] 0.8062258 0.8831761 0.7071068 0.8000000 0.8062258 0.7483315 0.8246211
##  [596] 0.7549834 0.8062258 0.7549834 0.7615773 0.7615773 0.8774964 0.7615773
##  [603] 0.8774964 0.8306624 0.8306624 0.8306624 0.7681146 0.8426150 0.7348469
##  [610] 0.7681146 0.7549834 0.7745967 0.7874008 0.8185353 0.7810250 0.8000000
##  [617] 0.8485281 0.9433981 0.7483315 0.9433981 0.8944272 0.9327379 0.9486833
##  [624] 0.8944272 0.8544004 0.8062258 0.7681146 0.8544004 0.8485281 0.7348469
##  [631] 0.7211103 0.8000000 0.8000000 0.9949874 0.7874008 0.8717798 0.8717798
##  [638] 0.7874008 0.8246211 0.8306624 0.8185353 0.7874008 0.8426150 0.9219544
##  [645] 0.7874008 0.8306624 0.9219544 0.6928203 0.7745967 0.9327379 0.7874008
##  [652] 0.7211103 0.7071068 0.7000000 0.8426150 0.7874008 0.9055385 0.8426150
##  [659] 0.7615773 0.7071068 0.7000000 0.7141428 0.7000000 0.6928203 0.6928203
##  [666] 0.7874008 0.6928203 0.7280110 0.8485281 0.7348469 0.7416198 0.9165151
##  [673] 0.9165151 0.8366600 0.7416198 0.7416198 0.6928203 0.7615773 0.7615773
##  [680] 0.8660254 0.6633250 0.7810250 0.7280110 0.7280110 0.7348469 0.7416198
##  [687] 0.7483315 0.7745967 0.7483315 0.7211103 0.7483315 0.7280110 0.6855655
##  [694] 0.8062258 0.7681146 0.8774964 0.9110434 0.8000000 0.7874008 0.7615773
##  [701] 0.7874008 0.8000000 0.8831761 0.8774964 1.1575837 0.7416198 0.8774964
##  [708] 0.7874008 0.7681146 0.7549834 0.8717798 0.7681146 0.8366600 0.8000000
##  [715] 0.7937254 0.7549834 0.7745967 0.8485281 0.9110434 0.7874008 0.9433981
##  [722] 0.7615773 0.7615773 0.8544004 0.7615773 0.7141428 0.7141428 0.7681146
##  [729] 0.8774964 0.7000000 0.8124038 0.8306624 0.8717798 0.8717798 0.7745967
##  [736] 0.7681146 0.9273618 0.7000000 0.7000000 0.7211103 0.8831761 0.7937254
##  [743] 0.7937254 0.8124038 0.7745967 0.9219544 0.7000000 0.8717798 0.8000000
##  [750] 0.8000000 0.8426150 0.8124038 0.7745967 0.7745967 0.7745967 0.7280110
##  [757] 0.7615773 0.6928203 0.7681146 0.7745967 0.8246211 0.7937254 0.9380832
##  [764] 0.8246211 0.8602325 0.7745967 0.8717798 0.7615773 0.8000000 0.8062258
##  [771] 0.9219544 0.8062258 0.9055385 0.7874008 0.9000000 0.9327379 0.7000000
##  [778] 0.8124038 0.7211103 0.8062258 0.7280110 0.8124038 0.7280110 0.8717798
##  [785] 0.9327379 0.8426150 0.8185353 0.9380832 0.7483315 0.7874008 0.8185353
##  [792] 0.8602325 0.7874008 0.7874008 0.8366600 0.8544004 0.9000000 0.6782330
##  [799] 0.7745967 0.8366600 0.8366600 0.7549834 0.7141428 0.8944272 0.7681146
##  [806] 0.7745967 0.7348469 0.8888194 0.7615773 0.7348469 0.7348469 0.7416198
##  [813] 0.9219544 0.9219544 0.9695360 0.9219544 0.9327379 0.7874008 0.7874008
##  [820] 0.6928203 0.8306624 0.9110434 0.8774964 0.8774964 0.9055385 0.7141428
##  [827] 0.6855655 0.8485281 0.8544004 0.8000000 0.8831761 0.7874008 0.8000000
##  [834] 0.6324555 0.8000000 0.6324555 0.6324555 0.7874008 0.8246211 0.6928203
##  [841] 0.7810250 0.7681146 0.7615773 0.7615773 0.8306624 0.6480741 0.8366600
##  [848] 0.7937254 1.0723805 0.7615773 0.8485281 0.7745967 0.7745967 0.9327379
##  [855] 0.7416198 0.8660254 0.7549834 0.7280110 0.6557439 0.7280110 0.7348469
##  [862] 0.7745967 0.7348469 0.7549834 0.8000000 0.8000000 0.8246211 0.7937254
##  [869] 0.7483315 0.7549834 1.0816654 1.0816654 0.7681146 0.8660254 0.8185353
##  [876] 0.6708204 0.7810250 0.7483315 0.7810250 0.8602325 0.8944272 0.6633250
##  [883] 0.7280110 0.7874008 0.7810250 0.7681146 0.6855655 0.7615773 0.7681146
##  [890] 1.1532563 0.7681146 0.7416198 0.7745967 0.7745967 0.7745967 0.7745967
##  [897] 0.7211103 0.7483315 0.7483315 0.6782330 0.6557439 0.7141428 0.7211103
##  [904] 0.8124038 0.6708204 0.6708204 0.6708204 0.7348469 0.7348469 0.7348469
##  [911] 0.7348469 0.7280110 0.7280110 0.7483315 0.6244998 0.7141428 0.6782330
##  [918] 0.6633250 0.6633250 0.7681146 0.7071068 0.8062258 0.7348469 0.8062258
##  [925] 0.7280110 0.7071068 1.0862780 0.7071068 1.0816654 1.0148892 1.0816654
##  [932] 0.6782330 0.7141428 0.7874008 0.7874008 0.7000000 0.7000000 0.7071068
##  [939] 0.6480741 0.7615773 0.8246211 0.7681146 0.7615773 0.6928203 0.6928203
##  [946] 0.8062258 0.8062258 0.7211103 0.8485281 0.7141428 0.9055385 0.7874008
##  [953] 0.9695360 0.9643651 1.0049876 0.9643651 0.6633250 0.8544004 0.9695360
##  [960] 0.9486833 0.8831761 0.9165151 0.8831761 0.8774964 0.6782330 0.6708204
##  [967] 0.6782330 0.7615773 0.9165151 0.7615773 0.7937254 0.9643651 0.7874008
##  [974] 0.7348469 0.8602325 0.8774964 0.8062258 0.7348469 0.6855655 0.8246211
##  [981] 0.7000000 0.7211103 0.8602325 0.8544004 0.8831761 0.8717798 0.8306624
##  [988] 0.7937254 0.7615773 0.7745967 0.8944272 0.8246211 0.7280110 0.8185353
##  [995] 0.8062258 0.8062258 0.6855655 0.7211103 0.8888194 0.7348469 0.8660254
## [1002] 0.9380832 0.8660254 0.9380832 0.8246211 0.8246211 0.7810250 0.7416198
## [1009] 0.7745967 0.8831761 0.7615773 0.7483315 0.7348469 0.7681146 0.8366600
## [1016] 0.9000000 0.7937254 0.7937254 0.7280110 0.8306624 0.7416198 0.8124038
## [1023] 0.7416198 0.8485281 0.8366600 0.8774964 0.8185353 0.8485281 0.8485281
## [1030] 0.9110434 0.7937254 0.7615773 0.9848858 0.7745967 0.8000000 0.8062258
## [1037] 0.8185353 0.7745967 0.8306624 0.8124038 0.8246211 0.7810250 0.7874008
## [1044] 0.7681146 0.7937254 0.7937254 0.9000000 0.8000000 0.7810250 0.7615773
## [1051] 0.7348469 0.8124038 0.7745967 0.8246211 0.7483315 0.7745967 0.7483315
## [1058] 0.7211103 0.7211103 0.7211103 0.7348469 0.7348469 0.9055385 0.8426150
## [1065] 0.7348469 0.7681146 0.7549834 0.9643651 0.8062258 0.7141428 0.8185353
## [1072] 0.8000000 0.8831761 0.7745967 0.8185353 0.7745967 0.7071068 0.7874008
## [1079] 0.8944272 0.9219544 0.8124038 0.9055385 0.7483315 0.8660254 0.9055385
## [1086] 0.7615773 0.8717798 0.8426150
# Outlier Multivariat
dv_data <- df_clean %>% select(pH, sulphates)

df_clean$mahalanobis <- mahalanobis(
  dv_data,
  colMeans(dv_data),
  cov(dv_data)
)

cutoff <- qchisq(0.999, df = 2)

df_clean %>% filter(mahalanobis > cutoff)
##    fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1            7.8             0.61        0.29            1.6     0.114
## 2            8.6             0.49        0.28            1.9     0.110
## 3            8.6             0.49        0.28            1.9     0.110
## 4            7.8             0.41        0.68            1.7     0.467
## 5            8.9             0.59        0.50            2.0     0.337
## 6            7.7             0.41        0.76            1.8     0.611
## 7           12.5             0.28        0.54            2.3     0.082
## 8            8.9             0.29        0.35            1.9     0.067
## 9            5.1             0.47        0.02            1.3     0.034
## 10           7.1             0.31        0.30            2.2     0.053
## 11           8.5             0.46        0.59            1.4     0.414
## 12           5.4             0.74        0.00            1.2     0.041
## 13           9.1             0.76        0.68            1.7     0.414
## 14           5.0             0.74        0.00            1.2     0.041
##    free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                    9                   29 0.99740 3.26      1.56     9.1
## 2                   20                  136 0.99720 2.93      1.95     9.9
## 3                   20                  136 0.99720 2.93      1.95     9.9
## 4                   18                   69 0.99730 3.08      1.31     9.3
## 5                   27                   81 0.99640 3.04      1.61     9.5
## 6                    8                   45 0.99680 3.06      1.26     9.4
## 7                   12                   29 0.99970 3.11      1.36     9.8
## 8                   25                   57 0.99700 3.18      1.36    10.3
## 9                   18                   44 0.99210 3.90      0.62    12.8
## 10                  36                  127 0.99650 2.94      1.62     9.5
## 11                  16                   45 0.99702 3.03      1.34     9.2
## 12                  16                   46 0.99258 4.01      0.59    12.5
## 13                  18                   64 0.99652 2.90      1.33     9.1
## 14                  16                   46 0.99258 4.01      0.59    12.5
##    quality   Id mahalanobis
## 1        5   13    29.91473
## 2        6   86    62.39609
## 3        6   91    62.39609
## 4        5  106    16.29024
## 5        6  226    33.76780
## 6        5  258    14.36022
## 7        7  339    18.37658
## 8        6  639    17.99130
## 9        6  695    14.95343
## 10       5  723    36.02146
## 11       5 1051    18.37630
## 12       6 1316    20.93535
## 13       6 1319    20.70642
## 14       6 1321    20.93535
df_clean <- df_clean %>%
  mutate(outlier_multi = mahalanobis > cutoff)
table(df_clean$outlier_multi)
## 
## FALSE  TRUE 
##  1074    14
df_clean <- df_clean %>%
  filter(mahalanobis <= cutoff)

Boxplot distribusi pH

ggplot(df_clean, aes(x = quality, y = pH, fill = quality)) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Distribusi pH Berdasarkan Kualitas Anggur",
       x = "Kualitas Anggur (Quality)",
       y = "Tingkat pH") +
  theme(legend.position = "none")

Boxplot distribusi Sulphates

ggplot(df_clean, aes(x = quality, y = sulphates, fill = quality)) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Distribusi Kandungan Sulfat Berdasarkan Kualitas Anggur",
       x = "Kualitas Anggur (Quality)",
       y = "Kandungan Sulfat (Sulphates)") +
  theme(legend.position = "none")

UJI ASUMSI


UJI ASUMSI 1: NORMALITAS

# Normalitas pH untuk masing-masing kelas quality
df_clean %>%
  group_by(quality) %>%
  shapiro_test(pH)
## # A tibble: 3 × 4
##   quality variable statistic     p
##   <fct>   <chr>        <dbl> <dbl>
## 1 5       pH           0.997 0.512
## 2 6       pH           0.997 0.594
## 3 7       pH           0.986 0.165
# Normalitas sulphates untuk masing-masing kelas quality
df_clean %>%
  group_by(quality) %>%
  shapiro_test(sulphates)
## # A tibble: 3 × 4
##   quality variable  statistic        p
##   <fct>   <chr>         <dbl>    <dbl>
## 1 5       sulphates     0.817 6.41e-23
## 2 6       sulphates     0.946 9.05e-12
## 3 7       sulphates     0.986 1.76e- 1
# Normalitas Multivariat
mvn_results <- lapply(
  split(df_clean[, c("pH", "sulphates")], df_clean$quality),
  function(x) MVN::mvn(data = x, mvn_test = "hz", tidy = TRUE)
)

lapply(mvn_results, function(x) x$multivariate_normality)
## $`5`
##            Test Statistic p.value     Method          MVN
## 1 Henze-Zirkler    10.433  <0.001 asymptotic ✗ Not normal
## 
## $`6`
##            Test Statistic p.value     Method          MVN
## 1 Henze-Zirkler     5.601  <0.001 asymptotic ✗ Not normal
## 
## $`7`
##            Test Statistic p.value     Method      MVN
## 1 Henze-Zirkler     0.641   0.379 asymptotic ✓ Normal

catatan:

- p > 0.05 → tidak ada bukti melanggar normalitas multivariat

- p < 0.05 → indikasi tidak normal multivariat

UJI ASUMSI 2: HOMOGENITAS VARIANS & KOVARIANS

# Levene's Test (Homogenitas Varians Univariat)
leveneTest(pH ~ quality, data = df_clean)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    2  0.2333  0.792
##       1071
leveneTest(sulphates ~ quality, data = df_clean)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value Pr(>F)
## group    2  1.1705 0.3106
##       1071

catatan: - p > 0.05 → homogen

# Box's M Test (Homogenitas Matriks Kovarians Multivariat)
box_m(df_clean[, c("pH", "sulphates")], df_clean$quality)
## # A tibble: 1 × 4
##   statistic p.value parameter method                                            
##       <dbl>   <dbl>     <dbl> <chr>                                             
## 1      7.20   0.302         6 Box's M-test for Homogeneity of Covariance Matric…

catatan:

- p > 0.001 biasanya dianggap aman

- kalau signifikan, Pillai’s Trace biasanya tetap paling robust

UJI ASUMSI 3: INDEPENDENSI DV (SYARAT BARTLETT)

# DV tidak boleh saling bebas; harus ada korelasi (Signifikan p < 0.05)
cor(df_clean[, c("pH", "sulphates")], use = "complete.obs")
##                    pH   sulphates
## pH         1.00000000 -0.06937676
## sulphates -0.06937676  1.00000000
cor.test(df_clean$pH, df_clean$sulphates, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df_clean$pH and df_clean$sulphates
## t = -2.277, df = 1072, p-value = 0.02298
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.128661178 -0.009598222
## sample estimates:
##         cor 
## -0.06937676
psych::cortest.bartlett(cor(df_clean[, c("pH", "sulphates")]), n = nrow(df_clean))
## $chisq
## [1] 5.169725
## 
## $p.value
## [1] 0.02298386
## 
## $df
## [1] 1

catatan:

- p-value < 0.05 -> ada korelasi

- DV sebaiknya berkorelasi sedang

- jangan terlalu rendah (MANOVA jadi kurang berguna)

- jangan terlalu tinggi (indikasi multikolinearitas)

UJI ASUMSI 4: LINIERITAS COV - DV

# Kovariat harus memiliki hubungan linier dengan Dependen (Signifikan p < 0.05)

# Korelasi Asam Sitrat terhadap pH
cor.test(df_clean$citric.acid, df_clean$pH, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df_clean$citric.acid and df_clean$pH
## t = -20.33, df = 1072, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5693624 -0.4829305
## sample estimates:
##        cor 
## -0.5275101
ggplot(df_clean, aes(x = citric.acid, y = pH, color = quality)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Korelasi Asam Sitrat terhadap Sulphates
cor.test(df_clean$citric.acid, df_clean$sulphates, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  df_clean$citric.acid and df_clean$sulphates
## t = 11.067, df = 1072, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2654850 0.3728937
## sample estimates:
##       cor 
## 0.3202181
ggplot(df_clean, aes(x = citric.acid, y = sulphates, color = quality)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

catatan:

- pola hubungan harus terlihat linear

- p-value < 0.05, maka hubungan linear

UJI ASUMSI 5: INDEPENDENSI OBSERVASI

# Membuat model linear sementara untuk mengambil nilai residualnya
model_pH <- lm(pH ~ citric.acid + quality, data = df_clean)
model_sulphates <- lm(sulphates ~ citric.acid + quality, data = df_clean)
# Uji Durbin-Watson (Nilai ideal berkisar di angka 2.0)
durbinWatsonTest(model_pH)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.1894929       1.61751       0
##  Alternative hypothesis: rho != 0
durbinWatsonTest(model_sulphates)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2098177      1.579438       0
##  Alternative hypothesis: rho != 0

Catatan:

- Jika hasil sekitar 2, maka dianggap lolos atau tidak ada autokorelasi

UJI ASUMSI 6: Homogeneity of regression slopes

slope_pH <- lm(pH ~ citric.acid * quality, data = df_clean)
slope_sulphates <- lm(sulphates ~ citric.acid * quality, data = df_clean)

Anova(slope_pH, type = 2)
## Anova Table (Type II tests)
## 
## Response: pH
##                      Sum Sq   Df  F value    Pr(>F)    
## citric.acid          6.6590    1 432.7123 < 2.2e-16 ***
## quality              0.3182    2  10.3379 3.575e-05 ***
## citric.acid:quality  0.1045    2   3.3947   0.03391 *  
## Residuals           16.4354 1068                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(slope_sulphates, type = 2)
## Anova Table (Type II tests)
## 
## Response: sulphates
##                      Sum Sq   Df F value  Pr(>F)    
## citric.acid          1.3747    1 84.1861 < 2e-16 ***
## quality              1.4307    2 43.8069 < 2e-16 ***
## citric.acid:quality  0.0785    2  2.4046 0.09079 .  
## Residuals           17.4397 1068                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
slope_mancova <- manova(cbind(pH, sulphates) ~ citric.acid * quality, data = df_clean)
summary(slope_mancova, test = "Pillai")
##                       Df  Pillai approx F num Df den Df  Pr(>F)    
## citric.acid            1 0.35978  299.804      2   1067 < 2e-16 ***
## quality                2 0.08705   24.301      4   2136 < 2e-16 ***
## citric.acid:quality    2 0.00999    2.682      4   2136 0.03007 *  
## Residuals           1068                                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

catatan:

- interaksi citric.acid:quality harus tidak signifikan

- kalau signifikan, slope antar grup berbeda → ANCOVA/MANCOVA klasik tidak terpenuhi

- p-value > 0.05

UJI UTAMA: ANCOVA

model_ancova_sul <- lm(sulphates ~ citric.acid + quality, data = df_clean)

# cek slope
model_slope_sul <- lm(sulphates ~ citric.acid * quality, data = df_clean)
Anova(model_slope_sul, type = 2)
## Anova Table (Type II tests)
## 
## Response: sulphates
##                      Sum Sq   Df F value  Pr(>F)    
## citric.acid          1.3747    1 84.1861 < 2e-16 ***
## quality              1.4307    2 43.8069 < 2e-16 ***
## citric.acid:quality  0.0785    2  2.4046 0.09079 .  
## Residuals           17.4397 1068                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# model final
Anova(model_ancova_sul, type = 2)
## Anova Table (Type II tests)
## 
## Response: sulphates
##              Sum Sq   Df F value    Pr(>F)    
## citric.acid  1.3747    1  83.966 < 2.2e-16 ***
## quality      1.4307    2  43.692 < 2.2e-16 ***
## Residuals   17.5183 1070                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Berdasarkan hasil uji ANCOVA, variabel citric acid dan quality berpengaruh signifikan terhadap sulphates (p < 0.05). Sementara itu, interaksi antara citric acid dan quality tidak signifikan (p > 0.05), sehingga model tanpa interaksi digunakan . Hal ini menunjukkan bahwa pengaruh citric acid terhadap sulphates konsisten pada setiap kategori kualitas wine.

UJI UTAMA: MANOVA

model_manova <- manova(cbind(pH, sulphates) ~ quality, data = df_clean)

summary(model_manova, test = "Pillai")
##             Df  Pillai approx F num Df den Df    Pr(>F)    
## quality      2 0.11218    31.82      4   2142 < 2.2e-16 ***
## Residuals 1071                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(model_manova)
##  Response pH :
##               Df  Sum Sq  Mean Sq F value  Pr(>F)  
## quality        2  0.1589 0.079439  3.6674 0.02586 *
## Residuals   1071 23.1989 0.021661                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response sulphates :
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## quality        2  2.221 1.11050  62.952 < 2.2e-16 ***
## Residuals   1071 18.893 0.01764                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Secara keseluruhan, hasil MANOVA membuktikan bahwa terdapat perbedaan profil kimia wine berdasarkan kategori kualitas (Pillai’s Trace = 0.11218 dengan F = 31.82 dan p-value < 0.001, sehingga H0 ditolak). Perbedaan tersebut terutama tampak kuat pada variabel sulphates ((F = 62.952; p < 0.001)), sementara pH juga memberikan kontribusi yang signifikan (F = 3.6674; p = 0.02586). ## UJI UTAMA: MANCOVA

# Membangun model MANCOVA menggunakan Pillai's Trace
model_mancova <- manova(cbind(pH, sulphates) ~ citric.acid + quality, data = df_clean)
# Menampilkan hasil multivariat
summary(model_mancova, test = "Pillai")
##               Df  Pillai approx F num Df den Df    Pr(>F)    
## citric.acid    1 0.35942  299.899      2   1069 < 2.2e-16 ***
## quality        2 0.08644   24.168      4   2140 < 2.2e-16 ***
## Residuals   1070                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Menampilkan hasil univariat (ANCOVA terpisah jika multivariatnya signifikan)
summary.aov(model_mancova)
##  Response pH :
##               Df  Sum Sq Mean Sq F value    Pr(>F)    
## citric.acid    1  6.4997  6.4997 420.479 < 2.2e-16 ***
## quality        2  0.3182  0.1591  10.292 3.739e-05 ***
## Residuals   1070 16.5399  0.0155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response sulphates :
##               Df  Sum Sq Mean Sq F value    Pr(>F)    
## citric.acid    1  2.1650 2.16502 132.237 < 2.2e-16 ***
## quality        2  1.4307 0.71534  43.692 < 2.2e-16 ***
## Residuals   1070 17.5183 0.01637                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1