NAMA : FIRLY FARAH AULIA

NIM : 23031554047

MATA KULIAH : ANALISIS MULTIVARIAT

DOSEN PENGAMPU : Ike Fitriyaningsih, M.Si

NIDN : 0109049001

dataset <- read.csv("C:\\DATASET\\Mobile Price Train.csv")
head(dataset)

##   battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1           842    0         2.2        0  1      0          7   0.6       188
## 2          1021    1         0.5        1  0      1         53   0.7       136
## 3           563    1         0.5        1  2      1         41   0.9       145
## 4           615    1         2.5        0  0      0         10   0.8       131
## 5          1821    1         1.2        0 13      1         44   0.6       141
## 6          1859    0         0.5        1  3      0         22   0.7       164
##   n_cores pc px_height px_width  ram sc_h sc_w talk_time three_g touch_screen
## 1       2  2        20      756 2549    9    7        19       0            0
## 2       3  6       905     1988 2631   17    3         7       1            1
## 3       5  6      1263     1716 2603   11    2         9       1            1
## 4       6  9      1216     1786 2769   16    8        11       1            0
## 5       2 14      1208     1212 1411    8    2        15       1            1
## 6       1  7      1004     1654 1067   17    1        10       1            0
##   wifi price_range
## 1    1           1
## 2    0           2
## 3    0           2
## 4    0           2
## 5    0           1
## 6    0           1

str(dataset)

## 'data.frame':    2000 obs. of  21 variables:
##  $ battery_power: int  842 1021 563 615 1821 1859 1821 1954 1445 509 ...
##  $ blue         : int  0 1 1 1 1 0 0 0 1 1 ...
##  $ clock_speed  : num  2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
##  $ dual_sim     : int  0 1 1 0 0 1 0 1 0 1 ...
##  $ fc           : int  1 0 2 0 13 3 4 0 0 2 ...
##  $ four_g       : int  0 1 1 0 1 0 1 0 0 1 ...
##  $ int_memory   : int  7 53 41 10 44 22 10 24 53 9 ...
##  $ m_dep        : num  0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
##  $ mobile_wt    : int  188 136 145 131 141 164 139 187 174 93 ...
##  $ n_cores      : int  2 3 5 6 2 1 8 4 7 5 ...
##  $ pc           : int  2 6 6 9 14 7 10 0 14 15 ...
##  $ px_height    : int  20 905 1263 1216 1208 1004 381 512 386 1137 ...
##  $ px_width     : int  756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
##  $ ram          : int  2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
##  $ sc_h         : int  9 17 11 16 8 17 13 16 17 19 ...
##  $ sc_w         : int  7 3 2 8 2 1 8 3 1 10 ...
##  $ talk_time    : int  19 7 9 11 15 10 18 5 20 12 ...
##  $ three_g      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ touch_screen : int  0 1 1 0 1 0 0 1 0 0 ...
##  $ wifi         : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ price_range  : int  1 2 2 2 1 1 3 0 0 0 ...

summary(dataset)

##  battery_power         blue        clock_speed       dual_sim     
##  Min.   : 501.0   Min.   :0.000   Min.   :0.500   Min.   :0.0000  
##  1st Qu.: 851.8   1st Qu.:0.000   1st Qu.:0.700   1st Qu.:0.0000  
##  Median :1226.0   Median :0.000   Median :1.500   Median :1.0000  
##  Mean   :1238.5   Mean   :0.495   Mean   :1.522   Mean   :0.5095  
##  3rd Qu.:1615.2   3rd Qu.:1.000   3rd Qu.:2.200   3rd Qu.:1.0000  
##  Max.   :1998.0   Max.   :1.000   Max.   :3.000   Max.   :1.0000  
##        fc             four_g         int_memory        m_dep       
##  Min.   : 0.000   Min.   :0.0000   Min.   : 2.00   Min.   :0.1000  
##  1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:16.00   1st Qu.:0.2000  
##  Median : 3.000   Median :1.0000   Median :32.00   Median :0.5000  
##  Mean   : 4.309   Mean   :0.5215   Mean   :32.05   Mean   :0.5018  
##  3rd Qu.: 7.000   3rd Qu.:1.0000   3rd Qu.:48.00   3rd Qu.:0.8000  
##  Max.   :19.000   Max.   :1.0000   Max.   :64.00   Max.   :1.0000  
##    mobile_wt        n_cores            pc           px_height     
##  Min.   : 80.0   Min.   :1.000   Min.   : 0.000   Min.   :   0.0  
##  1st Qu.:109.0   1st Qu.:3.000   1st Qu.: 5.000   1st Qu.: 282.8  
##  Median :141.0   Median :4.000   Median :10.000   Median : 564.0  
##  Mean   :140.2   Mean   :4.521   Mean   : 9.916   Mean   : 645.1  
##  3rd Qu.:170.0   3rd Qu.:7.000   3rd Qu.:15.000   3rd Qu.: 947.2  
##  Max.   :200.0   Max.   :8.000   Max.   :20.000   Max.   :1960.0  
##     px_width           ram            sc_h            sc_w       
##  Min.   : 500.0   Min.   : 256   Min.   : 5.00   Min.   : 0.000  
##  1st Qu.: 874.8   1st Qu.:1208   1st Qu.: 9.00   1st Qu.: 2.000  
##  Median :1247.0   Median :2146   Median :12.00   Median : 5.000  
##  Mean   :1251.5   Mean   :2124   Mean   :12.31   Mean   : 5.767  
##  3rd Qu.:1633.0   3rd Qu.:3064   3rd Qu.:16.00   3rd Qu.: 9.000  
##  Max.   :1998.0   Max.   :3998   Max.   :19.00   Max.   :18.000  
##    talk_time        three_g        touch_screen        wifi      
##  Min.   : 2.00   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 6.00   1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :11.00   Median :1.0000   Median :1.000   Median :1.000  
##  Mean   :11.01   Mean   :0.7615   Mean   :0.503   Mean   :0.507  
##  3rd Qu.:16.00   3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :20.00   Max.   :1.0000   Max.   :1.000   Max.   :1.000  
##   price_range  
##  Min.   :0.00  
##  1st Qu.:0.75  
##  Median :1.50  
##  Mean   :1.50  
##  3rd Qu.:2.25  
##  Max.   :3.00

Pre-processing

missing_values <- colSums(is.na(dataset))
cat("Missing values per kolom:\n")

## Missing values per kolom:

print(missing_values)

## battery_power          blue   clock_speed      dual_sim            fc 
##             0             0             0             0             0 
##        four_g    int_memory         m_dep     mobile_wt       n_cores 
##             0             0             0             0             0 
##            pc     px_height      px_width           ram          sc_h 
##             0             0             0             0             0 
##          sc_w     talk_time       three_g  touch_screen          wifi 
##             0             0             0             0             0 
##   price_range 
##             0

duplicate_count <- sum(duplicated(dataset))
cat("Jumlah data duplikat:", duplicate_count, "\n")

## Jumlah data duplikat: 0

dataset <- dataset %>%
  mutate(
    blue          = as.factor(blue),
    dual_sim      = as.factor(dual_sim),
    four_g        = as.factor(four_g),
    three_g       = as.factor(three_g),
    touch_screen  = as.factor(touch_screen),
    wifi          = as.factor(wifi),
    
    price_range = recode(as.character(price_range),
                         "0" = "low cost",
                         "1" = "medium cost",
                         "2" = "high cost",
                         "3" = "very high cost"),
    
    price_range = ordered(price_range,
                          levels = c("low cost", "medium cost", "high cost", "very high cost"))
  )

summary(dataset)

##  battery_power    blue      clock_speed    dual_sim       fc         four_g  
##  Min.   : 501.0   0:1010   Min.   :0.500   0: 981   Min.   : 0.000   0: 957  
##  1st Qu.: 851.8   1: 990   1st Qu.:0.700   1:1019   1st Qu.: 1.000   1:1043  
##  Median :1226.0            Median :1.500            Median : 3.000           
##  Mean   :1238.5            Mean   :1.522            Mean   : 4.309           
##  3rd Qu.:1615.2            3rd Qu.:2.200            3rd Qu.: 7.000           
##  Max.   :1998.0            Max.   :3.000            Max.   :19.000           
##    int_memory        m_dep          mobile_wt        n_cores     
##  Min.   : 2.00   Min.   :0.1000   Min.   : 80.0   Min.   :1.000  
##  1st Qu.:16.00   1st Qu.:0.2000   1st Qu.:109.0   1st Qu.:3.000  
##  Median :32.00   Median :0.5000   Median :141.0   Median :4.000  
##  Mean   :32.05   Mean   :0.5018   Mean   :140.2   Mean   :4.521  
##  3rd Qu.:48.00   3rd Qu.:0.8000   3rd Qu.:170.0   3rd Qu.:7.000  
##  Max.   :64.00   Max.   :1.0000   Max.   :200.0   Max.   :8.000  
##        pc           px_height         px_width           ram      
##  Min.   : 0.000   Min.   :   0.0   Min.   : 500.0   Min.   : 256  
##  1st Qu.: 5.000   1st Qu.: 282.8   1st Qu.: 874.8   1st Qu.:1208  
##  Median :10.000   Median : 564.0   Median :1247.0   Median :2146  
##  Mean   : 9.916   Mean   : 645.1   Mean   :1251.5   Mean   :2124  
##  3rd Qu.:15.000   3rd Qu.: 947.2   3rd Qu.:1633.0   3rd Qu.:3064  
##  Max.   :20.000   Max.   :1960.0   Max.   :1998.0   Max.   :3998  
##       sc_h            sc_w          talk_time     three_g  touch_screen
##  Min.   : 5.00   Min.   : 0.000   Min.   : 2.00   0: 477   0: 994      
##  1st Qu.: 9.00   1st Qu.: 2.000   1st Qu.: 6.00   1:1523   1:1006      
##  Median :12.00   Median : 5.000   Median :11.00                        
##  Mean   :12.31   Mean   : 5.767   Mean   :11.01                        
##  3rd Qu.:16.00   3rd Qu.: 9.000   3rd Qu.:16.00                        
##  Max.   :19.00   Max.   :18.000   Max.   :20.00                        
##  wifi             price_range 
##  0: 986   low cost      :500  
##  1:1014   medium cost   :500  
##           high cost     :500  
##           very high cost:500  
##                               
##

str(dataset)

## 'data.frame':    2000 obs. of  21 variables:
##  $ battery_power: int  842 1021 563 615 1821 1859 1821 1954 1445 509 ...
##  $ blue         : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 2 2 ...
##  $ clock_speed  : num  2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
##  $ dual_sim     : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 2 1 2 ...
##  $ fc           : int  1 0 2 0 13 3 4 0 0 2 ...
##  $ four_g       : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 1 1 2 ...
##  $ int_memory   : int  7 53 41 10 44 22 10 24 53 9 ...
##  $ m_dep        : num  0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
##  $ mobile_wt    : int  188 136 145 131 141 164 139 187 174 93 ...
##  $ n_cores      : int  2 3 5 6 2 1 8 4 7 5 ...
##  $ pc           : int  2 6 6 9 14 7 10 0 14 15 ...
##  $ px_height    : int  20 905 1263 1216 1208 1004 381 512 386 1137 ...
##  $ px_width     : int  756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
##  $ ram          : int  2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
##  $ sc_h         : int  9 17 11 16 8 17 13 16 17 19 ...
##  $ sc_w         : int  7 3 2 8 2 1 8 3 1 10 ...
##  $ talk_time    : int  19 7 9 11 15 10 18 5 20 12 ...
##  $ three_g      : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
##  $ touch_screen : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 2 1 1 ...
##  $ wifi         : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 2 1 1 ...
##  $ price_range  : Ord.factor w/ 4 levels "low cost"<"medium cost"<..: 2 3 3 3 2 2 4 1 1 1 ...

Analisis Statistik Deskriptif

statistik deskriptif untuk variabel numerik

describe(dataset %>% select_if(is.numeric))

##               vars    n    mean      sd median trimmed     mad   min  max
## battery_power    1 2000 1238.52  439.42 1226.0 1236.25  566.35 501.0 1998
## clock_speed      2 2000    1.52    0.82    1.5    1.48    1.19   0.5    3
## fc               3 2000    4.31    4.34    3.0    3.69    4.45   0.0   19
## int_memory       4 2000   32.05   18.15   32.0   31.88   23.72   2.0   64
## m_dep            5 2000    0.50    0.29    0.5    0.50    0.44   0.1    1
## mobile_wt        6 2000  140.25   35.40  141.0  140.22   45.96  80.0  200
## n_cores          7 2000    4.52    2.29    4.0    4.53    2.97   1.0    8
## pc               8 2000    9.92    6.06   10.0    9.89    7.41   0.0   20
## px_height        9 2000  645.11  443.78  564.0  606.95  471.47   0.0 1960
## px_width        10 2000 1251.52  432.20 1247.0 1250.53  557.46 500.0 1998
## ram             11 2000 2124.21 1084.73 2146.5 2122.93 1382.52 256.0 3998
## sc_h            12 2000   12.31    4.21   12.0   12.37    5.93   5.0   19
## sc_w            13 2000    5.77    4.36    5.0    5.40    4.45   0.0   18
## talk_time       14 2000   11.01    5.46   11.0   11.01    7.41   2.0   20
##                range  skew kurtosis    se
## battery_power 1497.0  0.03    -1.23  9.83
## clock_speed      2.5  0.18    -1.32  0.02
## fc              19.0  1.02     0.27  0.10
## int_memory      62.0  0.06    -1.22  0.41
## m_dep            0.9  0.09    -1.28  0.01
## mobile_wt      120.0  0.01    -1.21  0.79
## n_cores          7.0  0.00    -1.23  0.05
## pc              20.0  0.02    -1.17  0.14
## px_height     1960.0  0.67    -0.32  9.92
## px_width      1498.0  0.01    -1.19  9.66
## ram           3742.0  0.01    -1.19 24.26
## sc_h            14.0 -0.10    -1.19  0.09
## sc_w            18.0  0.63    -0.39  0.10
## talk_time       18.0  0.01    -1.22  0.12

dataset %>%
  select_if(is.numeric) %>%
  gather(key = "Variabel", value = "Nilai") %>%
  ggplot(aes(x = Nilai)) +
  facet_wrap(~ Variabel, scales = "free", ncol = 3) +
  geom_histogram(bins = 10, fill = "steelblue", color = "black") +
  theme_minimal()

numerik_vars <- names(dataset)[sapply(dataset, is.numeric)]

for (var in numerik_vars) {
  p <- ggplot(dataset, aes(x = price_range, y = dataset[[var]], fill = price_range)) +
    geom_boxplot() +
    labs(title = paste("Distribusi", var, "berdasarkan Harga")) +
    theme_minimal() +
    theme(legend.position = "none")
  print(p)
}

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

numeric_data <- dataset %>% select_if(is.numeric)

cor_matrix <- cor(numeric_data, use = "complete.obs")

ggcorr(numeric_data,label = TRUE,label_size = 3, hjust = 0.95, layout.exp = 3, name = "Correlation") +
  ggtitle("Matriks Korelasi Tiap Variabel") +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ggplot(dataset, aes(x = price_range, fill = price_range)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") + 
  labs(
    title = "Distribusi Variabel Price Range",
    x = "Price Range",
    y = "Jumlah Data",
    fill = "Price Range"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right")

dataset %>%
  group_by(price_range) %>%
  summarise(across(where(is.numeric), 
                   list(mean = ~mean(.x, na.rm = TRUE), 
                        sd = ~sd(.x, na.rm = TRUE)),
                   .names = "{.col}_{.fn}"))

## # A tibble: 4 × 29
##   price_range    battery_power_mean battery_power_sd clock_speed_mean
##   <ord>                       <dbl>            <dbl>            <dbl>
## 1 low cost                    1117.             411.             1.55
## 2 medium cost                 1229.             439.             1.49
## 3 high cost                   1228.             453.             1.53
## 4 very high cost              1380.             415.             1.52
## # ℹ 25 more variables: clock_speed_sd <dbl>, fc_mean <dbl>, fc_sd <dbl>,
## #   int_memory_mean <dbl>, int_memory_sd <dbl>, m_dep_mean <dbl>,
## #   m_dep_sd <dbl>, mobile_wt_mean <dbl>, mobile_wt_sd <dbl>,
## #   n_cores_mean <dbl>, n_cores_sd <dbl>, pc_mean <dbl>, pc_sd <dbl>,
## #   px_height_mean <dbl>, px_height_sd <dbl>, px_width_mean <dbl>,
## #   px_width_sd <dbl>, ram_mean <dbl>, ram_sd <dbl>, sc_h_mean <dbl>,
## #   sc_h_sd <dbl>, sc_w_mean <dbl>, sc_w_sd <dbl>, talk_time_mean <dbl>, …

head(dataset)

##   battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1           842    0         2.2        0  1      0          7   0.6       188
## 2          1021    1         0.5        1  0      1         53   0.7       136
## 3           563    1         0.5        1  2      1         41   0.9       145
## 4           615    1         2.5        0  0      0         10   0.8       131
## 5          1821    1         1.2        0 13      1         44   0.6       141
## 6          1859    0         0.5        1  3      0         22   0.7       164
##   n_cores pc px_height px_width  ram sc_h sc_w talk_time three_g touch_screen
## 1       2  2        20      756 2549    9    7        19       0            0
## 2       3  6       905     1988 2631   17    3         7       1            1
## 3       5  6      1263     1716 2603   11    2         9       1            1
## 4       6  9      1216     1786 2769   16    8        11       1            0
## 5       2 14      1208     1212 1411    8    2        15       1            1
## 6       1  7      1004     1654 1067   17    1        10       1            0
##   wifi price_range
## 1    1 medium cost
## 2    0   high cost
## 3    0   high cost
## 4    0   high cost
## 5    0 medium cost
## 6    0 medium cost

memastikan price_range adalah factor ordinal

dataset$price_range <- factor(dataset$price_range, ordered = TRUE)

mengambil variabel numerik saja untuk PCA

num_vars <- dataset %>% select_if(is.numeric)

standarisasi

num_scaled <- scale(num_vars)

Train-Test Split

set.seed(123) 
train_index <- createDataPartition(dataset$price_range, p = 0.8, list = FALSE)
train_data <- dataset[train_index, ]
test_data  <- dataset[-train_index, ]

PCA

train_num <- train_data %>% select_if(is.numeric)
train_scaled <- scale(train_num)

pca_model <- prcomp(train_scaled, center = TRUE, scale. = TRUE)

summary(pca_model)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.2948 1.2465 1.2084 1.03104 1.01890 1.01806 1.00530
## Proportion of Variance 0.1197 0.1110 0.1043 0.07593 0.07415 0.07403 0.07219
## Cumulative Proportion  0.1197 0.2307 0.3350 0.41097 0.48512 0.55916 0.63134
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.99505 0.98416 0.98057 0.94678 0.70256 0.70026 0.60054
## Proportion of Variance 0.07072 0.06918 0.06868 0.06403 0.03526 0.03503 0.02576
## Cumulative Proportion  0.70207 0.77125 0.83993 0.90396 0.93921 0.97424 1.00000

mengambil 10 komponen karena sudah menjelaskan 84% variasi data

train_pca <- as.data.frame(pca_model$x[, 1:10])
train_pca$price_range <- train_data$price_range

# OLR

model_olr <- polr(price_range ~ ., data = train_pca, Hess = TRUE)
summary(model_olr)

## Call:
## polr(formula = price_range ~ ., data = train_pca, Hess = TRUE)
## 
## Coefficients:
##        Value Std. Error t value
## PC1   0.1007    0.06504   1.549
## PC2   1.3134    0.09058  14.500
## PC3   1.1329    0.08798  12.876
## PC4   3.4402    0.17467  19.696
## PC5   3.7709    0.18652  20.217
## PC6   2.8552    0.15328  18.627
## PC7  -0.8928    0.09266  -9.635
## PC8  -6.6378    0.30926 -21.463
## PC9   2.9044    0.15419  18.837
## PC10  2.1759    0.12908  16.857
## 
## Intercepts:
##                          Value    Std. Error t value 
## low cost|medium cost      -8.0296   0.3939   -20.3847
## medium cost|high cost      0.0822   0.1375     0.5982
## high cost|very high cost   7.8400   0.3850    20.3640
## 
## Residual Deviance: 952.1686 
## AIC: 978.1686

Asumsi Proportional Odds

brant(model_olr)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      33.89   20  0.03
## PC1      1.18    2   0.55
## PC2      3.85    2   0.15
## PC3      2.28    2   0.32
## PC4      2.58    2   0.27
## PC5      2.97    2   0.23
## PC6      1.14    2   0.56
## PC7      7.3 2   0.03
## PC8      0.43    2   0.81
## PC9      0.19    2   0.91
## PC10     2.15    2   0.34
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

Transform test set menggunakan PCA yang sama

test_num <- test_data %>% select_if(is.numeric)

# standardisasi 
test_scaled <- scale(test_num, 
                     center = attr(train_scaled, "scaled:center"), 
                     scale  = attr(train_scaled, "scaled:scale"))

test_pca_data <- as.data.frame(predict(pca_model, newdata = test_scaled)[, 1:10])

label_mapping <- c("low cost", "medium cost", "high cost", "very high cost")

test_pca_data$price_range <- factor(test_data$price_range, 
                                    levels = label_mapping, 
                                    ordered = TRUE)

pred <- predict(model_olr, newdata = test_pca_data)

label_mapping <- c("low cost", "medium cost", "high cost", "very high cost")

pred_label <- factor(pred, levels = label_mapping, ordered = TRUE)

Evaluasi Model

conf <- confusionMatrix(pred_label, test_pca_data$price_range)
print(conf)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       low cost medium cost high cost very high cost
##   low cost             90           7         0              0
##   medium cost          10          74        16              0
##   high cost             0          19        71              9
##   very high cost        0           0        13             91
## 
## Overall Statistics
##                                           
##                Accuracy : 0.815           
##                  95% CI : (0.7734, 0.8519)
##     No Information Rate : 0.25            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7533          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: low cost Class: medium cost Class: high cost
## Sensitivity                   0.9000             0.7400           0.7100
## Specificity                   0.9767             0.9133           0.9067
## Pos Pred Value                0.9278             0.7400           0.7172
## Neg Pred Value                0.9670             0.9133           0.9037
## Prevalence                    0.2500             0.2500           0.2500
## Detection Rate                0.2250             0.1850           0.1775
## Detection Prevalence          0.2425             0.2500           0.2475
## Balanced Accuracy             0.9383             0.8267           0.8083
##                      Class: very high cost
## Sensitivity                         0.9100
## Specificity                         0.9567
## Pos Pred Value                      0.8750
## Neg Pred Value                      0.9696
## Prevalence                          0.2500
## Detection Rate                      0.2275
## Detection Prevalence                0.2600
## Balanced Accuracy                   0.9333

Cohen’s Kappa

kappa_val <- kappa2(data.frame(pred_label, test_pca_data$price_range), "unweighted")
print(kappa_val)

##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 400 
##    Raters = 2 
##     Kappa = 0.753 
## 
##         z = 26.1 
##   p-value = 0

MAE

mae_cat <- mean(abs(as.numeric(pred_label) - as.numeric(test_pca_data$price_range)))
cat("MAE Kategori:", round(mae_cat, 3), "\n")

## MAE Kategori: 0.185

pred_train <- predict(model_olr, newdata = train_pca)
conf_train <- confusionMatrix(pred_train, train_pca$price_range)
cat("Akurasi Data Train:", round(conf_train$overall["Accuracy"], 4), "\n")

## Akurasi Data Train: 0.8625

conf_test <- confusionMatrix(pred, test_pca_data$price_range)
cat("Akurasi Data Test :", round(conf_test$overall["Accuracy"], 4), "\n")

## Akurasi Data Test : 0.815

Script Metode Ordinal Logistic Regression (OLR) dan Principal Component Analysis (PCA)

Firly Farah Aulia

2025-06-07

Pre-processing

Analisis Statistik Deskriptif

Train-Test Split

PCA

Asumsi Proportional Odds

Evaluasi Model

Cohen’s Kappa

MAE