1 Load Dataset
2 Pre-processing
3 ANALISIS STATISTIK DESKRIPTIF
4 Pengelompokan Harga
5 Train-Test Split
6 Uji Multikolinearitas (OLR)
7 Asumsi Proportional Odds
8 Evaluasi Model
9 Cohen’s Kappa

1 Load Dataset

# Read the mobile-price training CSV into a data frame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the script will not
# run elsewhere unless the file is placed at exactly this location.
dataset <- read.csv("C:/SEMESTER 4/ANALISIS MULTIVARIAT/Mobile Price Train.csv")
head(dataset)

##   battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1           842    0         2.2        0  1      0          7   0.6       188
## 2          1021    1         0.5        1  0      1         53   0.7       136
## 3           563    1         0.5        1  2      1         41   0.9       145
## 4           615    1         2.5        0  0      0         10   0.8       131
## 5          1821    1         1.2        0 13      1         44   0.6       141
## 6          1859    0         0.5        1  3      0         22   0.7       164
##   n_cores pc px_height px_width  ram sc_h sc_w talk_time three_g touch_screen
## 1       2  2        20      756 2549    9    7        19       0            0
## 2       3  6       905     1988 2631   17    3         7       1            1
## 3       5  6      1263     1716 2603   11    2         9       1            1
## 4       6  9      1216     1786 2769   16    8        11       1            0
## 5       2 14      1208     1212 1411    8    2        15       1            1
## 6       1  7      1004     1654 1067   17    1        10       1            0
##   wifi price_range
## 1    1           1
## 2    0           2
## 3    0           2
## 4    0           2
## 5    0           1
## 6    0           1

# Inspect column types and dimensions of the raw data before any conversion.
str(dataset)

## 'data.frame':    2000 obs. of  21 variables:
##  $ battery_power: int  842 1021 563 615 1821 1859 1821 1954 1445 509 ...
##  $ blue         : int  0 1 1 1 1 0 0 0 1 1 ...
##  $ clock_speed  : num  2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
##  $ dual_sim     : int  0 1 1 0 0 1 0 1 0 1 ...
##  $ fc           : int  1 0 2 0 13 3 4 0 0 2 ...
##  $ four_g       : int  0 1 1 0 1 0 1 0 0 1 ...
##  $ int_memory   : int  7 53 41 10 44 22 10 24 53 9 ...
##  $ m_dep        : num  0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
##  $ mobile_wt    : int  188 136 145 131 141 164 139 187 174 93 ...
##  $ n_cores      : int  2 3 5 6 2 1 8 4 7 5 ...
##  $ pc           : int  2 6 6 9 14 7 10 0 14 15 ...
##  $ px_height    : int  20 905 1263 1216 1208 1004 381 512 386 1137 ...
##  $ px_width     : int  756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
##  $ ram          : int  2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
##  $ sc_h         : int  9 17 11 16 8 17 13 16 17 19 ...
##  $ sc_w         : int  7 3 2 8 2 1 8 3 1 10 ...
##  $ talk_time    : int  19 7 9 11 15 10 18 5 20 12 ...
##  $ three_g      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ touch_screen : int  0 1 1 0 1 0 0 1 0 0 ...
##  $ wifi         : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ price_range  : int  1 2 2 2 1 1 3 0 0 0 ...

2 Pre-processing

# Count NA entries per column (is.na() gives a logical matrix, colSums() totals it).
missing_values <- colSums(is.na(dataset))
cat("Missing values per kolom:\n")

## Missing values per kolom:

# Display the per-column NA counts.
print(missing_values)

## battery_power          blue   clock_speed      dual_sim            fc 
##             0             0             0             0             0 
##        four_g    int_memory         m_dep     mobile_wt       n_cores 
##             0             0             0             0             0 
##            pc     px_height      px_width           ram          sc_h 
##             0             0             0             0             0 
##          sc_w     talk_time       three_g  touch_screen          wifi 
##             0             0             0             0             0 
##   price_range 
##             0

Cek Data Duplikat

# Count rows that are exact repeats of an earlier row (duplicated() flags the
# second and later occurrences only).
duplicate_count <- sum(duplicated(dataset))
cat("Jumlah data duplikat:", duplicate_count, "\n")

## Jumlah data duplikat: 0

# Convert the six 0/1 feature flags to factors and map the integer-coded target
# (0..3) onto an ordered factor with descriptive labels, then summarise.
dataset <- dataset %>%
  mutate(
    across(
      c(blue, dual_sim, four_g, three_g, touch_screen, wifi),
      as.factor
    ),
    # levels/labels pair up positionally: 0 -> "low cost", ..., 3 -> "very high cost".
    price_range = factor(
      price_range,
      levels = 0:3,
      labels = c("low cost", "medium cost", "high cost", "very high cost"),
      ordered = TRUE
    )
  )

summary(dataset)

##  battery_power    blue      clock_speed    dual_sim       fc         four_g  
##  Min.   : 501.0   0:1010   Min.   :0.500   0: 981   Min.   : 0.000   0: 957  
##  1st Qu.: 851.8   1: 990   1st Qu.:0.700   1:1019   1st Qu.: 1.000   1:1043  
##  Median :1226.0            Median :1.500            Median : 3.000           
##  Mean   :1238.5            Mean   :1.522            Mean   : 4.309           
##  3rd Qu.:1615.2            3rd Qu.:2.200            3rd Qu.: 7.000           
##  Max.   :1998.0            Max.   :3.000            Max.   :19.000           
##    int_memory        m_dep          mobile_wt        n_cores     
##  Min.   : 2.00   Min.   :0.1000   Min.   : 80.0   Min.   :1.000  
##  1st Qu.:16.00   1st Qu.:0.2000   1st Qu.:109.0   1st Qu.:3.000  
##  Median :32.00   Median :0.5000   Median :141.0   Median :4.000  
##  Mean   :32.05   Mean   :0.5018   Mean   :140.2   Mean   :4.521  
##  3rd Qu.:48.00   3rd Qu.:0.8000   3rd Qu.:170.0   3rd Qu.:7.000  
##  Max.   :64.00   Max.   :1.0000   Max.   :200.0   Max.   :8.000  
##        pc           px_height         px_width           ram      
##  Min.   : 0.000   Min.   :   0.0   Min.   : 500.0   Min.   : 256  
##  1st Qu.: 5.000   1st Qu.: 282.8   1st Qu.: 874.8   1st Qu.:1208  
##  Median :10.000   Median : 564.0   Median :1247.0   Median :2146  
##  Mean   : 9.916   Mean   : 645.1   Mean   :1251.5   Mean   :2124  
##  3rd Qu.:15.000   3rd Qu.: 947.2   3rd Qu.:1633.0   3rd Qu.:3064  
##  Max.   :20.000   Max.   :1960.0   Max.   :1998.0   Max.   :3998  
##       sc_h            sc_w          talk_time     three_g  touch_screen
##  Min.   : 5.00   Min.   : 0.000   Min.   : 2.00   0: 477   0: 994      
##  1st Qu.: 9.00   1st Qu.: 2.000   1st Qu.: 6.00   1:1523   1:1006      
##  Median :12.00   Median : 5.000   Median :11.00                        
##  Mean   :12.31   Mean   : 5.767   Mean   :11.01                        
##  3rd Qu.:16.00   3rd Qu.: 9.000   3rd Qu.:16.00                        
##  Max.   :19.00   Max.   :18.000   Max.   :20.00                        
##  wifi             price_range 
##  0: 986   low cost      :500  
##  1:1014   medium cost   :500  
##           high cost     :500  
##           very high cost:500  
##                               
##

# Confirm the flag columns are now factors and price_range is an ordered factor.
str(dataset)

## 'data.frame':    2000 obs. of  21 variables:
##  $ battery_power: int  842 1021 563 615 1821 1859 1821 1954 1445 509 ...
##  $ blue         : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 2 2 ...
##  $ clock_speed  : num  2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
##  $ dual_sim     : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 1 2 1 2 ...
##  $ fc           : int  1 0 2 0 13 3 4 0 0 2 ...
##  $ four_g       : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 1 1 2 ...
##  $ int_memory   : int  7 53 41 10 44 22 10 24 53 9 ...
##  $ m_dep        : num  0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
##  $ mobile_wt    : int  188 136 145 131 141 164 139 187 174 93 ...
##  $ n_cores      : int  2 3 5 6 2 1 8 4 7 5 ...
##  $ pc           : int  2 6 6 9 14 7 10 0 14 15 ...
##  $ px_height    : int  20 905 1263 1216 1208 1004 381 512 386 1137 ...
##  $ px_width     : int  756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
##  $ ram          : int  2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
##  $ sc_h         : int  9 17 11 16 8 17 13 16 17 19 ...
##  $ sc_w         : int  7 3 2 8 2 1 8 3 1 10 ...
##  $ talk_time    : int  19 7 9 11 15 10 18 5 20 12 ...
##  $ three_g      : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
##  $ touch_screen : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 2 1 1 ...
##  $ wifi         : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 2 1 1 ...
##  $ price_range  : Ord.factor w/ 4 levels "low cost"<"medium cost"<..: 2 3 3 3 2 2 4 1 1 1 ...

3 ANALISIS STATISTIK DESKRIPTIF

statistik deskriptif untuk variabel numerik

# Descriptive statistics for the numeric columns only.
dataset %>%
  select_if(is.numeric) %>%
  describe()

##               vars    n    mean      sd median trimmed     mad   min  max
## battery_power    1 2000 1238.52  439.42 1226.0 1236.25  566.35 501.0 1998
## clock_speed      2 2000    1.52    0.82    1.5    1.48    1.19   0.5    3
## fc               3 2000    4.31    4.34    3.0    3.69    4.45   0.0   19
## int_memory       4 2000   32.05   18.15   32.0   31.88   23.72   2.0   64
## m_dep            5 2000    0.50    0.29    0.5    0.50    0.44   0.1    1
## mobile_wt        6 2000  140.25   35.40  141.0  140.22   45.96  80.0  200
## n_cores          7 2000    4.52    2.29    4.0    4.53    2.97   1.0    8
## pc               8 2000    9.92    6.06   10.0    9.89    7.41   0.0   20
## px_height        9 2000  645.11  443.78  564.0  606.95  471.47   0.0 1960
## px_width        10 2000 1251.52  432.20 1247.0 1250.53  557.46 500.0 1998
## ram             11 2000 2124.21 1084.73 2146.5 2122.93 1382.52 256.0 3998
## sc_h            12 2000   12.31    4.21   12.0   12.37    5.93   5.0   19
## sc_w            13 2000    5.77    4.36    5.0    5.40    4.45   0.0   18
## talk_time       14 2000   11.01    5.46   11.0   11.01    7.41   2.0   20
##                range  skew kurtosis    se
## battery_power 1497.0  0.03    -1.23  9.83
## clock_speed      2.5  0.18    -1.32  0.02
## fc              19.0  1.02     0.27  0.10
## int_memory      62.0  0.06    -1.22  0.41
## m_dep            0.9  0.09    -1.28  0.01
## mobile_wt      120.0  0.01    -1.21  0.79
## n_cores          7.0  0.00    -1.23  0.05
## pc              20.0  0.02    -1.17  0.14
## px_height     1960.0  0.67    -0.32  9.92
## px_width      1498.0  0.01    -1.19  9.66
## ram           3742.0  0.01    -1.19 24.26
## sc_h            14.0 -0.10    -1.19  0.09
## sc_w            18.0  0.63    -0.39  0.10
## talk_time       18.0  0.01    -1.22  0.12

# Histograms of every numeric feature, one facet per variable.
# pivot_longer() replaces the superseded gather(), and select(where()) replaces
# the superseded select_if(). select() is namespace-qualified because MASS,
# attached later in this file, also exports a select() that would mask dplyr's.
dataset %>%
  dplyr::select(where(is.numeric)) %>%
  tidyr::pivot_longer(
    cols = everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ) %>%
  ggplot(aes(x = Nilai)) +
  facet_wrap(~ Variabel, scales = "free", ncol = 3) +
  geom_histogram(bins = 10, fill = "steelblue", color = "black") +
  theme_minimal()

> Boxplot untuk melihat distribusi fitur numerik berdasarkan kategori harga (price_range)

# Names of all numeric columns; vapply() guarantees a logical vector result.
numerik_vars <- names(dataset)[vapply(dataset, is.numeric, logical(1))]

# One boxplot per numeric feature, split by price category.
for (var in numerik_vars) {
  # .data[[var]] looks the column up by its string name inside aes(); indexing
  # the data frame directly (dataset[[var]]) triggers a ggplot2 warning.
  p <- ggplot(dataset, aes(x = price_range, y = .data[[var]], fill = price_range)) +
    geom_boxplot() +
    # Label the y axis with the variable name instead of the raw aes expression.
    labs(title = paste("Distribusi", var, "berdasarkan Harga"), y = var) +
    theme_minimal() +
    theme(legend.position = "none")
  # Explicit print() is required for plots created inside a loop.
  print(p)
}

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

## Warning: Use of `dataset[[var]]` is discouraged.
## ℹ Use `.data[[var]]` instead.

Korelasi antar variabel numerik

# Correlations among the numeric features, drawn as a labelled heatmap.
numeric_data <- select_if(dataset, is.numeric)

# NOTE(review): cor_matrix is not referenced again in the visible part of this
# file; ggcorr() below is given the raw data, not this matrix.
cor_matrix <- cor(numeric_data, use = "complete.obs")

ggcorr(
  numeric_data,
  label = TRUE,
  label_size = 3,
  hjust = 0.95,
  layout.exp = 3,
  name = "Correlation"
) +
  ggtitle("Matriks Korelasi Tiap Variabel") +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

Statistik Deskriptif berdasarkan Kategori Harga

# Bar chart of observation counts per price category.
dataset %>%
  ggplot(aes(x = price_range, fill = price_range)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right") +
  labs(
    title = "Distribusi Variabel Price Range",
    x = "Price Range",
    y = "Jumlah Data",
    fill = "Price Range"
  )

4 Pengelompokan Harga

# Mean and standard deviation of every numeric feature within each price category.
# Output columns are named <column>_mean and <column>_sd.
dataset %>%
  group_by(price_range) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )

## # A tibble: 4 × 29
##   price_range    battery_power_mean battery_power_sd clock_speed_mean
##   <ord>                       <dbl>            <dbl>            <dbl>
## 1 low cost                    1117.             411.             1.55
## 2 medium cost                 1229.             439.             1.49
## 3 high cost                   1228.             453.             1.53
## 4 very high cost              1380.             415.             1.52
## # ℹ 25 more variables: clock_speed_sd <dbl>, fc_mean <dbl>, fc_sd <dbl>,
## #   int_memory_mean <dbl>, int_memory_sd <dbl>, m_dep_mean <dbl>,
## #   m_dep_sd <dbl>, mobile_wt_mean <dbl>, mobile_wt_sd <dbl>,
## #   n_cores_mean <dbl>, n_cores_sd <dbl>, pc_mean <dbl>, pc_sd <dbl>,
## #   px_height_mean <dbl>, px_height_sd <dbl>, px_width_mean <dbl>,
## #   px_width_sd <dbl>, ram_mean <dbl>, ram_sd <dbl>, sc_h_mean <dbl>,
## #   sc_h_sd <dbl>, sc_w_mean <dbl>, sc_w_sd <dbl>, talk_time_mean <dbl>, …

# Preview the data after recoding; price_range now shows text labels.
head(dataset)

##   battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt
## 1           842    0         2.2        0  1      0          7   0.6       188
## 2          1021    1         0.5        1  0      1         53   0.7       136
## 3           563    1         0.5        1  2      1         41   0.9       145
## 4           615    1         2.5        0  0      0         10   0.8       131
## 5          1821    1         1.2        0 13      1         44   0.6       141
## 6          1859    0         0.5        1  3      0         22   0.7       164
##   n_cores pc px_height px_width  ram sc_h sc_w talk_time three_g touch_screen
## 1       2  2        20      756 2549    9    7        19       0            0
## 2       3  6       905     1988 2631   17    3         7       1            1
## 3       5  6      1263     1716 2603   11    2         9       1            1
## 4       6  9      1216     1786 2769   16    8        11       1            0
## 5       2 14      1208     1212 1411    8    2        15       1            1
## 6       1  7      1004     1654 1067   17    1        10       1            0
##   wifi price_range
## 1    1 medium cost
## 2    0   high cost
## 3    0   high cost
## 4    0   high cost
## 5    0 medium cost
## 6    0 medium cost

memastikan price_range adalah factor ordinal

# Re-assert that the target is an ordered factor.
# NOTE(review): price_range was already made an ordered factor with explicit
# levels during pre-processing, so this call appears to be a redundant safeguard.
dataset$price_range <- factor(dataset$price_range, ordered = TRUE)

standarisasi semua kolom numerik

# Standardise every numeric column with scale() (default: centre and scale each column).
# NOTE(review): num_scaled is not used anywhere else in the visible part of this
# file; the train/test split and the models below work on the unscaled `dataset`.
# Confirm whether the scaled values were meant to be written back.
num_vars <- dataset %>% select_if(is.numeric)
num_scaled <- scale(num_vars)

5 Train-Test Split

# Reproducible 70/30 random split: draw 70% of the row indices for training and
# use the complement for testing.
set.seed(123)
data_split <- sample(seq_len(nrow(dataset)), size = 0.7 * nrow(dataset))
train_data <- dataset[data_split, ]
test_data <- dataset[-data_split, ]

Memastikan price_range sebagai ordered factor

# Rebuild the target as an ordered factor in each subset. factor() keeps the
# existing level order and drops any level that does not occur in the subset.
train_data$price_range <- factor(train_data$price_range, ordered = TRUE)
test_data$price_range  <- factor(test_data$price_range, ordered = TRUE)

6 Uji Multikolinearitas (OLR)

# Fit a proportional-odds (ordinal logistic) model on a hand-picked feature set.
# The formula previously listed battery_power twice; R collapses duplicate terms
# (the coefficient table shows it once), so removing the repeat leaves the
# fitted model unchanged and only cleans up the call.
# Hess = TRUE keeps the Hessian so summary() can report standard errors.
model_step1 <- polr(
  price_range ~ battery_power + ram + mobile_wt + sc_h + talk_time,
  data = train_data,
  Hess = TRUE
)
summary(model_step1)

## Call:
## polr(formula = price_range ~ battery_power + ram + mobile_wt + 
##     battery_power + sc_h + talk_time, data = train_data, Hess = TRUE)
## 
## Coefficients:
##                   Value Std. Error t value
## battery_power  0.004176  0.0001629  25.635
## ram            0.006658  0.0001610  41.356
## mobile_wt     -0.006374  0.0019216  -3.317
## sc_h           0.035049  0.0170245   2.059
## talk_time      0.011993  0.0139451   0.860
## 
## Intercepts:
##                          Value     Std. Error t value  
## low cost|medium cost       12.7786    0.0015  8685.2012
## medium cost|high cost      18.8281    0.2479    75.9548
## high cost|very high cost   25.3718    0.3727    68.0847
## 
## Residual Deviance: 1073.895 
## AIC: 1089.895

# polr() and stepAIC() come from MASS, so attach it before the first call in
# this chunk rather than after the models are already fitted.
# NOTE(review): MASS also exports select(), which masks dplyr::select() when
# MASS is attached after dplyr.
library(MASS)

# Intercept-only and full models bound the stepwise search.
null_model <- polr(price_range ~ 1, data = train_data, Hess = TRUE)
full_model <- polr(
  price_range ~ battery_power + ram + mobile_wt + sc_h + talk_time,
  data = train_data,
  Hess = TRUE
)

# Bidirectional AIC-based selection, starting from the null model.
step_model <- stepAIC(
  null_model,
  scope = list(lower = null_model, upper = full_model),
  direction = "both"
)

## Start:  AIC=3886.68
## price_range ~ 1
## 
##                 Df    AIC
## + ram            1 1529.6
## + battery_power  1 3841.7
## + talk_time      1 3886.4
## <none>             3886.7
## + sc_h           1 3887.8
## + mobile_wt      1 3888.7
## 
## Step:  AIC=1529.62
## price_range ~ ram
## 
##                 Df    AIC
## + battery_power  1 1096.5
## + mobile_wt      1 1527.2
## + sc_h           1 1528.6
## + talk_time      1 1529.0
## <none>             1529.6
## - ram            1 3886.7
## 
## Step:  AIC=1096.51
## price_range ~ ram + battery_power
## 
##                 Df    AIC
## + mobile_wt      1 1090.2
## + sc_h           1 1094.9
## <none>             1096.5
## + talk_time      1 1097.9
## - battery_power  1 1529.6
## - ram            1 3841.7
## 
## Step:  AIC=1090.16
## price_range ~ ram + battery_power + mobile_wt
## 
##                 Df    AIC
## + sc_h           1 1088.6
## <none>             1090.2
## + talk_time      1 1091.5
## - mobile_wt      1 1096.5
## - battery_power  1 1527.2
## - ram            1 3843.7
## 
## Step:  AIC=1088.59
## price_range ~ ram + battery_power + mobile_wt + sc_h
## 
##                 Df    AIC
## <none>             1088.6
## + talk_time      1 1089.9
## - sc_h           1 1090.2
## - mobile_wt      1 1094.9
## - battery_power  1 1526.2
## - ram            1 3844.8

# Coefficients, cut-points and AIC of the model chosen by the stepwise search.
summary(step_model)

## Call:
## polr(formula = price_range ~ ram + battery_power + mobile_wt + 
##     sc_h, data = train_data, Hess = TRUE)
## 
## Coefficients:
##                   Value Std. Error t value
## ram            0.006665  0.0001579  42.225
## battery_power  0.004181  0.0001589  26.320
## mobile_wt     -0.006350  0.0018925  -3.356
## sc_h           0.034799  0.0169650   2.051
## 
## Intercepts:
##                          Value     Std. Error t value  
## low cost|medium cost       12.6650    0.0016  8137.3144
## medium cost|high cost      18.7188    0.2470    75.7998
## high cost|very high cost   25.2705    0.3683    68.6083
## 
## Residual Deviance: 1074.592 
## AIC: 1088.592

7 Asumsi Proportional Odds

# Brant test of the parallel-regression (proportional-odds) assumption for the
# selected model; H0 is that the assumption holds.
# In the rendered output the omnibus p-value (~0.027) and the mobile_wt p-value
# (~0.029) are below 0.05, so H0 is rejected at the 5% level for this fit.
brant_result <- brant(step_model)

## -------------------------------------------- 
## Test for X2  df  probability 
## -------------------------------------------- 
## Omnibus      17.35   8   0.03
## ram      2.27    2   0.32
## battery_power    3.36    2   0.19
## mobile_wt    7.06    2   0.03
## sc_h     4.57    2   0.1
## -------------------------------------------- 
## 
## H0: Parallel Regression Assumption holds

# Show the Brant statistics at full precision.
print(brant_result)

##                      X2 df probability
## Omnibus       17.346104  8  0.02670026
## ram            2.270623  2  0.32132196
## battery_power  3.362508  2  0.18614039
## mobile_wt      7.056251  2  0.02935990
## sc_h           4.570134  2  0.10176723

# Predict the most likely price class for each hold-out row.
pred <- predict(step_model, newdata = test_data, type = "class")

# Put observed and predicted labels on the same explicit, ordered level set so
# downstream comparisons line up category by category.
actual <- ordered(
  test_data$price_range,
  levels = c("low cost", "medium cost", "high cost", "very high cost")
)

pred_label <- ordered(
  pred,
  levels = c("low cost", "medium cost", "high cost", "very high cost")
)

8 Evaluasi Model

# Confusion matrix and per-class statistics: predictions in rows, observed
# classes in columns.
conf_matrix <- confusionMatrix(data = pred_label, reference = actual)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       low cost medium cost high cost very high cost
##   low cost            145          18         0              0
##   medium cost          10         120        13              0
##   high cost             0          21       110             30
##   very high cost        0           0        12            121
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8267         
##                  95% CI : (0.794, 0.8561)
##     No Information Rate : 0.265          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.769          
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: low cost Class: medium cost Class: high cost
## Sensitivity                   0.9355             0.7547           0.8148
## Specificity                   0.9596             0.9478           0.8903
## Pos Pred Value                0.8896             0.8392           0.6832
## Neg Pred Value                0.9771             0.9147           0.9431
## Prevalence                    0.2583             0.2650           0.2250
## Detection Rate                0.2417             0.2000           0.1833
## Detection Prevalence          0.2717             0.2383           0.2683
## Balanced Accuracy             0.9475             0.8513           0.8526
##                      Class: very high cost
## Sensitivity                         0.8013
## Specificity                         0.9733
## Pos Pred Value                      0.9098
## Neg Pred Value                      0.9358
## Prevalence                          0.2517
## Detection Rate                      0.2017
## Detection Prevalence                0.2217
## Balanced Accuracy                   0.8873

9 Cohen’s Kappa

# Treat the model and the ground truth as two "raters" and measure their
# chance-corrected agreement with unweighted Cohen's kappa.
# NOTE(review): unweighted kappa counts every disagreement equally; for an
# ordinal target a weighted kappa may be more informative — confirm intent.
ratings <- data.frame(rater1 = pred_label, rater2 = actual)
kappa_val <- kappa2(ratings, weight = "unweighted")
print(kappa_val)

##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 600 
##    Raters = 2 
##     Kappa = 0.769 
## 
##         z = 32.7 
##   p-value = 0

MAE (Mean Absolute Error)

# Mean absolute error on the ordinal scale: as.numeric() maps each factor level
# to its position code, so an error of 1 means "off by one price tier".
pred_num <- as.numeric(pred_label)
actual_num <- as.numeric(actual)
mae <- mean(abs(pred_num - actual_num))
cat("Mean Absolute Error (MAE):", mae, "\n")

## Mean Absolute Error (MAE): 0.1733333

Akurasi train

# In-sample accuracy: share of training rows whose predicted class matches the
# observed class (diagonal of the observed-by-predicted table).
train_pred <- predict(step_model, newdata = train_data, type = "class")
conf_matrix_train <- table(train_data$price_range, train_pred)
accuracy_train <- sum(diag(conf_matrix_train)) / sum(conf_matrix_train)
print(paste("Akurasi data train:", round(accuracy_train, 4)))

## [1] "Akurasi data train: 0.8229"

Akurasi test

# Hold-out accuracy, computed the same way as for the training data: diagonal
# of the observed-by-predicted table divided by the total count.
test_pred <- predict(step_model, newdata = test_data, type = "class")
conf_matrix_test <- table(test_data$price_range, test_pred)
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
print(paste("Akurasi data test:", round(accuracy_test, 4)))

## [1] "Akurasi data test: 0.8267"

Klasifikasi Tingkat Harga Smartphone Berdasarkan Spesifikasi: Ordinal Logistik Regression

Citra Ardhia Intan Nurhapsari

2025-05-26

1 Load Dataset

2 Pre-processing

3 ANALISIS STATISTIK DESKRIPTIF

4 Pengelompokan Harga

5 Train-Test Split

6 Uji Multikolinearitas (OLR)

7 Asumsi Proportional Odds

8 Evaluasi Model

9 Cohen’s Kappa