Libraries dan Setup
library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(gtools)
library(gmodels)## Warning: package 'gmodels' was built under R version 4.2.2
Saya menggunakan dataset Mobile Price untuk mengklasifikasikan harga suatu ponsel. Dataset ini berisi informasi fitur beberapa tipe ponsel beserta spesifikasinya, seperti bluetooth, layar, RAM, dll.
Berawal dari seseorang bernama Jonny yang ingin memulai perusahaan ponsel sendiri, namun dia tidak tahu bagaimana memperkirakan harga ponsel yang akan diciptakan oleh perusahaannya. Dari ponsel yang saat ini sudah ada di pasaran, dia tidak bisa berasumsi secara subjektif, sehingga dia mengumpulkan data penjualan ponsel dari berbagai perusahaan.
phone_train <- read.csv("data_input/train.csv")
Jumlah Data Train
nrow(phone_train)## [1] 2000
Melihat Tipe Data
glimpse(phone_train)## Rows: 2,000
## Columns: 21
## $ battery_power <int> 842, 1021, 563, 615, 1821, 1859, 1821, 1954, 1445, 509, …
## $ blue <int> 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,…
## $ clock_speed <dbl> 2.2, 0.5, 0.5, 2.5, 1.2, 0.5, 1.7, 0.5, 0.5, 0.6, 2.9, 2…
## $ dual_sim <int> 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,…
## $ fc <int> 1, 0, 2, 0, 13, 3, 4, 0, 0, 2, 0, 5, 2, 7, 13, 3, 1, 7, …
## $ four_g <int> 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,…
## $ int_memory <int> 7, 53, 41, 10, 44, 22, 10, 24, 53, 9, 9, 33, 33, 17, 52,…
## $ m_dep <dbl> 0.6, 0.7, 0.9, 0.8, 0.6, 0.7, 0.8, 0.8, 0.7, 0.1, 0.1, 0…
## $ mobile_wt <int> 188, 136, 145, 131, 141, 164, 139, 187, 174, 93, 182, 17…
## $ n_cores <int> 2, 3, 5, 6, 2, 1, 8, 4, 7, 5, 5, 8, 4, 4, 1, 2, 8, 3, 5,…
## $ pc <int> 2, 6, 6, 9, 14, 7, 10, 0, 14, 15, 1, 18, 17, 11, 17, 16,…
## $ px_height <int> 20, 905, 1263, 1216, 1208, 1004, 381, 512, 386, 1137, 24…
## $ px_width <int> 756, 1988, 1716, 1786, 1212, 1654, 1018, 1149, 836, 1224…
## $ ram <int> 2549, 2631, 2603, 2769, 1411, 1067, 3220, 700, 1099, 513…
## $ sc_h <int> 9, 17, 11, 16, 8, 17, 13, 16, 17, 19, 5, 14, 18, 7, 14, …
## $ sc_w <int> 7, 3, 2, 8, 2, 1, 8, 3, 1, 10, 2, 9, 0, 1, 9, 15, 9, 2, …
## $ talk_time <int> 19, 7, 9, 11, 15, 10, 18, 5, 20, 12, 7, 13, 2, 4, 3, 11,…
## $ three_g <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ touch_screen <int> 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,…
## $ wifi <int> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,…
## $ price_range <int> 1, 2, 2, 2, 1, 1, 3, 0, 0, 0, 3, 3, 1, 2, 0, 0, 3, 3, 1,…
cek apakah ada nilai yang NA..
anyNA(phone_train)## [1] FALSE
Target dari prediksi adalah price_range, maka kita lihat dulu jumlah data pengkategorian di price_range serta value yang ada.
table(phone_train$price_range)##
## 0 1 2 3
## 500 500 500 500
# Collapse the four price tiers into a binary target:
# tiers 0 and 1 become level "0", tiers 2 and 3 become level "1".
phone_train <- mutate(
  phone_train,
  price_range = factor(
    price_range,
    levels = 0:3,
    labels = c("0", "0", "1", "1")
  )
)
str(phone_train$price_range)## Factor w/ 2 levels "0","1": 1 2 2 2 1 1 2 1 1 1 ...
Dari hasil diatas jumlah dari masing-masing value adalah sama, kemudian karena klasifikasi dari case study logistic regression ini adalah biner (binomial) yaitu 0 dan 1, maka perlu ada penyesuaian dari value tersebut dimana value 0,1,2,3 nantinya saya sesuaikan menjadi rendah (0) dan tinggi (1) dengan kriteria 0-1 adalah rendah dan 2-3 adalah tinggi.
Melakukan pemilihan variabel prediktor yang berhubungan
mobilespec <- glm(price_range~., data=phone_train, family = "binomial")## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mobilespec)##
## Call:
## glm(formula = price_range ~ ., family = "binomial", data = phone_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.911 0.000 0.000 0.000 3.116
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.056e+02 7.408e+01 -4.126 3.69e-05 ***
## battery_power 5.515e-02 1.343e-02 4.106 4.02e-05 ***
## blue 1.290e-02 1.358e+00 0.010 0.9924
## clock_speed 4.119e-01 7.533e-01 0.547 0.5845
## dual_sim -1.022e+00 1.206e+00 -0.847 0.3968
## fc -9.439e-02 1.569e-01 -0.602 0.5474
## four_g -1.958e+00 1.442e+00 -1.358 0.1746
## int_memory 8.821e-02 3.583e-02 2.462 0.0138 *
## m_dep -2.789e+00 1.943e+00 -1.435 0.1513
## mobile_wt -8.644e-02 2.146e-02 -4.027 5.64e-05 ***
## n_cores 5.025e-01 2.340e-01 2.148 0.0317 *
## pc 2.282e-01 1.062e-01 2.149 0.0316 *
## px_height 3.003e-02 7.156e-03 4.197 2.71e-05 ***
## px_width 3.342e-02 8.343e-03 4.005 6.20e-05 ***
## ram 8.744e-02 2.123e-02 4.119 3.80e-05 ***
## sc_h -8.939e-02 1.415e-01 -0.632 0.5276
## sc_w 8.493e-02 1.355e-01 0.627 0.5309
## talk_time 2.519e-02 8.980e-02 0.280 0.7791
## three_g 6.688e-01 1.220e+00 0.548 0.5837
## touch_screen -8.454e-01 1.114e+00 -0.759 0.4479
## wifi -3.026e+00 1.351e+00 -2.240 0.0251 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2772.589 on 1999 degrees of freedom
## Residual deviance: 43.521 on 1979 degrees of freedom
## AIC: 85.521
##
## Number of Fisher Scoring iterations: 15
Hasil AIC adalah 85.521 dan prediktor yang signifikan dengan target variabelnya adalah : - battery_power - mobile_wt - px_height - px_width - ram
Namun ada variabel prediktor yang ada hubungan dengan target variabel yaitu - int_memory - n_cores - pc - wifi
Kemudian melihat hasil dengan semua prediktor menggunakan fungsi step
step(mobilespec)## Start: AIC=85.52
## price_range ~ battery_power + blue + clock_speed + dual_sim +
## fc + four_g + int_memory + m_dep + mobile_wt + n_cores +
## pc + px_height + px_width + ram + sc_h + sc_w + talk_time +
## three_g + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - blue 1 43.52 83.52
## - talk_time 1 43.60 83.60
## - three_g 1 43.83 83.83
## - clock_speed 1 43.83 83.83
## - fc 1 43.88 83.88
## - sc_w 1 43.92 83.92
## - sc_h 1 43.93 83.93
## - touch_screen 1 44.10 84.10
## - dual_sim 1 44.25 84.25
## <none> 43.52 85.52
## - four_g 1 45.75 85.75
## - m_dep 1 46.15 86.15
## - n_cores 1 49.17 89.17
## - pc 1 49.46 89.46
## - wifi 1 51.44 91.44
## - int_memory 1 52.60 92.60
## - mobile_wt 1 85.16 125.16
## - px_height 1 252.49 292.49
## - px_width 1 264.03 304.03
## - battery_power 1 578.20 618.20
## - ram 1 2679.94 2719.94
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=83.52
## price_range ~ battery_power + clock_speed + dual_sim + fc + four_g +
## int_memory + m_dep + mobile_wt + n_cores + pc + px_height +
## px_width + ram + sc_h + sc_w + talk_time + three_g + touch_screen +
## wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - talk_time 1 43.60 81.60
## - three_g 1 43.83 81.83
## - clock_speed 1 43.84 81.84
## - fc 1 43.88 81.88
## - sc_w 1 43.93 81.93
## - sc_h 1 44.04 82.04
## - touch_screen 1 44.11 82.11
## - dual_sim 1 44.43 82.43
## <none> 43.52 83.52
## - four_g 1 45.75 83.75
## - m_dep 1 46.26 84.26
## - pc 1 49.46 87.46
## - n_cores 1 49.59 87.59
## - wifi 1 52.04 90.04
## - int_memory 1 52.76 90.76
## - mobile_wt 1 85.21 123.21
## - px_height 1 252.58 290.58
## - px_width 1 264.80 302.80
## - battery_power 1 579.24 617.24
## - ram 1 2680.48 2718.48
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=81.6
## price_range ~ battery_power + clock_speed + dual_sim + fc + four_g +
## int_memory + m_dep + mobile_wt + n_cores + pc + px_height +
## px_width + ram + sc_h + sc_w + three_g + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - three_g 1 43.84 79.84
## - clock_speed 1 43.99 79.99
## - fc 1 44.01 80.01
## - sc_w 1 44.03 80.03
## - touch_screen 1 44.12 80.12
## - sc_h 1 44.28 80.28
## - dual_sim 1 44.62 80.62
## <none> 43.60 81.60
## - four_g 1 46.18 82.18
## - m_dep 1 46.39 82.39
## - pc 1 49.80 85.80
## - n_cores 1 49.97 85.97
## - wifi 1 52.12 88.12
## - int_memory 1 53.56 89.56
## - mobile_wt 1 86.34 122.34
## - px_height 1 252.94 288.94
## - px_width 1 266.00 302.00
## - battery_power 1 581.13 617.13
## - ram 1 2680.48 2716.48
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=79.84
## price_range ~ battery_power + clock_speed + dual_sim + fc + four_g +
## int_memory + m_dep + mobile_wt + n_cores + pc + px_height +
## px_width + ram + sc_h + sc_w + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - clock_speed 1 44.23 78.23
## - touch_screen 1 44.27 78.27
## - sc_w 1 44.32 78.32
## - fc 1 44.33 78.33
## - sc_h 1 44.57 78.57
## - dual_sim 1 45.11 79.11
## <none> 43.84 79.84
## - four_g 1 46.20 80.20
## - m_dep 1 46.56 80.56
## - pc 1 49.89 83.89
## - n_cores 1 50.01 84.01
## - wifi 1 52.15 86.15
## - int_memory 1 53.73 87.73
## - mobile_wt 1 86.39 120.39
## - px_height 1 252.96 286.96
## - px_width 1 267.49 301.49
## - battery_power 1 584.11 618.11
## - ram 1 2682.48 2716.48
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=78.23
## price_range ~ battery_power + dual_sim + fc + four_g + int_memory +
## m_dep + mobile_wt + n_cores + pc + px_height + px_width +
## ram + sc_h + sc_w + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - sc_w 1 44.70 76.70
## - touch_screen 1 44.75 76.75
## - sc_h 1 44.89 76.89
## - fc 1 45.01 77.01
## - dual_sim 1 45.80 77.80
## - four_g 1 46.22 78.22
## <none> 44.23 78.23
## - m_dep 1 46.83 78.83
## - n_cores 1 50.57 82.57
## - wifi 1 52.52 84.52
## - pc 1 52.75 84.75
## - int_memory 1 53.77 85.77
## - mobile_wt 1 87.22 119.22
## - px_height 1 253.95 285.95
## - px_width 1 268.10 300.10
## - battery_power 1 584.48 616.48
## - ram 1 2682.52 2714.52
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=76.7
## price_range ~ battery_power + dual_sim + fc + four_g + int_memory +
## m_dep + mobile_wt + n_cores + pc + px_height + px_width +
## ram + sc_h + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - sc_h 1 45.01 75.01
## - fc 1 45.30 75.30
## - touch_screen 1 45.47 75.47
## - dual_sim 1 46.11 76.11
## - four_g 1 46.43 76.43
## <none> 44.70 76.70
## - m_dep 1 46.96 76.96
## - n_cores 1 51.20 81.20
## - wifi 1 52.73 82.73
## - pc 1 53.07 83.07
## - int_memory 1 53.85 83.85
## - mobile_wt 1 88.52 118.52
## - px_height 1 253.96 283.96
## - px_width 1 269.49 299.49
## - battery_power 1 584.49 614.49
## - ram 1 2684.95 2714.95
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=75.01
## price_range ~ battery_power + dual_sim + fc + four_g + int_memory +
## m_dep + mobile_wt + n_cores + pc + px_height + px_width +
## ram + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - fc 1 45.56 73.56
## - touch_screen 1 45.72 73.72
## - dual_sim 1 46.31 74.31
## - four_g 1 46.51 74.51
## - m_dep 1 47.00 75.00
## <none> 45.01 75.01
## - n_cores 1 51.61 79.61
## - wifi 1 52.74 80.74
## - pc 1 53.43 81.43
## - int_memory 1 54.58 82.58
## - mobile_wt 1 89.56 117.56
## - px_height 1 253.98 281.98
## - px_width 1 270.42 298.42
## - battery_power 1 584.66 612.66
## - ram 1 2685.03 2713.03
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=73.56
## price_range ~ battery_power + dual_sim + four_g + int_memory +
## m_dep + mobile_wt + n_cores + pc + px_height + px_width +
## ram + touch_screen + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - touch_screen 1 45.94 71.94
## - dual_sim 1 46.84 72.84
## - four_g 1 46.84 72.84
## <none> 45.56 73.56
## - m_dep 1 47.78 73.78
## - n_cores 1 52.94 78.94
## - wifi 1 53.09 79.09
## - pc 1 54.47 80.47
## - int_memory 1 55.44 81.44
## - mobile_wt 1 89.78 115.78
## - px_height 1 254.78 280.78
## - px_width 1 272.16 298.16
## - battery_power 1 585.19 611.19
## - ram 1 2685.07 2711.07
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=71.94
## price_range ~ battery_power + dual_sim + four_g + int_memory +
## m_dep + mobile_wt + n_cores + pc + px_height + px_width +
## ram + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - four_g 1 47.08 71.08
## <none> 45.94 71.94
## - m_dep 1 48.46 72.46
## - dual_sim 1 48.58 72.58
## - n_cores 1 53.34 77.34
## - wifi 1 54.40 78.40
## - pc 1 54.69 78.69
## - int_memory 1 56.23 80.23
## - mobile_wt 1 91.30 115.30
## - px_height 1 255.93 279.93
## - px_width 1 272.58 296.58
## - battery_power 1 586.84 610.84
## - ram 1 2688.26 2712.26
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=71.08
## price_range ~ battery_power + dual_sim + int_memory + m_dep +
## mobile_wt + n_cores + pc + px_height + px_width + ram + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - m_dep 1 48.96 70.96
## <none> 47.08 71.08
## - dual_sim 1 50.44 72.44
## - n_cores 1 55.37 77.37
## - pc 1 55.82 77.82
## - wifi 1 56.14 78.14
## - int_memory 1 56.85 78.85
## - mobile_wt 1 92.27 114.27
## - px_height 1 262.16 284.16
## - px_width 1 272.90 294.90
## - battery_power 1 586.92 608.92
## - ram 1 2688.26 2710.26
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=70.96
## price_range ~ battery_power + dual_sim + int_memory + mobile_wt +
## n_cores + pc + px_height + px_width + ram + wifi
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 48.96 70.96
## - dual_sim 1 51.51 71.51
## - wifi 1 56.34 76.34
## - n_cores 1 56.60 76.60
## - pc 1 57.53 77.53
## - int_memory 1 57.89 77.89
## - mobile_wt 1 94.63 114.63
## - px_height 1 269.16 289.16
## - px_width 1 272.90 292.90
## - battery_power 1 588.35 608.35
## - ram 1 2689.78 2709.78
##
## Call: glm(formula = price_range ~ battery_power + dual_sim + int_memory +
## mobile_wt + n_cores + pc + px_height + px_width + ram + wifi,
## family = "binomial", data = phone_train)
##
## Coefficients:
## (Intercept) battery_power dual_sim int_memory mobile_wt
## -266.41698 0.04798 -1.34273 0.07051 -0.08240
## n_cores pc px_height px_width ram
## 0.50714 0.19489 0.02736 0.02814 0.07597
## wifi
## -2.12398
##
## Degrees of Freedom: 1999 Total (i.e. Null); 1989 Residual
## Null Deviance: 2773
## Residual Deviance: 48.96 AIC: 70.96
hasil AIC adalah 70.96 dan prediktor dari hasil step() adalah - battery_power - dual_sim - int_memory - mobile_wt - n_cores - pc - px_height - px_width - ram - wifi
Karena ada hubungan negatif maka, nilai negatifnya di keluarkan dari list sehingga prediktor yang digunakan adalah - battery_power - int_memory - n_cores - pc - px_height - px_width - ram
Sehingga hasilnya adalah
mobilespec2 <- glm(price_range~n_cores+battery_power+px_width+int_memory+pc+px_height+ram, data=phone_train, family = "binomial")## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Summarise the reduced model fitted on the line above. The earlier call
# summarised `mobilespec` (the full model) by mistake.
# NOTE(review): the printed output that follows was produced by
# summary(mobilespec); re-knit the report to refresh it.
summary(mobilespec2)##
## Call:
## glm(formula = price_range ~ ., family = "binomial", data = phone_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.911 0.000 0.000 0.000 3.116
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.056e+02 7.408e+01 -4.126 3.69e-05 ***
## battery_power 5.515e-02 1.343e-02 4.106 4.02e-05 ***
## blue 1.290e-02 1.358e+00 0.010 0.9924
## clock_speed 4.119e-01 7.533e-01 0.547 0.5845
## dual_sim -1.022e+00 1.206e+00 -0.847 0.3968
## fc -9.439e-02 1.569e-01 -0.602 0.5474
## four_g -1.958e+00 1.442e+00 -1.358 0.1746
## int_memory 8.821e-02 3.583e-02 2.462 0.0138 *
## m_dep -2.789e+00 1.943e+00 -1.435 0.1513
## mobile_wt -8.644e-02 2.146e-02 -4.027 5.64e-05 ***
## n_cores 5.025e-01 2.340e-01 2.148 0.0317 *
## pc 2.282e-01 1.062e-01 2.149 0.0316 *
## px_height 3.003e-02 7.156e-03 4.197 2.71e-05 ***
## px_width 3.342e-02 8.343e-03 4.005 6.20e-05 ***
## ram 8.744e-02 2.123e-02 4.119 3.80e-05 ***
## sc_h -8.939e-02 1.415e-01 -0.632 0.5276
## sc_w 8.493e-02 1.355e-01 0.627 0.5309
## talk_time 2.519e-02 8.980e-02 0.280 0.7791
## three_g 6.688e-01 1.220e+00 0.548 0.5837
## touch_screen -8.454e-01 1.114e+00 -0.759 0.4479
## wifi -3.026e+00 1.351e+00 -2.240 0.0251 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2772.589 on 1999 degrees of freedom
## Residual deviance: 43.521 on 1979 degrees of freedom
## AIC: 85.521
##
## Number of Fisher Scoring iterations: 15
Dari model tersebut menghasilkan nilai AIC adalah 117.67 , namun dari hasil tersebut ada 2 prediktor yang tidak ada hubungannya yaitu int_memory dan n_cores, sehingga buat model baru tanpa prediktor tersebut
mobilespec3 <- glm(price_range~battery_power+pc+px_height+px_width+ram, data=phone_train, family = "binomial")## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mobilespec3)##
## Call:
## glm(formula = price_range ~ battery_power + pc + px_height +
## px_width + ram, family = "binomial", data = phone_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.367 0.000 0.000 0.000 2.202
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.173e+02 1.511e+01 -7.763 8.31e-15 ***
## battery_power 2.028e-02 2.673e-03 7.588 3.25e-14 ***
## pc 8.918e-02 4.539e-02 1.965 0.0495 *
## px_height 1.140e-02 1.603e-03 7.112 1.14e-12 ***
## px_width 1.206e-02 1.713e-03 7.042 1.89e-12 ***
## ram 3.260e-02 4.189e-03 7.782 7.14e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2772.59 on 1999 degrees of freedom
## Residual deviance: 106.35 on 1994 degrees of freedom
## AIC: 118.35
##
## Number of Fisher Scoring iterations: 12
Dari model tersebut menghasilkan nilai AIC adalah 118.35
Dari dataset yang disediakan yaitu train, perlu di pecah jadi 2 yaitu train dan test
# Reproducible 80/20 split of the labelled data: the sampled row indices form
# the training set, and the remaining rows form the test set.
set.seed(456)
indexm <- sample(nrow(phone_train), nrow(phone_train) * 0.8)
mobile_train <- phone_train[indexm, ]
mobile_test <- phone_train[-indexm, ]
Cek proporsi dari jumlah masing-masing target prediktor
table(mobile_train$price_range)##
## 0 1
## 796 804
prop.table(table(mobile_train$price_range))##
## 0 1
## 0.4975 0.5025
Jika jumlah datanya beda maka perlu dilakukan Down sampling :
# Down-sample the training split only, so no test rows reach the model that is
# later evaluated on `mobile_test`. The target column is removed from `x` by
# name; the earlier positional `-20` dropped `wifi` and left `price_range` in
# the predictors.
# NOTE(review): outputs printed further down were produced from the full
# `phone_train` data; re-knit the report to refresh them.
mobile_train_down <- downSample(
  x = mobile_train[, setdiff(names(mobile_train), "price_range")],
  y = mobile_train$price_range,
  yname = "price_range"
)
table(mobile_train_down$price_range)##
## 0 1
## 1000 1000
Modeling dengan menggunakan step yang sudah dijalankan
Modeling menggunakan imbalance dataset
mobilespec_model <- glm(price_range~battery_power+pc+px_height+px_width+ram, data=mobile_train, family = "binomial")## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
mobilespec_model##
## Call: glm(formula = price_range ~ battery_power + pc + px_height +
## px_width + ram, family = "binomial", data = mobile_train)
##
## Coefficients:
## (Intercept) battery_power pc px_height px_width
## -121.56855 0.02156 0.09770 0.01208 0.01194
## ram
## 0.03359
##
## Degrees of Freedom: 1599 Total (i.e. Null); 1594 Residual
## Null Deviance: 2218
## Residual Deviance: 81.26 AIC: 93.26
Hasil AIC yang di dapat adalah 93.26
Modeling menggunakan down sampling
mobilespec_model_down <- glm(price_range~battery_power+pc+px_height+px_width+ram, data=mobile_train_down, family = "binomial")## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
mobilespec_model_down##
## Call: glm(formula = price_range ~ battery_power + pc + px_height +
## px_width + ram, family = "binomial", data = mobile_train_down)
##
## Coefficients:
## (Intercept) battery_power pc px_height px_width
## -117.32630 0.02028 0.08918 0.01140 0.01206
## ram
## 0.03260
##
## Degrees of Freedom: 1999 Total (i.e. Null); 1994 Residual
## Null Deviance: 2773
## Residual Deviance: 106.4 AIC: 118.4
Hasil AIC yang di dapat adalah 118.4
Prediksi menggunakan data yang imbalance
# Score the train and test splits with the model fitted on `mobile_train`.
# A predicted probability above 0.5 is labelled 1, otherwise 0.
mobile_train$peluang <- predict(mobilespec_model, mobile_train, type = "response")
mobile_train$predik <- as.factor(ifelse(mobile_train$peluang > 0.5, 1, 0))
mobile_test$peluang <- predict(mobilespec_model, mobile_test, type = "response")
mobile_test$predik <- as.factor(ifelse(mobile_test$peluang > 0.5, 1, 0))
Predicting menggunakan down sampling dataset
# Same scoring with the model fitted on the down-sampled data. The test-set
# results go into separate `*_down` columns so the first model's columns are
# not overwritten.
mobile_train_down$peluang <- predict(mobilespec_model_down, mobile_train_down, type = "response")
mobile_train_down$predik <- as.factor(ifelse(mobile_train_down$peluang > 0.5, 1, 0))
mobile_test$peluang_down <- predict(mobilespec_model_down, mobile_test, type = "response")
mobile_test$predik_down <- as.factor(ifelse(mobile_test$peluang_down > 0.5, 1, 0))
Model evaluation menggunakan imbalance dataset
confusionMatrix(mobile_train$predik, mobile_train$price_range, positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 787 11
## 1 9 793
##
## Accuracy : 0.9875
## 95% CI : (0.9808, 0.9923)
## No Information Rate : 0.5025
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.975
##
## Mcnemar's Test P-Value : 0.8231
##
## Sensitivity : 0.9863
## Specificity : 0.9887
## Pos Pred Value : 0.9888
## Neg Pred Value : 0.9862
## Prevalence : 0.5025
## Detection Rate : 0.4956
## Detection Prevalence : 0.5012
## Balanced Accuracy : 0.9875
##
## 'Positive' Class : 1
##
confusionMatrix(mobile_test$predik, mobile_test$price_range, positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 203 3
## 1 1 193
##
## Accuracy : 0.99
## 95% CI : (0.9746, 0.9973)
## No Information Rate : 0.51
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.98
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9847
## Specificity : 0.9951
## Pos Pred Value : 0.9948
## Neg Pred Value : 0.9854
## Prevalence : 0.4900
## Detection Rate : 0.4825
## Detection Prevalence : 0.4850
## Balanced Accuracy : 0.9899
##
## 'Positive' Class : 1
##
Model evaluation menggunakan down sampling dataset
confusionMatrix(mobile_train_down$predik, mobile_train_down$price_range, positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 989 11
## 1 11 989
##
## Accuracy : 0.989
## 95% CI : (0.9834, 0.9931)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.978
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9890
## Specificity : 0.9890
## Pos Pred Value : 0.9890
## Neg Pred Value : 0.9890
## Prevalence : 0.5000
## Detection Rate : 0.4945
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9890
##
## 'Positive' Class : 1
##
# Test-set confusion matrix for the predictions of the down-sampled model;
# class "1" is treated as the positive class.
confusionMatrix(
  data = mobile_test$predik_down,
  reference = mobile_test$price_range,
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 203 3
## 1 1 193
##
## Accuracy : 0.99
## 95% CI : (0.9746, 0.9973)
## No Information Rate : 0.51
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.98
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9847
## Specificity : 0.9951
## Pos Pred Value : 0.9948
## Neg Pred Value : 0.9854
## Prevalence : 0.4900
## Detection Rate : 0.4825
## Detection Prevalence : 0.4850
## Balanced Accuracy : 0.9899
##
## 'Positive' Class : 1
##
Dari semua evaluasi di atas, model memiliki presisi (Pos Pred Value) sekitar 99%, baik pada data train maupun data test.
#' Classification metrics at a given probability cutoff
#'
#' Labels every observation as `postarget` when its probability is at least
#' `cutoff` (otherwise `negtarget`) and summarises the resulting confusion
#' matrix.
#'
#' @param cutoff Probability threshold; `prob >= cutoff` is labelled `postarget`.
#' @param prob Numeric vector of predicted probabilities.
#' @param ref Factor of true classes, passed to `caret::confusionMatrix()` as
#'   the reference.
#' @param postarget Label of the positive class (character).
#' @param negtarget Label of the negative class (character).
#' @return A 1 x 4 numeric matrix with columns `recall`, `accuracy`,
#'   `precision` and `specificity`.
performa <- function(cutoff, prob, ref, postarget, negtarget) {
  # Fix the levels explicitly: at extreme cutoffs every observation can fall in
  # one class, and a one-level factor no longer matches the reference levels.
  predicted <- factor(
    ifelse(prob >= cutoff, postarget, negtarget),
    levels = c(negtarget, postarget)
  )
  conf <- caret::confusionMatrix(predicted, ref, positive = postarget)

  # Select metrics by name instead of by position so the mapping is explicit.
  mat <- matrix(
    c(
      conf$byClass[["Sensitivity"]],
      conf$overall[["Accuracy"]],
      conf$byClass[["Pos Pred Value"]],
      conf$byClass[["Specificity"]]
    ),
    nrow = 1
  )
  colnames(mat) <- c("recall", "accuracy", "precision", "specificity")
  mat
}
# Evaluate the down-sampled model on the test set over a grid of 100 cutoffs
# between 0.01 and 0.8. Each row of `result` holds recall, accuracy, precision
# and specificity for the cutoff at the same position in `co`.
co <- seq(0.01, 0.8, length.out = 100)
result <- matrix(0, length(co), 4)
for (i in seq_along(co)) {
  result[i, ] <- performa(
    cutoff = co[i],
    prob = mobile_test$peluang_down,
    ref = mobile_test$price_range,
    postarget = "1",
    negtarget = "0"
  )
}
# Plot the four metrics against the cutoff. `tibble()` replaces the deprecated
# `data_frame()` and `pivot_longer()` replaces the superseded `gather()`.
tibble(
  "Recall" = result[, 1],
  "Accuracy" = result[, 2],
  "Precision" = result[, 3],
  "Specificity" = result[, 4],
  "Cutoff" = co
) %>%
  # Long format: one row per (cutoff, metric) pair so the metric maps to colour.
  pivot_longer(cols = -Cutoff, names_to = "performa", values_to = "value") %>%
  ggplot(aes(x = Cutoff, y = value, col = performa)) +
  geom_line(lwd = 1.5) +
  scale_color_manual(values = c("darkred", "darkgreen", "orange", "blue")) +
  scale_y_continuous(breaks = seq(0, 1, 0.1), limits = c(0, 1)) +
  scale_x_continuous(breaks = seq(0, 1, 0.1)) +
  labs(title = "Tradeoff model perfomance") +
  theme_minimal() +
  theme(
    legend.position = "top",
    panel.grid.minor.y = element_blank(),
    panel.grid.minor.x = element_blank()
  )
Model menghasilkan presisi yang mendekati angka 1.
mobilephone_train_all <- read.csv("data_input/train.csv")

# Collapse the four price ranges into two classes: 0 and 1 become "0",
# 2 and 3 become "1" (duplicated labels merge the corresponding levels).
mobilephone_train_all <- mobilephone_train_all %>%
  mutate(
    price_range = factor(price_range, levels = c(0, 1, 2, 3), labels = c(0, 0, 1, 1))
  )

# Min-max scale a numeric vector to [0, 1]. A constant vector has zero range
# and would divide by zero, so it is mapped to all zeros instead of NaN.
normalize <- function(x) {
  rng <- range(x)
  span <- rng[2] - rng[1]
  if (isTRUE(span == 0)) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}

# Scale every numeric column; the factor target is left untouched.
mobilephone_train_all_s <- mobilephone_train_all %>%
  mutate(across(where(is.numeric), normalize))

set.seed(567)
# 80% of the rows are sampled for training; the remaining 20% form the test set
index <- sample(nrow(mobilephone_train_all), nrow(mobilephone_train_all) * 0.8)
mobile_train_s <- mobilephone_train_all_s[index, ]
mobile_test_s <- mobilephone_train_all_s[-index, ]

# Starting value for k: the rounded square root of the number of rows.
mobilephone_train_all_mm <- sqrt(nrow(mobilephone_train_all))
nilaik_all_mm <- round(mobilephone_train_all_mm)
# Predictors are every column except the class label; selecting by name avoids
# depending on the label sitting in column 21.
predictor_cols <- setdiff(names(mobile_train_s), "price_range")

# Classify the scaled test rows with k-NN for one value of k.
predict_knn <- function(k) {
  knn(
    train = mobile_train_s[, predictor_cols],
    test = mobile_test_s[, predictor_cols],
    cl = mobile_train_s$price_range,
    k = k
  )
}

# Try k values around the starting k. The call order is kept as-is because
# knn() breaks ties at random and therefore consumes the RNG stream.
test_spredik1 <- predict_knn(nilaik_all_mm - 4)
test_spredik2 <- predict_knn(nilaik_all_mm - 2)
test_spredik3 <- predict_knn(nilaik_all_mm)
test_spredik4 <- predict_knn(nilaik_all_mm + 2)
test_spredik5 <- predict_knn(nilaik_all_mm + 4)
nilai k default adalah 45 , kemudian dicoba dengan nilai K yang berbeda yaitu 41 , 43 , 45 , 47 , 49
# Confusion matrices for the five k values. Class "1" is the positive class,
# matching the logistic-regression evaluation earlier in the document.
hasilcm_1 <- confusionMatrix(test_spredik1, mobile_test_s$price_range, positive = "1")
hasilcm_2 <- confusionMatrix(test_spredik2, mobile_test_s$price_range, positive = "1")
hasilcm_3 <- confusionMatrix(test_spredik3, mobile_test_s$price_range, positive = "1")
hasilcm_4 <- confusionMatrix(test_spredik4, mobile_test_s$price_range, positive = "1")
hasilcm_5 <- confusionMatrix(test_spredik5, mobile_test_s$price_range, positive = "1")
Hasil perbandingan nya adalah
# One row per k value, one column per metric. The metrics are selected by name:
# the previous code filled the columns from overall[1:4], which are Accuracy,
# Kappa, AccuracyLower and AccuracyUpper rather than the labelled metrics.
resultcm <- t(vapply(
  list(hasilcm_1, hasilcm_2, hasilcm_3, hasilcm_4, hasilcm_5),
  function(cm) {
    c(
      recall = cm$byClass[["Sensitivity"]],
      accuracy = cm$overall[["Accuracy"]],
      precision = cm$byClass[["Pos Pred Value"]],
      specificity = cm$byClass[["Specificity"]]
    )
  },
  numeric(4)
))
# NOTE(review): the table printed below was generated before this fix, so its
# columns still show Accuracy, Kappa and the 95% CI bounds of the accuracy.
# Re-knit the document to refresh it.
resultcm
## recall accuracy precision specificity
## [1,] 0.8525 0.7050000 0.8138982 0.8857819
## [2,] 0.8600 0.7198950 0.8220795 0.8924822
## [3,] 0.8575 0.7148289 0.8193489 0.8902523
## [4,] 0.8525 0.7048229 0.8138982 0.8857819
## [5,] 0.8550 0.7098912 0.8166218 0.8880189
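Dari hasil tersebut, nilai K terbaik adalah 43 dengan akurasi 0.8600. Catatan: angka pada tabel di atas dihasilkan dari `overall[1:4]` milik `confusionMatrix`, yaitu Accuracy, Kappa, serta batas bawah dan batas atas selang kepercayaan 95% untuk akurasi; jadi 0.8220795 adalah batas bawah selang kepercayaan tersebut, bukan nilai presisi.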
Dari 2 metode yang kita gunakan, regresi logistik menghasilkan model dengan nilai AIC (Akaike information criterion) 118.4, sedangkan K-Nearest Neighbor dengan k = 43 menghasilkan akurasi 0.8600 (0.7198950 pada tabel adalah nilai Kappa, bukan akurasi). Kesimpulannya, regresi logistik dinilai dari tingkat kehilangan informasinya (AIC), sedangkan K-NN dinilai dari presisi, akurasi, spesifisitas, dan recall-nya.