R Markdown

This time, I want to predict the price range of a smartphone from the features we have. The data comes from https://www.kaggle.com/iabhishekofficial/mobile-price-classification

Here we use a dense (fully connected) neural network built with Keras.

library(dplyr)      # data wrangling
library(keras)      # neural network API
library(caret)      # confusionMatrix()
library(recipes)    # preprocessing: scaling and dummy variables
library(ROSE)       # class balancing, in case the target turns out imbalanced
library(rsample)    # train/test splitting
library(tensorflow)

reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe", required = TRUE)

First, we read the CSV file:

data <- read.csv("dataset/mobile_train.csv")

glimpse(data)
## Rows: 2,000
## Columns: 21
## $ battery_power <int> 842, 1021, 563, 615, 1821, 1859, 1821, 1954, 1445, 509, ~
## $ blue          <int> 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,~
## $ clock_speed   <dbl> 2.2, 0.5, 0.5, 2.5, 1.2, 0.5, 1.7, 0.5, 0.5, 0.6, 2.9, 2~
## $ dual_sim      <int> 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,~
## $ fc            <int> 1, 0, 2, 0, 13, 3, 4, 0, 0, 2, 0, 5, 2, 7, 13, 3, 1, 7, ~
## $ four_g        <int> 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,~
## $ int_memory    <int> 7, 53, 41, 10, 44, 22, 10, 24, 53, 9, 9, 33, 33, 17, 52,~
## $ m_dep         <dbl> 0.6, 0.7, 0.9, 0.8, 0.6, 0.7, 0.8, 0.8, 0.7, 0.1, 0.1, 0~
## $ mobile_wt     <int> 188, 136, 145, 131, 141, 164, 139, 187, 174, 93, 182, 17~
## $ n_cores       <int> 2, 3, 5, 6, 2, 1, 8, 4, 7, 5, 5, 8, 4, 4, 1, 2, 8, 3, 5,~
## $ pc            <int> 2, 6, 6, 9, 14, 7, 10, 0, 14, 15, 1, 18, 17, 11, 17, 16,~
## $ px_height     <int> 20, 905, 1263, 1216, 1208, 1004, 381, 512, 386, 1137, 24~
## $ px_width      <int> 756, 1988, 1716, 1786, 1212, 1654, 1018, 1149, 836, 1224~
## $ ram           <int> 2549, 2631, 2603, 2769, 1411, 1067, 3220, 700, 1099, 513~
## $ sc_h          <int> 9, 17, 11, 16, 8, 17, 13, 16, 17, 19, 5, 14, 18, 7, 14, ~
## $ sc_w          <int> 7, 3, 2, 8, 2, 1, 8, 3, 1, 10, 2, 9, 0, 1, 9, 15, 9, 2, ~
## $ talk_time     <int> 19, 7, 9, 11, 15, 10, 18, 5, 20, 12, 7, 13, 2, 4, 3, 11,~
## $ three_g       <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ touch_screen  <int> 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,~
## $ wifi          <int> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,~
## $ price_range   <int> 1, 2, 2, 2, 1, 1, 3, 0, 0, 0, 3, 3, 1, 2, 0, 0, 3, 3, 1,~

It turns out the data contains no NA values:

anyNA(data)
## [1] FALSE
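
If anyNA() had returned TRUE, a per-column breakdown would show where the gaps are; a minimal check:

# count missing values per column (all zeros for this dataset)
colSums(is.na(data))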

Here we fix the column types that are not yet appropriate by converting the binary indicator columns to factors:

df <- data %>% 
  mutate(blue = as.factor(blue),
         dual_sim = as.factor(dual_sim),
         four_g = as.factor(four_g),
         three_g = as.factor(three_g),
         touch_screen = as.factor(touch_screen),
         wifi = as.factor(wifi))
glimpse(df)
## Rows: 2,000
## Columns: 21
## $ battery_power <int> 842, 1021, 563, 615, 1821, 1859, 1821, 1954, 1445, 509, ~
## $ blue          <fct> 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,~
## $ clock_speed   <dbl> 2.2, 0.5, 0.5, 2.5, 1.2, 0.5, 1.7, 0.5, 0.5, 0.6, 2.9, 2~
## $ dual_sim      <fct> 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,~
## $ fc            <int> 1, 0, 2, 0, 13, 3, 4, 0, 0, 2, 0, 5, 2, 7, 13, 3, 1, 7, ~
## $ four_g        <fct> 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,~
## $ int_memory    <int> 7, 53, 41, 10, 44, 22, 10, 24, 53, 9, 9, 33, 33, 17, 52,~
## $ m_dep         <dbl> 0.6, 0.7, 0.9, 0.8, 0.6, 0.7, 0.8, 0.8, 0.7, 0.1, 0.1, 0~
## $ mobile_wt     <int> 188, 136, 145, 131, 141, 164, 139, 187, 174, 93, 182, 17~
## $ n_cores       <int> 2, 3, 5, 6, 2, 1, 8, 4, 7, 5, 5, 8, 4, 4, 1, 2, 8, 3, 5,~
## $ pc            <int> 2, 6, 6, 9, 14, 7, 10, 0, 14, 15, 1, 18, 17, 11, 17, 16,~
## $ px_height     <int> 20, 905, 1263, 1216, 1208, 1004, 381, 512, 386, 1137, 24~
## $ px_width      <int> 756, 1988, 1716, 1786, 1212, 1654, 1018, 1149, 836, 1224~
## $ ram           <int> 2549, 2631, 2603, 2769, 1411, 1067, 3220, 700, 1099, 513~
## $ sc_h          <int> 9, 17, 11, 16, 8, 17, 13, 16, 17, 19, 5, 14, 18, 7, 14, ~
## $ sc_w          <int> 7, 3, 2, 8, 2, 1, 8, 3, 1, 10, 2, 9, 0, 1, 9, 15, 9, 2, ~
## $ talk_time     <int> 19, 7, 9, 11, 15, 10, 18, 5, 20, 12, 7, 13, 2, 4, 3, 11,~
## $ three_g       <fct> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ touch_screen  <fct> 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,~
## $ wifi          <fct> 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,~
## $ price_range   <int> 1, 2, 2, 2, 1, 1, 3, 0, 0, 0, 3, 3, 1, 2, 0, 0, 3, 3, 1,~
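
The same conversion can be written more compactly with across() (available since dplyr 1.0.0):

# convert all binary indicator columns to factors in one step
df <- data %>% 
  mutate(across(c(blue, dual_sim, four_g, three_g, touch_screen, wifi), as.factor))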

Now that the data types are correct, we create an object for scaling the numeric predictors and dummy-encoding the factor predictors using the recipe() function:

rec_obj <- recipe(price_range ~ ., data = df) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  prep(df)
rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         20
## 
## Training data contained 2000 data points and no missing data.
## 
## Operations:
## 
## Centering for battery_power, clock_speed, fc, ... [trained]
## Scaling for battery_power, clock_speed, fc, ... [trained]
## Dummy variables from blue, dual_sim, four_g, three_g, touch_screen, wifi [trained]
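
To inspect what the trained steps actually learned (for example, the means used for centering), recipes provides tidy():

# list the trained steps
tidy(rec_obj)
# means estimated by the centering step
tidy(rec_obj, number = 1)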

Having created the object, we transform the data according to the trained recipe:

df_transform <- bake(rec_obj, df)

head(df_transform)
## # A tibble: 6 x 21
##   battery_power clock_speed     fc int_memory m_dep mobile_wt n_cores     pc
##           <dbl>       <dbl>  <dbl>      <dbl> <dbl>     <dbl>   <dbl>  <dbl>
## 1        -0.902       0.831 -0.762     -1.38  0.341    1.35    -1.10  -1.31 
## 2        -0.495      -1.25  -0.993      1.15  0.687   -0.120   -0.665 -0.646
## 3        -1.54       -1.25  -0.532      0.493 1.38     0.134    0.210 -0.646
## 4        -1.42        1.20  -0.993     -1.21  1.03    -0.261    0.647 -0.151
## 5         1.33       -0.395  2.00       0.659 0.341    0.0212  -1.10   0.673
## 6         1.41       -1.25  -0.302     -0.554 0.687    0.671   -1.54  -0.481
## # ... with 13 more variables: px_height <dbl>, px_width <dbl>, ram <dbl>,
## #   sc_h <dbl>, sc_w <dbl>, talk_time <dbl>, price_range <int>, blue_X1 <dbl>,
## #   dual_sim_X1 <dbl>, four_g_X1 <dbl>, three_g_X1 <dbl>,
## #   touch_screen_X1 <dbl>, wifi_X1 <dbl>

Next, we split the data into training and testing sets with a proportion of 0.8.

set.seed(2021)

split <- initial_split(data = df_transform, prop = 0.8)

train <- training(split)
test <- testing(split)
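
Note that the recipe above was prepped on the full dataset before splitting, so the test rows also contribute to the centering and scaling statistics. A leakage-free variant (a sketch, not what was run above) would split first, prep the recipe on the training split only, then bake both splits:

split_raw <- initial_split(df, prop = 0.8)
rec_train <- recipe(price_range ~ ., data = training(split_raw)) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  prep()
train_alt <- bake(rec_train, training(split_raw))
test_alt <- bake(rec_train, testing(split_raw))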

Here we check whether the target classes in the training data are balanced:

table(as.factor(train$price_range)) %>% prop.table()
## 
##        0        1        2        3 
## 0.250625 0.250625 0.247500 0.251250

After that, we separate the predictors from the target and convert each to the form Keras needs.

# predictor matrices
train_x <- train %>% select(-price_range) %>% as.matrix()
test_x <- test %>% select(-price_range) %>% as.matrix()

# reshape into row-major arrays, as expected by Keras/TensorFlow
train_x <- array_reshape(train_x, dim(train_x))
test_x <- array_reshape(test_x, dim(test_x))

# one-hot encode the target (classes 0-3 become 4 columns)
train_y <- to_categorical(train$price_range)
test_y <- to_categorical(test$price_range)

n_input <- ncol(train_x)   # number of predictors
n_output <- ncol(train_y)  # number of classes
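
to_categorical() turns the integer labels into a one-hot matrix, with one indicator column per class; a quick illustration:

# labels 0-3 become a 4 x 4 identity matrix
to_categorical(c(0, 1, 2, 3))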

Now we build the neural network architecture with three hidden layers and one output layer: the first hidden layer has 64 units, the second 128 units, and the third 256 units, all with tanh activation.

For the optimizer we use Adam with its default learning rate of 0.001.

For training, we use 20 epochs and a batch size of 64.

model_base <- keras_model_sequential(name = "model_base") %>% 
  layer_dense(units = 64,
              input_shape = n_input,
              activation = "tanh",
              name = "layer1") %>% 
  layer_dense(units = 128,
              activation = "tanh",
              name = "layer2") %>% 
  layer_dense(units = 256,
              activation = "tanh",
              name = "layer3") %>% 
  layer_dense(units = n_output,
              activation = "sigmoid", # note: softmax is the more conventional choice for mutually exclusive classes
              name = "output")

model_base %>% 
  compile(loss = "categorical_crossentropy",
          metrics = "accuracy",
          optimizer = optimizer_adam(learning_rate = 0.001))

# set.seed() alone does not control TensorFlow's randomness;
# tensorflow::set_random_seed(2021) would make the run more reproducible
set.seed(2021)

history_base <- model_base %>% 
  fit(x = train_x,
      y = train_y,
      epochs = 20,
      batch_size = 64,
      validation_data = list(test_x, test_y))

history_base
## 
## Final epoch (plot to see history):
##         loss: 0.06317
##     accuracy: 0.9769
##     val_loss: 0.1187
## val_accuracy: 0.945
plot(history_base)
## `geom_smooth()` using formula 'y ~ x'

Once the model has been trained, we can use it to make predictions.

pred_test <- predict(model_base, test_x) %>% k_argmax() %>% as.array()
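
predict() returns one probability per class for each row, and k_argmax() takes the index of the largest one. An equivalent base-R formulation, for comparison:

# max.col() is 1-based, so subtract 1 to get back to classes 0-3
pred_prob <- predict(model_base, test_x)
pred_alt <- max.col(pred_prob) - 1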

After converting the predictions back to class labels, we run confusionMatrix() to look at accuracy, recall, and precision:

confusionMatrix(as.factor(pred_test), as.factor(test$price_range))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1  2  3
##          0 95  5  0  0
##          1  4 93  2  0
##          2  0  1 96  4
##          3  0  0  6 94
## 
## Overall Statistics
##                                           
##                Accuracy : 0.945           
##                  95% CI : (0.9179, 0.9652)
##     No Information Rate : 0.26            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9267          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity            0.9596   0.9394   0.9231   0.9592
## Specificity            0.9834   0.9801   0.9831   0.9801
## Pos Pred Value         0.9500   0.9394   0.9505   0.9400
## Neg Pred Value         0.9867   0.9801   0.9732   0.9867
## Prevalence             0.2475   0.2475   0.2600   0.2450
## Detection Rate         0.2375   0.2325   0.2400   0.2350
## Detection Prevalence   0.2500   0.2475   0.2525   0.2500
## Balanced Accuracy      0.9715   0.9597   0.9531   0.9697

From the results above we get an accuracy above 93% (94.5% on the test set), which is nearly perfect, and recall and precision are also very high for every class. In my opinion, a neural network is therefore overkill for this problem; logistic regression or a decision tree could be used instead and would put far less load on the system.
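
As a quick way to test that claim, a minimal decision-tree baseline could be fit on the same splits (a sketch using rpart, which is not loaded above; its results are not shown here):

library(rpart)

# fit a classification tree on the training split and evaluate on the test split
tree_base <- rpart(as.factor(price_range) ~ ., data = train)
pred_tree <- predict(tree_base, test, type = "class")
confusionMatrix(pred_tree, as.factor(test$price_range))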