R Markdown

Here we use data from https://www.kaggle.com/mysarahmadbhat/toyota-used-car-listing.

library(dplyr)      # data wrangling
library(keras)      # deep learning API
library(caret)      # general ML utilities
library(recipes)    # preprocessing recipes
library(rsample)    # train/test splitting
library(tensorflow)

# point reticulate at the conda environment with TensorFlow (GPU) installed
reticulate::use_python(python = "C:/Users/LENOVO/anaconda3/envs/r-tensorflow-gpu/python.exe", required = TRUE)
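
As a quick optional check (assuming the tf module handle exposed by the tensorflow package, loaded above), we can confirm that TensorFlow actually sees the GPU:

tf$config$list_physical_devices("GPU")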

First of all, we read the CSV file:

data <- read.csv("dataset/toyota.csv")

str(data)
## 'data.frame':    6738 obs. of  9 variables:
##  $ model       : chr  " GT86" " GT86" " GT86" " GT86" ...
##  $ year        : int  2016 2017 2015 2017 2017 2017 2017 2017 2020 2016 ...
##  $ price       : int  16000 15995 13998 18998 17498 15998 18522 18995 27998 13990 ...
##  $ transmission: chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ mileage     : int  24089 18615 27469 14736 36284 26919 10456 12340 516 37999 ...
##  $ fuelType    : chr  "Petrol" "Petrol" "Petrol" "Petrol" ...
##  $ tax         : int  265 145 265 150 145 260 145 145 150 265 ...
##  $ mpg         : num  36.2 36.2 36.2 36.2 36.2 36.2 36.2 36.2 33.2 36.2 ...
##  $ engineSize  : num  2 2 2 2 2 2 2 2 2 2 ...

It turns out there are no NA values in this data:

anyNA(data)
## [1] FALSE
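
Had there been missing values, a per-column count would show where they are; a minimal check (not part of the original workflow) looks like this:

# count missing values in each column
colSums(is.na(data))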

Here we adjust the column types that are not yet appropriate:

df <- data %>% 
  mutate(model = as.factor(model),
         transmission = as.factor(transmission),
         fuelType = as.factor(fuelType),
         year = as.factor(year))

str(df)
## 'data.frame':    6738 obs. of  9 variables:
##  $ model       : Factor w/ 18 levels " Auris"," Avensis",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ year        : Factor w/ 23 levels "1998","1999",..: 19 20 18 20 20 20 20 20 23 19 ...
##  $ price       : int  16000 15995 13998 18998 17498 15998 18522 18995 27998 13990 ...
##  $ transmission: Factor w/ 4 levels "Automatic","Manual",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ mileage     : int  24089 18615 27469 14736 36284 26919 10456 12340 516 37999 ...
##  $ fuelType    : Factor w/ 4 levels "Diesel","Hybrid",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tax         : int  265 145 265 150 145 260 145 145 150 265 ...
##  $ mpg         : num  36.2 36.2 36.2 36.2 36.2 36.2 36.2 36.2 33.2 36.2 ...
##  $ engineSize  : num  2 2 2 2 2 2 2 2 2 2 ...

Once the data types are correct, we create an object for scaling and dummy-variable creation using the recipe() function:

rec_obj <- recipe(price ~ ., data = df) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  prep(df)
rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          8
## 
## Training data contained 6738 data points and no missing data.
## 
## Operations:
## 
## Centering for mileage, tax, mpg, engineSize [trained]
## Scaling for mileage, tax, mpg, engineSize [trained]
## Dummy variables from model, year, transmission, fuelType [trained]

After creating the object, we transform the data according to the transformations trained in the recipe object:

df_transform <- bake(rec_obj, df)

head(df_transform)
## # A tibble: 6 x 50
##   mileage   tax   mpg engineSize price model_X.Avensis model_X.Aygo model_X.C.HR
##     <dbl> <dbl> <dbl>      <dbl> <int>           <dbl>        <dbl>        <dbl>
## 1  0.0644 2.31  -1.69       1.21 16000               0            0            0
## 2 -0.222  0.681 -1.69       1.21 15995               0            0            0
## 3  0.241  2.31  -1.69       1.21 13998               0            0            0
## 4 -0.425  0.749 -1.69       1.21 18998               0            0            0
## 5  0.702  0.681 -1.69       1.21 17498               0            0            0
## 6  0.212  2.24  -1.69       1.21 15998               0            0            0
## # ... with 42 more variables: model_X.Camry <dbl>, model_X.Corolla <dbl>,
## #   model_X.GT86 <dbl>, model_X.Hilux <dbl>, model_X.IQ <dbl>,
## #   model_X.Land.Cruiser <dbl>, model_X.Prius <dbl>,
## #   model_X.PROACE.VERSO <dbl>, model_X.RAV4 <dbl>, model_X.Supra <dbl>,
## #   model_X.Urban.Cruiser <dbl>, model_X.Verso <dbl>, model_X.Verso.S <dbl>,
## #   model_X.Yaris <dbl>, year_X1999 <dbl>, year_X2000 <dbl>, year_X2001 <dbl>,
## #   year_X2002 <dbl>, year_X2003 <dbl>, year_X2004 <dbl>, year_X2005 <dbl>,
## #   year_X2006 <dbl>, year_X2007 <dbl>, year_X2008 <dbl>, year_X2009 <dbl>,
## #   year_X2010 <dbl>, year_X2011 <dbl>, year_X2012 <dbl>, year_X2013 <dbl>,
## #   year_X2014 <dbl>, year_X2015 <dbl>, year_X2016 <dbl>, year_X2017 <dbl>,
## #   year_X2018 <dbl>, year_X2019 <dbl>, year_X2020 <dbl>,
## #   transmission_Manual <dbl>, transmission_Other <dbl>,
## #   transmission_Semi.Auto <dbl>, fuelType_Hybrid <dbl>, fuelType_Other <dbl>,
## #   fuelType_Petrol <dbl>
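
As a quick sanity check on the transformation (optional), the centered and scaled numeric predictors should now have mean ≈ 0 and standard deviation ≈ 1:

# verify the scaling on one of the numeric predictors
round(mean(df_transform$mileage), 3)
round(sd(df_transform$mileage), 3)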

After that, we split the data into training and testing sets with a proportion of 0.8:

set.seed(2021)

split <- initial_split(data = df_transform, prop = 0.8)

train <- training(split)
test <- testing(split)
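
We can verify the 80/20 proportion by comparing row counts (a quick check, not shown in the original):

nrow(train) / nrow(df_transform)  # should be close to 0.8
nrow(test) / nrow(df_transform)   # should be close to 0.2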

Next, we separate the predictors from the target and convert them into the shapes keras expects:

# predictor matrices (all columns except the target)
train_x <- train %>% select(-price) %>% as.matrix()
test_x <- test %>% select(-price) %>% as.matrix()

# convert to plain numeric arrays for keras (dimensions unchanged)
train_x <- array_reshape(train_x, dim(train_x))
test_x <- array_reshape(test_x, dim(test_x))

# target vectors
train_y <- train$price %>% as.matrix()
test_y <- test$price %>% as.matrix()

n_input <- ncol(train_x)
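
A quick dimension check confirms that the predictor matrices and targets line up:

dim(train_x)   # training observations x n_input predictors
dim(test_x)
n_input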

Now we build the neural network architecture.

Our first model uses three hidden layers and one output layer. Every hidden layer has 64 units with ReLU activation.

For the optimizer, we use Adam with the default learning rate of 0.001.

For training, we use 50 epochs and a batch size of 64.

model_base <- keras_model_sequential(name = "model_base") %>% 
  layer_dense(units = 64,
              input_shape = n_input,
              activation = "relu",
              name = "layer1") %>% 
  layer_dense(units = 64,
              activation = "relu",
              name = "layer2") %>% 
  layer_dense(units = 64,
              activation = "relu",
              name = "layer3") %>% 
  layer_dense(units = 1,
              activation = "relu",
              name = "output")
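
We can inspect the resulting layer shapes and parameter counts with summary() (output not shown here):

summary(model_base)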

model_base %>% 
  compile(loss = "mse",
          metrics = "mean_absolute_percentage_error",
          optimizer = optimizer_adam(learning_rate = 0.001))
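
A note on the metric: mean absolute percentage error (MAPE) expresses the average error as a percentage of the true price, which makes it easy to compare across models:

$$\text{MAPE} = \frac{100}{n} \sum_{i=1}^{n} \left| \frac{y_i - \hat{y}_i}{y_i} \right|$$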

# note: set.seed() seeds R's RNG only; TensorFlow keeps its own RNG,
# so results may still vary slightly between runs
set.seed(2021)

history_base <- model_base %>% 
  fit(x = train_x,
      y = train_y,
      epochs = 50,
      batch_size = 64,
      validation_data = list(test_x, test_y))

plot(history_base)
## `geom_smooth()` using formula 'y ~ x'

Our second model also uses three hidden layers and one output layer, this time with 128, 256, and 512 units. The first and third hidden layers use ReLU activation, while the second uses tanh.

For the optimizer, we use Adam with the default learning rate of 0.001.

For training, we use 50 epochs and a batch size of 64.

model_base2 <- keras_model_sequential(name = "model_base2") %>% 
  layer_dense(units = 128,
              input_shape = n_input,
              activation = "relu",
              name = "layer1") %>% 
  layer_dense(units = 256,
              activation = "tanh",
              name = "layer2") %>% 
  layer_dense(units = 512,
              activation = "relu",
              name = "layer3") %>% 
  layer_dense(units = 1,
              activation = "relu",
              name = "output")

model_base2 %>% 
  compile(loss = "mse",
          metrics = "mean_absolute_percentage_error",
          optimizer = optimizer_adam(learning_rate = 0.001))

set.seed(2021)

history_base2 <- model_base2 %>% 
  fit(x = train_x,
      y = train_y,
      epochs = 50,
      batch_size = 64,
      validation_data = list(test_x, test_y))

plot(history_base2)
## `geom_smooth()` using formula 'y ~ x'

Our third model uses three hidden layers of 128 units each, all with ReLU activation, and one output layer.

For the optimizer, we use Adam with the default learning rate of 0.001.

For training, we use 50 epochs and a batch size of 64.

model_base3 <- keras_model_sequential(name = "model_base3") %>% 
  layer_dense(units = 128,
              input_shape = n_input,
              activation = "relu",
              name = "layer1") %>% 
  layer_dense(units = 128,
              activation = "relu",
              name = "layer2") %>% 
  layer_dense(units = 128,
              activation = "relu",
              name = "layer3") %>% 
  layer_dense(units = 1,
              activation = "relu",
              name = "output")

model_base3 %>% 
  compile(loss = "mse",
          metrics = "mean_absolute_percentage_error",
          optimizer = optimizer_adam(learning_rate = 0.001))

set.seed(2021)

history_base3 <- model_base3 %>% 
  fit(x = train_x,
      y = train_y,
      epochs = 50,
      batch_size = 64,
      validation_data = list(test_x, test_y))

plot(history_base3)
## `geom_smooth()` using formula 'y ~ x'

Here are the results we obtained:

history_base
## 
## Final epoch (plot to see history):
##                               loss: 1,588,600
##     mean_absolute_percentage_error: 7.294
##                           val_loss: 1,404,123
## val_mean_absolute_percentage_error: 7.194
history_base2
## 
## Final epoch (plot to see history):
##                               loss: 1,158,966
##     mean_absolute_percentage_error: 6.081
##                           val_loss: 1,378,823
## val_mean_absolute_percentage_error: 7.029
history_base3
## 
## Final epoch (plot to see history):
##                               loss: 1,441,435
##     mean_absolute_percentage_error: 6.826
##                           val_loss: 1,274,875
## val_mean_absolute_percentage_error: 6.838
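
To compare the three runs at a glance, the final validation MAPE can also be pulled out of each history object programmatically (a small sketch; the keras R package stores the per-epoch values in history$metrics):

# last recorded validation MAPE for each model
sapply(list(base = history_base, base2 = history_base2, base3 = history_base3),
       function(h) tail(h$metrics$val_mean_absolute_percentage_error, 1))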

From these results, the third model (model_base3) comes out slightly ahead of the others, with the lowest validation loss (1,274,875) and validation MAPE (6.838).

And of course, this model can still be improved in the future.
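
As a possible next step, the chosen model could be evaluated directly on the test set and its predictions compared with the actual prices. A minimal sketch using keras's evaluate() and predict(), with RMSE computed by hand (price was never transformed, so predictions are already in the original scale):

# test-set loss (MSE) and MAPE for the best model
model_base3 %>% evaluate(test_x, test_y)

# RMSE in the original price units
pred <- predict(model_base3, test_x)
sqrt(mean((test_y - pred)^2))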