Prediksi

Regresi Linear

Memuat Package

library(tidyverse)
library(caret)

Dataset

cars

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Grafik dari Dataset

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

Model

set.seed(1)

# Membagi Data Menjadi Training dan Test ----

sampled_cars <- cars %>% 
  mutate(training = sample(0:1,
                           nrow(cars),
                           replace = TRUE))

training_data <- sampled_cars %>% 
  filter(training == 1)

test_data <- sampled_cars %>%
  filter(training == 0)

# Model Regresi Linear ----
model1 <- training_data %>%
  lm(dist ~ speed, data = .)

model1

## 
## Call:
## lm(formula = dist ~ speed, data = .)
## 
## Coefficients:
## (Intercept)        speed  
##     -10.466        3.377

summary(model1)

## 
## Call:
## lm(formula = dist ~ speed, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.0646  -7.9998  -0.5707   5.0124  25.6884 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.4658     8.7180  -1.200    0.243    
## speed         3.3765     0.4941   6.834 9.33e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.59 on 21 degrees of freedom
## Multiple R-squared:  0.6898, Adjusted R-squared:  0.6751 
## F-statistic:  46.7 on 1 and 21 DF,  p-value: 9.333e-07

Perbaikan Model

Berdasarkan output diatas terlihat bahwa intercept tidak signifikan (p-value = 0.243). Jadi intercept dihilangkan pada model.

set.seed(2)
training_data %>% 
  lm(dist ~ speed-1, data = .)

## 
## Call:
## lm(formula = dist ~ speed - 1, data = .)
## 
## Coefficients:
## speed  
## 2.811

model1 <-
  training_data %>% 
  lm(dist ~ speed-1, data = .)

training_data %>% 
  lm(dist ~ speed-1, data = .)%>%
  summary

## 
## Call:
## lm(formula = dist ~ speed - 1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.218  -8.408  -2.542   3.349  25.538 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## speed   2.8109     0.1503   18.71 5.36e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.72 on 22 degrees of freedom
## Multiple R-squared:  0.9409, Adjusted R-squared:  0.9382 
## F-statistic:   350 on 1 and 22 DF,  p-value: 5.36e-15

Validasi Menggunakan RMSE

rmse <- function(x,t) {
  sqrt(mean(sum((t - x)^2)))
}
rmse(predict(model1, test_data),
     test_data$dist)

## [1] 97.59294

Cross Validasi

set.seed(3)

random_group <- 
  function(n, probs) { 
  probs <- probs / sum(probs) 
  g <- findInterval(seq(0, 1, length = n),
                    c(0, cumsum(probs)), 
                    rightmost.closed = TRUE)
  names(probs)[sample(g)] 
  }


partition <- 
  function(df, n, probs) { 
  replicate(n,
            split(df, random_group(nrow(df),
                                   probs)),
            FALSE) 
  }


prediction_accuracy_cars <- 
  function(test_and_training) { 
    result <- vector(mode = "numeric", 
                     length = length(test_and_training))
    for (i in seq_along(test_and_training)) {
      training <- 
        test_and_training[[i]]$training
      test <- test_and_training[[i]]$test 
      model <- training %>%
        lm(dist ~ speed-1, data = .)  
      predictions <- 
        predict(model, test) 
      targets <- test$dist   
      result[i] <- rmse(targets, predictions)  }
    result
  }


cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## [1] 69.23068 62.94201 93.80765 53.87501 75.10687

akurasi <-
  cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

mean(akurasi)

## [1] 72.70533

PLot Data dengan Prediksi

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x-1)

Decision Tree

Memuat Package

library(party)

Dataset

cars

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Grafik dari Dataset

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

Model

set.seed(11)

# Model Regresi Linear ----
model11 <- training_data %>%
  ctree(dist ~ speed, data = .)

model11

## 
##   Conditional inference tree with 2 terminal nodes
## 
## Response:  dist 
## Input:  speed 
## Number of observations:  23 
## 
## 1) speed <= 16; criterion = 1, statistic = 15.176
##   2)*  weights = 9 
## 1) speed > 16
##   3)*  weights = 14

# Plot Tree
model11 %>% plot

Validasi menggunakan RMSE

rmse(predict(model11, test_data),
     test_data$dist)

## [1] 114.7792

Cross Validations

set.seed(33)

random_group <- 
  function(n, probs) { 
  probs <- probs / sum(probs) 
  g <- findInterval(seq(0, 1, length = n),
                    c(0, cumsum(probs)), 
                    rightmost.closed = TRUE)
  names(probs)[sample(g)] 
  }


partition <- 
  function(df, n, probs) { 
  replicate(n,
            split(df, random_group(nrow(df),
                                   probs)),
            FALSE) 
  }


prediction_accuracy_cars <- 
  function(test_and_training) { 
    result <- vector(mode = "numeric", 
                     length = length(test_and_training))
    for (i in seq_along(test_and_training)) {
      training <- 
        test_and_training[[i]]$training
      test <- test_and_training[[i]]$test 
      model <- ctree(dist ~ speed, data = test)  
      predictions <- 
        predict(model, test) 
      targets <- test$dist   
      result[i] <- rmse(targets, predictions)  }
    result
  }


cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## [1] 90.30919 89.84260 75.98438 70.58417 79.78826

akurasi1 <-
  cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

mean(akurasi1)

## [1] 92.27278

Random Forest

Memuat Package

library(randomForest)

Dataset

cars

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Grafik dari Dataset

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

Model

set.seed(111)

# Model Regresi Linear ----
model111 <- training_data %>%
  randomForest(dist ~ speed, data = .)

model111

## 
## Call:
##  randomForest(formula = dist ~ speed, data = .) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 217.039
##                     % Var explained: 53.48

Validasi menggunakan RMSE

rmse(predict(model111, test_data),
     test_data$dist)

## [1] 86.09424

Cross Validations

set.seed(333)

random_group <- 
  function(n, probs) { 
  probs <- probs / sum(probs) 
  g <- findInterval(seq(0, 1, length = n),
                    c(0, cumsum(probs)), 
                    rightmost.closed = TRUE)
  names(probs)[sample(g)] 
  }


partition <- 
  function(df, n, probs) { 
  replicate(n,
            split(df, random_group(nrow(df),
                                   probs)),
            FALSE) 
  }


prediction_accuracy_cars <- 
  function(test_and_training) { 
    result <- vector(mode = "numeric", 
                     length = length(test_and_training))
    for (i in seq_along(test_and_training)) {
      training <- 
        test_and_training[[i]]$training
      test <- test_and_training[[i]]$test 
      model <- randomForest(dist ~ speed, data = test)  
      predictions <- 
        predict(model, test) 
      targets <- test$dist   
      result[i] <- rmse(targets, predictions)  }
    result
  }


cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## [1] 28.55229 50.76490 35.16914 37.86496 61.55537

akurasi2 <-
  cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

mean(akurasi2)

## [1] 42.54693

Neural Network

Memuat Package

library(nnet)

Dataset

cars

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Grafik dari Dataset

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

Model

set.seed(1111)

# Model Regresi Linear ----
model1111 <- training_data %>%
  nnet(dist ~ speed, data = ., size = 5)

## # weights:  16
## initial  value 59324.165942 
## final  value 58029.000000 
## converged

model1111

## a 1-5-1 network with 16 weights
## inputs: speed 
## output(s): dist 
## options were -

Validasi menggunakan RMSE

rmse(predict(model1111, test_data),
     test_data$dist)

## [1] 250.2519

Cross Validations

set.seed(3333)

random_group <- 
  function(n, probs) { 
  probs <- probs / sum(probs) 
  g <- findInterval(seq(0, 1, length = n),
                    c(0, cumsum(probs)), 
                    rightmost.closed = TRUE)
  names(probs)[sample(g)] 
  }


partition <- 
  function(df, n, probs) { 
  replicate(n,
            split(df, random_group(nrow(df),
                                   probs)),
            FALSE) 
  }


prediction_accuracy_cars <- 
  function(test_and_training) { 
    result <- vector(mode = "numeric", 
                     length = length(test_and_training))
    for (i in seq_along(test_and_training)) {
      training <- 
        test_and_training[[i]]$training
      test <- test_and_training[[i]]$test 
      model <- nnet(dist ~ speed, data = test, size= 5)  
      predictions <- 
        predict(model, test) 
      targets <- test$dist   
      result[i] <- rmse(targets, predictions)  }
    result
  }


cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## # weights:  16
## initial  value 48451.302343 
## final  value 47931.000000 
## converged
## # weights:  16
## initial  value 35428.883955 
## final  value 34675.000000 
## converged
## # weights:  16
## initial  value 56878.548802 
## final  value 55467.000000 
## converged
## # weights:  16
## initial  value 53226.242589 
## final  value 52099.000000 
## converged
## # weights:  16
## initial  value 57069.054848 
## final  value 56250.000000 
## converged

## [1] 218.9315 186.2122 235.5143 228.2521 237.1708

akurasi3 <-
  cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## # weights:  16
## initial  value 52776.016601 
## final  value 51875.000000 
## converged
## # weights:  16
## initial  value 62544.367282 
## final  value 61402.000000 
## converged
## # weights:  16
## initial  value 49992.727867 
## final  value 49180.000000 
## converged
## # weights:  16
## initial  value 38652.804814 
## final  value 37643.000000 
## converged
## # weights:  16
## initial  value 50694.007678 
## final  value 49907.000000 
## converged

mean(akurasi3)

## [1] 222.9475

SVM

Memuat Package

library(kernlab)

Dataset

cars

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Grafik dari Dataset

cars %>% 
  ggplot(aes(x = speed, y = dist)) +
  geom_point()

Model

set.seed(11111)

# Model Regresi Linear ----
model11111 <- training_data %>%
  ksvm(dist ~ speed, data = .)

model11111

## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  2.17636213288387 
## 
## Number of Support Vectors : 19 
## 
## Objective Function Value : -8.4675 
## Training error : 0.271066

Validasi menggunakan RMSE

rmse(predict(model11111, test_data),
     test_data$dist)

## [1] 97.90536

Cross Validations

set.seed(33333)

random_group <- 
  function(n, probs) { 
  probs <- probs / sum(probs) 
  g <- findInterval(seq(0, 1, length = n),
                    c(0, cumsum(probs)), 
                    rightmost.closed = TRUE)
  names(probs)[sample(g)] 
  }


partition <- 
  function(df, n, probs) { 
  replicate(n,
            split(df, random_group(nrow(df),
                                   probs)),
            FALSE) 
  }


prediction_accuracy_cars <- 
  function(test_and_training) { 
    result <- vector(mode = "numeric", 
                     length = length(test_and_training))
    for (i in seq_along(test_and_training)) {
      training <- 
        test_and_training[[i]]$training
      test <- test_and_training[[i]]$test 
      model <- ksvm(dist ~ speed, data = test)  
      predictions <- 
        predict(model, test) 
      targets <- test$dist   
      result[i] <- rmse(targets, predictions)  }
    result
  }


cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

## [1] 58.55086 70.12731 79.32970 40.83409 75.87795

akurasi4 <-
  cars %>%
  partition(5, c(training = 0.6, test = 0.4)) %>%
  prediction_accuracy_cars

mean(akurasi4)

## [1] 65.1345

tribble(
  ~RegresiLinear, ~DecisionTree, ~RandomForest, ~NeuralNetworks, ~SVM,
  mean(akurasi), mean(akurasi1), mean(akurasi2), mean(akurasi3), mean(akurasi4)
)

## # A tibble: 1 x 5
##   RegresiLinear DecisionTree RandomForest NeuralNetworks   SVM
##           <dbl>        <dbl>        <dbl>          <dbl> <dbl>
## 1          72.7         92.3         42.5           223.  65.1

RMSE paling kecil adalah Random forest, model terbaik menggunakan random forest.

Prediksi

Arwan Zhagi

4/13/2021

Regresi Linear

Decision Tree

Random Forest

Neural Network

SVM