Email             :
RPubs            : https://rpubs.com/veronicayose/
Github           : https://github.com/veronicayose/
Jurusan          : Teknik Informatika
Address         : ARA Center, Matana University Tower
                          Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.


1 Import Data

# Import the data from dataset.csv and display the whole data frame.
x_train <- read.csv(file = "dataset.csv")
print(x_train)
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  NA  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58       NA          Yes
## 6   5 DKI Jakarta  NA  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32       NA           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30       NA           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

a. Enam Baris Pertama

# Show the first six rows of the data frame.
head(x_train, n = 6L)
##   X    Province Age    Wage Life.insured
## 1 0      Banten  24 5000000          Yes
## 2 1 DKI Jakarta  NA 3400000           No
## 3 2  Jawa Barat  60 7350000           No
## 4 3      Banten  34 3500000           No
## 5 4  Jawa Barat  58      NA          Yes
## 6 5 DKI Jakarta  NA 8000000           No

b. Enam Baris Terakhir

# Show the last six rows of the data frame.
tail(x_train, n = 6L)
##     X    Province Age     Wage Life.insured
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32       NA           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30       NA           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

2 Menangani Data yang Hilang

Untuk mengecek banyaknya data yang hilang

# Count the missing values in every column.
missing_mask <- is.na(x_train)
colSums(missing_mask)
##            X     Province          Age         Wage Life.insured 
##            0            0            2            3            0

2.1 Dengan cara menghapus

# Drop every row that contains an NA, then confirm that no NA is left.
complete_rows <- na.omit(x_train)
colSums(is.na(complete_rows))
##            X     Province          Age         Wage Life.insured 
##            0            0            0            0            0

2.2 Input Mean/Modus/Median

a. Mengisi numerik yang hilang dengan Mean

# Replace each missing Age with the mean of the observed ages.
age_missing <- is.na(x_train$Age)
x_train$Age[age_missing] <- mean(x_train$Age, na.rm = TRUE)
colSums(is.na(x_train))
##            X     Province          Age         Wage Life.insured 
##            0            0            0            3            0

b. Mengisi numerik yang hilang dengan Modus

# Fill the missing Wage values with the statistical mode, i.e. the most
# frequent observed value. base::mode() is NOT that: it returns the storage
# mode of its argument (the string "numeric" here), which would be written
# into the NA slots and silently turn the whole column into character.
wage_observed <- x_train$Wage[!is.na(x_train$Wage)]
wage_values <- unique(wage_observed)
wage_counts <- tabulate(match(wage_observed, wage_values))
# which.max() returns the first maximum, so ties go to the value seen first.
x_train$Wage[is.na(x_train$Wage)] <- wage_values[which.max(wage_counts)]
colSums(is.na(x_train))
##            X     Province          Age         Wage Life.insured 
##            0            0            0            0            0

2.3 Interpolasi Linear

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Import the data from dataset.csv
x_train <- read.csv("dataset.csv")

# Linear interpolation of the missing Wage values.
# na.rm = FALSE keeps leading/trailing NAs (which cannot be interpolated) in
# place, so the result always has one value per row; with the default
# na.rm = TRUE they are dropped and the column assignment fails on a length
# mismatch.
x_train$Wage <- na.approx(x_train$Wage, na.rm = FALSE)
colSums(is.na(x_train))
##            X     Province          Age         Wage Life.insured 
##            0            0            2            0            0

2.4 Forward Filling

# Import the data from dataset.csv
x_train <- read.csv("dataset.csv")

# Forward filling
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first use.
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Carry the last observed Age downwards into each missing entry.
x_train <- fill(x_train, Age, .direction = "down")
colSums(is.na(x_train))
##            X     Province          Age         Wage Life.insured 
##            0            0            0            3            0

2.5 Backward Filling

# Backward filling
# library() fails loudly if a package is missing; require() would only
# return FALSE.
library(tidyr)
library(dplyr)

# Fill each missing Wage with the next observed value below it.
x_train <- x_train %>% fill(Wage, .direction = "up")
colSums(is.na(x_train))
##            X     Province          Age         Wage Life.insured 
##            0            0            0            0            0

3 Periksa Nilai Duplikat

# Import the data from dataset.csv
x_train <- read.csv("dataset.csv")

# Check for duplicated values in the Age column of x_train.
# count() takes the bare column name; passing x_train$Age bypasses data
# masking and produces a result column literally named `x_train$Age`.
x_train %>% count(Age) %>% filter(n > 1) %>% select(-n)
##   Age
## 1  30
## 2  NA
# Show how often each duplicated value occurs.
# Note: n is the number of occurrences (NA is counted like any other value).
x_train %>% count(Age) %>% filter(n > 1)
##   Age n
## 1  30 2
## 2  NA 2

4 Split Data Kategorikal dan Numerik

a. Memilah data numerik

# Keep only the numeric columns.
x_train[vapply(x_train, is.numeric, logical(1))]
##     X Age     Wage
## 1   0  24  5000000
## 2   1  NA  3400000
## 3   2  60  7350000
## 4   3  34  3500000
## 5   4  58       NA
## 6   5  NA  8000000
## 7   6  21  5500000
## 8   7  44 10000000
## 9   8  40  9000000
## 10  9  51 10500000
## 11 10  32       NA
## 12 11  30  6400000
## 13 12  30       NA
## 14 13  19  2200000
## 15 14  25  4500000

b. Memilah data kategorikal

# Keep only the character (categorical) columns.
x_train[vapply(x_train, is.character, logical(1))]
##       Province Life.insured
## 1       Banten          Yes
## 2  DKI Jakarta           No
## 3   Jawa Barat           No
## 4       Banten           No
## 5   Jawa Barat          Yes
## 6  DKI Jakarta           No
## 7       Banten           No
## 8       Banten          Yes
## 9   Jawa Barat          Yes
## 10 DKI Jakarta          Yes
## 11      Banten           No
## 12      Banten           No
## 13  Jawa Barat           No
## 14 DKI Jakarta          Yes
## 15 DKI Jakarta          Yes

5 Menangani Data Numerik

5.1 Standardisasi

# Reload the raw data and keep only the rows without any NA.
x_train <- read.csv(file = "dataset.csv")
x_del <- na.omit(x_train)
print(x_del)
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 12 11      Banten  30  6400000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes
# Standardise Age and Wage (centre on the mean, divide by the standard
# deviation). scale() returns a one-column matrix carrying "scaled:center"
# and "scaled:scale" attributes; as.numeric() stores a plain numeric vector
# so the data frame does not end up with matrix columns.
x_del$Age_stan <- as.numeric(scale(x_del$Age))
x_del$Wage_stan <- as.numeric(scale(x_del$Wage))
x_del
##     X    Province Age     Wage Life.insured    Age_stan   Wage_stan
## 1   0      Banten  24  5000000          Yes -0.78916488 -0.50069606
## 3   2  Jawa Barat  60  7350000           No  1.84138472  0.34277042
## 4   3      Banten  34  3500000           No -0.05845666 -1.03907893
## 7   6      Banten  21  5500000           No -1.00837734 -0.32123511
## 8   7      Banten  44 10000000          Yes  0.67225156  1.29391349
## 9   8  Jawa Barat  40  9000000          Yes  0.37996827  0.93499158
## 10  9 DKI Jakarta  51 10500000          Yes  1.18374732  1.47337444
## 12 11      Banten  30  6400000           No -0.35073995  0.00179461
## 14 13 DKI Jakarta  19  2200000          Yes -1.15451899 -1.50567741
## 15 14 DKI Jakarta  25  4500000          Yes -0.71609406 -0.68015702

5.2 Normalisasi

# Reload the raw data and drop the rows that contain an NA.
x_train <- read.csv(file = "dataset.csv")
x_del <- na.omit(x_train)
# Min-max normalisation: linearly rescale `x` so that its minimum maps to 0
# and its maximum maps to 1.
#
# x: numeric vector without NA (an NA makes range() return NA and therefore
#    every element of the result NA).
# Returns a numeric vector of the same length as `x`. A constant vector has
# zero range, so the division yields NaN.
normalize <- function(x) {
  limits <- range(x)
  # The subtraction must be parenthesised: without it only min(x) is divided
  # by the range and x is merely shifted by a small constant.
  (x - limits[1]) / (limits[2] - limits[1])
}
# Display the complete rows before normalisation.
print(x_del)
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 12 11      Banten  30  6400000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes
# Add normalised copies of Age and Wage, then display the result.
x_del[["Age_norm"]] <- normalize(x_del[["Age"]])
x_del[["Wage_norm"]] <- normalize(x_del[["Wage"]])

print(x_del)
##     X    Province Age     Wage Life.insured Age_norm Wage_norm
## 1   0      Banten  24  5000000          Yes 23.53659   5000000
## 3   2  Jawa Barat  60  7350000           No 59.53659   7350000
## 4   3      Banten  34  3500000           No 33.53659   3500000
## 7   6      Banten  21  5500000           No 20.53659   5500000
## 8   7      Banten  44 10000000          Yes 43.53659  10000000
## 9   8  Jawa Barat  40  9000000          Yes 39.53659   9000000
## 10  9 DKI Jakarta  51 10500000          Yes 50.53659  10500000
## 12 11      Banten  30  6400000           No 29.53659   6400000
## 14 13 DKI Jakarta  19  2200000          Yes 18.53659   2200000
## 15 14 DKI Jakarta  25  4500000          Yes 24.53659   4500000

5.3 Penskalaan Robust

# Reload the raw data and drop the rows that contain an NA.
x_train <- read.csv(file = "dataset.csv")
x_del <- na.omit(x_train)
# Quartile-based scaling: subtract the first quartile from `x` and divide by
# the interquartile range (third quartile minus first quartile).
#
# x: numeric vector without NA (quantile() rejects NA by default).
# Returns a numeric vector of the same length as `x`.
robust <- function(x) {
  quartiles <- quantile(x)
  first_quartile <- quartiles[2]
  third_quartile <- quartiles[4]
  return((x - first_quartile) / (third_quartile - first_quartile))
}
# Display the complete rows before robust scaling.
print(x_del)
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 12 11      Banten  30  6400000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes
# Apply the quartile-based robust() scaler defined above. Calling normalize()
# here would leave robust() unused and make the *_robus columns a duplicate
# of the normalisation section.
x_del$Age_robus <- robust(x_del$Age)
x_del$Wage_robus <- robust(x_del$Wage)

x_del
##     X    Province Age     Wage Life.insured Age_robus Wage_robus
## 1   0      Banten  24  5000000          Yes  23.53659    5000000
## 3   2  Jawa Barat  60  7350000           No  59.53659    7350000
## 4   3      Banten  34  3500000           No  33.53659    3500000
## 7   6      Banten  21  5500000           No  20.53659    5500000
## 8   7      Banten  44 10000000          Yes  43.53659   10000000
## 9   8  Jawa Barat  40  9000000          Yes  39.53659    9000000
## 10  9 DKI Jakarta  51 10500000          Yes  50.53659   10500000
## 12 11      Banten  30  6400000           No  29.53659    6400000
## 14 13 DKI Jakarta  19  2200000          Yes  18.53659    2200000
## 15 14 DKI Jakarta  25  4500000          Yes  24.53659    4500000

6 Penanganan Data Pencilan

6.1 Metode Statistik

Distribusi Gaussian

# Reload the raw data; only complete rows are used for outlier detection.
x_train <- read.csv(file = "dataset.csv")
x_train <- na.omit(x_train)
# Flag outliers ("pencilan") under a Gaussian assumption: a value is an
# outlier when it lies more than `n_sd` sample standard deviations away from
# the sample mean.
#
# x:    numeric vector.
# n_sd: width of the accepted band in standard deviations (default 1, the
#       cut-off this function has always used).
# Returns a logical vector of the same length as `x` (TRUE = outlier), so it
# can be used directly as a row index. An empty `x` gives logical(0). An NA
# in `x` makes mean() and sd() return NA, so every element of the result is
# NA.
pencilan <- function(x, n_sd = 1) {
  sample_mean <- mean(x)
  sample_std <- sd(x)
  cut_off <- sample_std * n_sd
  lower <- sample_mean - cut_off
  upper <- sample_mean + cut_off

  # Comparison operators are vectorised; `|` (not the scalar `||`) combines
  # the two masks element by element.
  x < lower | x > upper
}
# Rows whose Age is flagged as an outlier.
age_outliers <- pencilan(x_train$Age)
x_train[age_outliers, ]
##     X    Province Age     Wage Life.insured
## 3   2  Jawa Barat  60  7350000           No
## 7   6      Banten  21  5500000           No
## 10  9 DKI Jakarta  51 10500000          Yes
## 14 13 DKI Jakarta  19  2200000          Yes
# Rows whose Wage is flagged as an outlier.
wage_outliers <- pencilan(x_train$Wage)
x_train[wage_outliers, ]
##     X    Province Age     Wage Life.insured
## 4   3      Banten  34  3500000           No
## 8   7      Banten  44 10000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 14 13 DKI Jakarta  19  2200000          Yes

6.2 Boxplot atau Rentang Interkuartil (IQR)

# Reload the raw data and draw one box plot for each numeric variable.
x_train <- read.csv(file = "dataset.csv")
boxplot(x_train[["Age"]])

boxplot(x_train[["Wage"]])

7 Encoding Data Kategorikal

# Number of rows and columns of the data frame.
c(nrow(x_train), ncol(x_train))
## [1] 15  5
# Show the first five rows.
head(x_train, n = 5)
##   X    Province Age    Wage Life.insured
## 1 0      Banten  24 5000000          Yes
## 2 1 DKI Jakarta  NA 3400000           No
## 3 2  Jawa Barat  60 7350000           No
## 4 3      Banten  34 3500000           No
## 5 4  Jawa Barat  58      NA          Yes
# Extract the character (categorical) columns and count their missing values.
x_Category <- x_train[vapply(x_train, is.character, logical(1))]

colSums(is.na(x_Category))
##     Province Life.insured 
##            0            0

7.1 Pelabelan

# Import the training data.
x_train <- read.csv(file = "dataset.csv")
library(superml)
## Loading required package: R6
# One encoder object is re-fitted on each categorical column in turn; every
# column is replaced by the integer labels that fit_transform() returns.
x_label <- LabelEncoder$new()
for (column in c("Province", "Life.insured")) {
  x_train[[column]] <- x_label$fit_transform(x_train[[column]])
}
print(x_train)
##     X Province Age     Wage Life.insured
## 1   0        0  24  5000000            0
## 2   1        1  NA  3400000            1
## 3   2        2  60  7350000            1
## 4   3        0  34  3500000            1
## 5   4        2  58       NA            0
## 6   5        1  NA  8000000            1
## 7   6        0  21  5500000            1
## 8   7        0  44 10000000            0
## 9   8        2  40  9000000            0
## 10  9        1  51 10500000            0
## 11 10        0  32       NA            1
## 12 11        0  30  6400000            1
## 13 12        2  30       NA            1
## 14 13        1  19  2200000            0
## 15 14        1  25  4500000            0

7.2 Pemetaan Kustom

x_train <- read.csv("dataset.csv")

# Explicit lookup tables: category label -> integer code.
province_codes <- c("Banten" = 1L, "DKI Jakarta" = 2L, "Jawa Barat" = 3L)
insured_codes <- c("Yes" = 1L, "No" = 2L)

# Indexing a named vector by the labels maps every row in one step and yields
# integer columns. Assigning numbers into the character columns one label at
# a time would coerce the codes to the strings "1", "2", "3" instead.
# as.character() makes the lookup go by label even if a column is a factor.
# Labels that are not in the table become NA.
x_train$Province <- unname(province_codes[as.character(x_train$Province)])

x_train$Life.insured <- unname(insured_codes[as.character(x_train$Life.insured)])

print(x_train)
##     X Province Age     Wage Life.insured
## 1   0        1  24  5000000            1
## 2   1        2  NA  3400000            2
## 3   2        3  60  7350000            2
## 4   3        1  34  3500000            2
## 5   4        3  58       NA            1
## 6   5        2  NA  8000000            2
## 7   6        1  21  5500000            2
## 8   7        1  44 10000000            1
## 9   8        3  40  9000000            1
## 10  9        2  51 10500000            1
## 11 10        1  32       NA            2
## 12 11        1  30  6400000            2
## 13 12        3  30       NA            2
## 14 13        2  19  2200000            1
## 15 14        2  25  4500000            1

7.3 Variabel Dummy

library(fastDummies)
# Reload the raw data and append the dummy (indicator) columns generated by
# dummy_cols(), then display the widened data frame.
x_train <- dummy_cols(.data = read.csv("dataset.csv"))
print(x_train)
##     X    Province Age     Wage Life.insured Province_Banten
## 1   0      Banten  24  5000000          Yes               1
## 2   1 DKI Jakarta  NA  3400000           No               0
## 3   2  Jawa Barat  60  7350000           No               0
## 4   3      Banten  34  3500000           No               1
## 5   4  Jawa Barat  58       NA          Yes               0
## 6   5 DKI Jakarta  NA  8000000           No               0
## 7   6      Banten  21  5500000           No               1
## 8   7      Banten  44 10000000          Yes               1
## 9   8  Jawa Barat  40  9000000          Yes               0
## 10  9 DKI Jakarta  51 10500000          Yes               0
## 11 10      Banten  32       NA           No               1
## 12 11      Banten  30  6400000           No               1
## 13 12  Jawa Barat  30       NA           No               0
## 14 13 DKI Jakarta  19  2200000          Yes               0
## 15 14 DKI Jakarta  25  4500000          Yes               0
##    Province_DKI Jakarta Province_Jawa Barat Life.insured_No Life.insured_Yes
## 1                     0                   0               0                1
## 2                     1                   0               1                0
## 3                     0                   1               1                0
## 4                     0                   0               1                0
## 5                     0                   1               0                1
## 6                     1                   0               1                0
## 7                     0                   0               1                0
## 8                     0                   0               0                1
## 9                     0                   1               0                1
## 10                    1                   0               0                1
## 11                    0                   0               1                0
## 12                    0                   0               1                0
## 13                    0                   1               1                0
## 14                    1                   0               0                1
## 15                    1                   0               0                1

7.4 K-fold/Cross-fold

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ISLR)
library(lattice)
# Reload the data and recode Life.insured ("Yes" -> 1, "No" -> 2).
x_train <- read.csv(file = "dataset.csv")
insured_yes <- x_train$Life.insured == "Yes"
x_train$Life.insured[insured_yes] <- 1
insured_no <- x_train$Life.insured == "No"
x_train$Life.insured[insured_no] <- 2
# Only complete rows are passed to the model.
y_train <- na.omit(x_train)
set.seed(100)

# 10-fold cross-validation that keeps the hold-out predictions.
dataset <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

# Linear regression of Age on all remaining columns.
nb_fit <- train(
  Age ~ .,
  data = y_train,
  method = "lm",
  trControl = dataset,
  tuneLength = 14
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Print the cross-validated model summary.
print(nb_fit)
## Linear Regression 
## 
## 10 samples
##  4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9, 9, 9, 9, 9, 9, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   11.29568  NaN       11.29568
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE