Email             :
RPubs            : https://rpubs.com/sabrinayose
Github           : https://github.com/sabrinayose
Jurusan          : Teknik Informatika
Address         : ARA Center, Matana University Tower
                         Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.


1 Import Data

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the training data X from disk.
X_train <- read.csv(file = "dataset.csv")

# Show the first 5 rows.
head(X_train, n = 5)
##   X    Province Age    Wage Life.insured
## 1 0      Banten  24 5000000          Yes
## 2 1 DKI Jakarta  NA 3400000           No
## 3 2  Jawa Barat  60 7350000           No
## 4 3      Banten  34 3500000           No
## 5 4  Jawa Barat  58      NA          Yes
# Show the last 5 rows.
tail(X_train, n = 5)
##     X    Province Age    Wage Life.insured
## 11 10      Banten  32      NA           No
## 12 11      Banten  30 6400000           No
## 13 12  Jawa Barat  30      NA           No
## 14 13 DKI Jakarta  19 2200000          Yes
## 15 14 DKI Jakarta  25 4500000          Yes

2 Menangani Data yang Hilang

Periksa dan hitung jumlah data yang hilang pada setiap kolom data frame.

# Count the missing values in each column.
missing_mask <- is.na(X_train)
colSums(missing_mask)
##            X     Province          Age         Wage Life.insured 
##            0            0            2            3            0

2.1 Dengan Cara Menghapus

Harus hati-hati karena hasil analisis dapat menjadi tidak relevan apabila salah menghapus data.

# Drop every row that contains at least one missing value.
df_rm <- na.omit(X_train)

# Confirm that no missing values remain in any column.
remaining_na <- colSums(is.na(df_rm))
print(remaining_na)
##            X     Province          Age         Wage Life.insured 
##            0            0            0            0            0

2.2 Input Mean/Modus/Median

Untuk mengisi data numerik yang hilang digunakan mean atau median.
Untuk data kategorikal yang hilang digunakan modus (nilai yang paling sering muncul).

# Fill the missing values of the numeric variables with the column mean.
X_train$Age <- replace(
  X_train$Age,
  is.na(X_train$Age),
  mean(X_train$Age, na.rm = TRUE)
)
X_train$Age
##  [1] 24 36 60 34 58 36 21 44 40 51 32 30 30 19 25
X_train$Wage <- replace(
  X_train$Wage,
  is.na(X_train$Wage),
  mean(X_train$Wage, na.rm = TRUE)
)
X_train$Wage
##  [1]  5000000  3400000  7350000  3500000  6279167  8000000  5500000 10000000
##  [9]  9000000 10500000  6279167  6400000  6279167  2200000  4500000
# Fill the missing values of the categorical variable with its statistical
# mode (the most frequent value). Base R's mode() returns the storage mode of
# an object (e.g. "character"), not the most frequent value, so the value is
# taken from a frequency table instead. table() ignores NA by default and
# which.max() picks the first level when several are tied.
life_insured_mode <- names(which.max(table(X_train$Life.insured)))
X_train$Life.insured[is.na(X_train$Life.insured)] <- life_insured_mode
X_train$Life.insured
##  [1] "Yes" "No"  "No"  "No"  "Yes" "No"  "No"  "Yes" "Yes" "Yes" "No"  "No" 
## [13] "No"  "Yes" "Yes"

2.3 Interpolasi Linier

# Reload the raw training data so the interpolation starts from the original
# missing values.
X_train <- read.csv("dataset.csv")
# Linearly interpolate the missing wages between their observed neighbours.
# na.rm = FALSE keeps leading/trailing NAs (which cannot be interpolated) in
# place; with zoo's default na.rm = TRUE they are dropped, the result becomes
# shorter than the data frame and the column assignment fails.
X_train$Wage <- na.approx(X_train$Wage, na.rm = FALSE)
X_train$Wage
##  [1]  5000000  3400000  7350000  3500000  5750000  8000000  5500000 10000000
##  [9]  9000000 10500000  8450000  6400000  4300000  2200000  4500000

2.4 Forward Filling

# library() stops with an error when a package is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Reload the raw training data X.
X_train <- read.csv(file = "dataset.csv")
# Carry the last observed Age downwards into each missing entry.
X_train <- fill(X_train, Age, .direction = "down")
X_train
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  24  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58       NA          Yes
## 6   5 DKI Jakarta  58  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32       NA           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30       NA           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

2.5 Backward Filling

# library() stops with an error when a package is not installed, whereas
# require() only returns FALSE and lets the script fail later on.
library(tidyr)
library(dplyr)

X_train <- read.csv("dataset.csv")    # import training data X
# Fill each missing Wage with the next observed value below it.
X_train <- X_train %>% fill(Wage, .direction = "up")
X_train
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  NA  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58  8000000          Yes
## 6   5 DKI Jakarta  NA  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32  6400000           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30  2200000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

3 Periksa Nilai Duplikat

X_train <- read.csv("dataset.csv")    # import training data X

# List the Age values that occur more than once (NA is counted as a value).
# The bare column name is used inside count(): dplyr verbs look columns up in
# the data frame themselves, and X_train$Age would produce a result column
# literally named "X_train$Age".
X_train %>%
  count(Age) %>%
  filter(n > 1) %>%
  select(-n)
##   Age
## 1  30
## 2  NA

4 Split Data Kategorikal dan Numerik

# Keep only the numeric columns of the data frame.
X_train[vapply(X_train, is.numeric, logical(1))]
##     X Age     Wage
## 1   0  24  5000000
## 2   1  NA  3400000
## 3   2  60  7350000
## 4   3  34  3500000
## 5   4  58       NA
## 6   5  NA  8000000
## 7   6  21  5500000
## 8   7  44 10000000
## 9   8  40  9000000
## 10  9  51 10500000
## 11 10  32       NA
## 12 11  30  6400000
## 13 12  30       NA
## 14 13  19  2200000
## 15 14  25  4500000
# Keep only the character (categorical) columns of the data frame.
X_train[vapply(X_train, is.character, logical(1))]
##       Province Life.insured
## 1       Banten          Yes
## 2  DKI Jakarta           No
## 3   Jawa Barat           No
## 4       Banten           No
## 5   Jawa Barat          Yes
## 6  DKI Jakarta           No
## 7       Banten           No
## 8       Banten          Yes
## 9   Jawa Barat          Yes
## 10 DKI Jakarta          Yes
## 11      Banten           No
## 12      Banten           No
## 13  Jawa Barat           No
## 14 DKI Jakarta          Yes
## 15 DKI Jakarta          Yes

5 Menangani Data Numerik

5.1 Standardisasi

X_train <- read.csv("dataset.csv")    # import training data X
df_rm <- na.omit(X_train)

# Standardise Age and Wage to zero mean and unit standard deviation.
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() strips those so that plain numeric
# vectors, not matrix columns, are stored in the data frame.
df_rm$Age_scaled <- as.numeric(scale(df_rm$Age))
df_rm$Wage_scaled <- as.numeric(scale(df_rm$Wage))

df_rm
##     X    Province Age     Wage Life.insured  Age_scaled Wage_scaled
## 1   0      Banten  24  5000000          Yes -0.78916488 -0.50069606
## 3   2  Jawa Barat  60  7350000           No  1.84138472  0.34277042
## 4   3      Banten  34  3500000           No -0.05845666 -1.03907893
## 7   6      Banten  21  5500000           No -1.00837734 -0.32123511
## 8   7      Banten  44 10000000          Yes  0.67225156  1.29391349
## 9   8  Jawa Barat  40  9000000          Yes  0.37996827  0.93499158
## 10  9 DKI Jakarta  51 10500000          Yes  1.18374732  1.47337444
## 12 11      Banten  30  6400000           No -0.35073995  0.00179461
## 14 13 DKI Jakarta  19  2200000          Yes -1.15451899 -1.50567741
## 15 14 DKI Jakarta  25  4500000          Yes -0.71609406 -0.68015702

5.2 Normalisasi

X_train <- read.csv("dataset.csv")    #import data training X

# Min-max normalisation: linearly rescale x so that its smallest value maps
# to 0 and its largest value maps to 1.
#
# x: numeric vector. A missing value in x makes min()/max() NA and therefore
#    the whole result NA, so pass data without NAs.
# Returns a numeric vector of the same length as x. When all values are equal
# the range is zero and the result is NaN.
normalize <- function(x) {
  # Numerator and denominator are grouped explicitly: without the inner
  # parentheses "/" binds tighter than "-" and only min(x) / max(x) is divided.
  (x - min(x)) / (max(x) - min(x))
}

# Work on the complete rows only.
df_rm <- na.omit(X_train)

# Add the normalised versions of Age and Wage as new columns.
df_rm[["Age_norm"]] <- normalize(df_rm[["Age"]])
df_rm[["Wage_norm"]] <- normalize(df_rm[["Wage"]])

df_rm
##     X    Province Age     Wage Life.insured   Age_norm     Wage_norm
## 1   0      Banten  24  5000000          Yes  4.6833333  2.800000e+06
## 3   2  Jawa Barat  60  7350000           No 40.6833333  5.150000e+06
## 4   3      Banten  34  3500000           No 14.6833333  1.300000e+06
## 7   6      Banten  21  5500000           No  1.6833333  3.300000e+06
## 8   7      Banten  44 10000000          Yes 24.6833333  7.800000e+06
## 9   8  Jawa Barat  40  9000000          Yes 20.6833333  6.800000e+06
## 10  9 DKI Jakarta  51 10500000          Yes 31.6833333  8.300000e+06
## 12 11      Banten  30  6400000           No 10.6833333  4.200000e+06
## 14 13 DKI Jakarta  19  2200000          Yes -0.3166667 -2.095238e-01
## 15 14 DKI Jakarta  25  4500000          Yes  5.6833333  2.300000e+06

5.3 Penskalaan Robust

X_train <- read.csv("dataset.csv")    #import data training X

# Robust scaling: shift x by its first quartile and divide by the
# interquartile range (Q3 - Q1), so extreme values have little influence on
# the scale.
#
# x: numeric vector without missing values (quantile() stops on NA).
# Returns a numeric vector of the same length as x. When Q1 equals Q3 the
# interquartile range is zero and the result is not finite.
#
# NOTE(review): many robust scalers (e.g. scikit-learn's RobustScaler) centre
# on the median; the first quartile used by the original code is kept here.
robust <- function(x) {
  # Compute both quartiles once; names = FALSE keeps the "25%" label out of
  # the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), names = FALSE)
  # Numerator and denominator are grouped explicitly: without the inner
  # parentheses only Q1 is divided by the interquartile range.
  (x - quartiles[1]) / (quartiles[2] - quartiles[1])
}

# Work on the complete rows only.
df_rm <- na.omit(X_train)

# Add the robust-scaled versions of Age and Wage as new columns.
df_rm[["Age_norm"]] <- robust(df_rm[["Age"]])
df_rm[["Wage_norm"]] <- robust(df_rm[["Wage"]])

df_rm
##     X    Province Age     Wage Life.insured Age_norm Wage_norm
## 1   0      Banten  24  5000000          Yes 22.70667   4999999
## 3   2  Jawa Barat  60  7350000           No 58.70667   7349999
## 4   3      Banten  34  3500000           No 32.70667   3499999
## 7   6      Banten  21  5500000           No 19.70667   5499999
## 8   7      Banten  44 10000000          Yes 42.70667   9999999
## 9   8  Jawa Barat  40  9000000          Yes 38.70667   8999999
## 10  9 DKI Jakarta  51 10500000          Yes 49.70667  10499999
## 12 11      Banten  30  6400000           No 28.70667   6399999
## 14 13 DKI Jakarta  19  2200000          Yes 17.70667   2199999
## 15 14 DKI Jakarta  25  4500000          Yes 23.70667   4499999

6 Penanganan Data Pencilan

6.1 Metode Statistik

Distribusi Gaussian

X_train <- read.csv("dataset.csv")    #import data training X

# Flag the values of x that lie more than n_sd standard deviations away from
# the mean (Gaussian cut-off rule).
#
# x:    numeric vector. NA values make mean()/sd() NA, so every flag is NA.
# n_sd: width of the accepted band in standard deviations (default 1, the
#       value that was previously hard-coded).
# Returns a logical vector of the same length as x (logical(0) for empty
# input); TRUE marks an outlier.
outliers <- function(x, n_sd = 1) {
  sample_mean <- mean(x)
  sample_std <- sd(x)
  cut_off <- sample_std * n_sd
  lower <- sample_mean - cut_off
  upper <- sample_mean + cut_off

  # Element-wise comparison with "|" replaces the per-element sapply() call,
  # which returned list() instead of a logical vector for empty input.
  x < lower | x > upper
}

# Work on the complete rows only.
df_rm <- na.omit(X_train)

# Keep only the rows whose Age is flagged as an outlier.
age_is_outlier <- outliers(df_rm$Age)
df_rm[age_is_outlier, ]
##     X    Province Age     Wage Life.insured
## 3   2  Jawa Barat  60  7350000           No
## 7   6      Banten  21  5500000           No
## 10  9 DKI Jakarta  51 10500000          Yes
## 14 13 DKI Jakarta  19  2200000          Yes

Boxplot atau Rentang Interkuartil (IQR)

# Reload the raw data and draw a box plot of the Wage column.
X_train <- read.csv(file = "dataset.csv")
boxplot(X_train$Wage)

7 Encoding Data Kategorikal

# Reload the raw data and report its dimensions (rows, columns).
X_train <- read.csv(file = "dataset.csv")
dim(X_train)
## [1] 15  5
# Reload the raw data and show its first 5 rows.
X_train <- read.csv(file = "dataset.csv")
head(X_train, n = 5)
##   X    Province Age    Wage Life.insured
## 1 0      Banten  24 5000000          Yes
## 2 1 DKI Jakarta  NA 3400000           No
## 3 2  Jawa Barat  60 7350000           No
## 4 3      Banten  34 3500000           No
## 5 4  Jawa Barat  58      NA          Yes
# Reload the raw data.
X_train <- read.csv(file = "dataset.csv")
# Keep the character (categorical) columns and count their missing values.
X_train_char <- X_train[vapply(X_train, is.character, logical(1))]
colSums(is.na(X_train_char))
##     Province Life.insured 
##            0            0

7.1 Pelabelan

X_train <- read.csv("dataset.csv")    # import training data X
library(superml)
## Loading required package: R6
# Label-encode only the categorical columns. Age and Wage are numeric:
# passing them through the encoder replaces their magnitudes with
# order-of-appearance codes and turns NA into a category of its own.
X_train_label <- LabelEncoder$new()
X_train$Province <- X_train_label$fit_transform(X_train$Province)
# fit_transform() refits the encoder on the column it is given, so the same
# object is reused for the second column.
X_train$Life.insured <- X_train_label$fit_transform(X_train$Life.insured)
X_train
##     X Province Age     Wage Life.insured
## 1   0        0  24  5000000            0
## 2   1        1  NA  3400000            1
## 3   2        2  60  7350000            1
## 4   3        0  34  3500000            1
## 5   4        2  58       NA            0
## 6   5        1  NA  8000000            1
## 7   6        0  21  5500000            1
## 8   7        0  44 10000000            0
## 9   8        2  40  9000000            0
## 10  9        1  51 10500000            0
## 11 10        0  32       NA            1
## 12 11        0  30  6400000            1
## 13 12        2  30       NA            1
## 14 13        1  19  2200000            0
## 15 14        1  25  4500000            0

7.2 Pemetaan Kustom

X_train <- read.csv("dataset.csv")    # import training data X

# Map the "Yes"/"No" answers to the integers 1/0 through a named lookup
# vector. Assigning 1 and 0 into the character column would store the strings
# "1" and "0" instead of numbers. Missing or unexpected values become NA.
life_insured_codes <- c(Yes = 1L, No = 0L)
X_train$Life.insured <- unname(life_insured_codes[X_train$Life.insured])

X_train
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000            1
## 2   1 DKI Jakarta  NA  3400000            0
## 3   2  Jawa Barat  60  7350000            0
## 4   3      Banten  34  3500000            0
## 5   4  Jawa Barat  58       NA            1
## 6   5 DKI Jakarta  NA  8000000            0
## 7   6      Banten  21  5500000            0
## 8   7      Banten  44 10000000            1
## 9   8  Jawa Barat  40  9000000            1
## 10  9 DKI Jakarta  51 10500000            1
## 11 10      Banten  32       NA            0
## 12 11      Banten  30  6400000            0
## 13 12  Jawa Barat  30       NA            0
## 14 13 DKI Jakarta  19  2200000            1
## 15 14 DKI Jakarta  25  4500000            1

7.3 Variabel Dummy

library(fastDummies)
# Reload the raw training data X.
X_train <- read.csv(file = "dataset.csv")

# Append 0/1 indicator columns for the categorical columns (here Province and
# Life.insured), one column per distinct value.
X_train <- dummy_cols(.data = X_train)
X_train
##     X    Province Age     Wage Life.insured Province_Banten
## 1   0      Banten  24  5000000          Yes               1
## 2   1 DKI Jakarta  NA  3400000           No               0
## 3   2  Jawa Barat  60  7350000           No               0
## 4   3      Banten  34  3500000           No               1
## 5   4  Jawa Barat  58       NA          Yes               0
## 6   5 DKI Jakarta  NA  8000000           No               0
## 7   6      Banten  21  5500000           No               1
## 8   7      Banten  44 10000000          Yes               1
## 9   8  Jawa Barat  40  9000000          Yes               0
## 10  9 DKI Jakarta  51 10500000          Yes               0
## 11 10      Banten  32       NA           No               1
## 12 11      Banten  30  6400000           No               1
## 13 12  Jawa Barat  30       NA           No               0
## 14 13 DKI Jakarta  19  2200000          Yes               0
## 15 14 DKI Jakarta  25  4500000          Yes               0
##    Province_DKI Jakarta Province_Jawa Barat Life.insured_No Life.insured_Yes
## 1                     0                   0               0                1
## 2                     1                   0               1                0
## 3                     0                   1               1                0
## 4                     0                   0               1                0
## 5                     0                   1               0                1
## 6                     1                   0               1                0
## 7                     0                   0               1                0
## 8                     0                   0               0                1
## 9                     0                   1               0                1
## 10                    1                   0               0                1
## 11                    0                   0               1                0
## 12                    0                   0               1                0
## 13                    0                   1               1                0
## 14                    1                   0               0                1
## 15                    1                   0               0                1

7.4 Validasi Silang K-fold (K-fold Cross-Validation)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ISLR)
library(lattice)

# Fit a linear regression of Age on all other columns and evaluate it with
# 10-fold cross-validation via caret.
X_train <- read.csv("dataset.csv")    # import training data X

# Recode the Yes/No answers as 1/0. The column is character, so the values
# are stored as the strings "1" and "0".
X_train$Life.insured[X_train$Life.insured=="Yes"] <- 1
X_train$Life.insured[X_train$Life.insured=="No"] <- 0

# Keep only the complete rows (the printed summary reports 10 samples).
X_train1 <- na.omit(X_train)
# Fix the random seed so the fold assignment is reproducible.
set.seed(100)
# Resampling settings: 10-fold cross-validation, keeping the hold-out
# predictions. NOTE(review): despite its name, `dataset` holds the
# trainControl object, not data.
dataset <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# NOTE(review): `nb_fit` is a linear regression (method = "lm"). The printed
# summary says the only tuning parameter 'intercept' was held constant, so
# tuneLength = 14 appears to have no effect -- confirm.
# `Age~.` uses every remaining column as a predictor, including the row
# index X.
nb_fit <- train(Age~., data = X_train1, method = "lm", trControl = dataset, tuneLength =14)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# The summary shows training sizes of 9 out of 10 samples, i.e. one held-out
# row per fold, and reports Rsquared as NaN, which matches the warning above.
nb_fit
## Linear Regression 
## 
## 10 samples
##  4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9, 9, 9, 9, 9, 9, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   11.29568  NaN       11.29568
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE