Email             :
RPubs            : https://rpubs.com/rizalandriana
Github           : https://github.com/rizalandriana
Jurusan          : Teknik Informatika
Address         : ARA Center, Matana University Tower
                         Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.


1 Persiapan data

1.1 Import Data

# Load the raw dataset from the CSV file and display every row
df <- read.csv(file = "dataset.csv")
print(df)
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  NA  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58       NA          Yes
## 6   5 DKI Jakarta  NA  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32       NA           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30       NA           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

1.2 Menangani data yang hilang

1.2.1 Melihat jumlah data yang hilang

# Work on a copy and count the missing values in each column
df1 <- df
vapply(df1, function(column) sum(is.na(column)), integer(1))
##            X     Province          Age         Wage Life.insured 
##            0            0            2            3            0

1.2.2 Menghapus baris yang memiliki missing values

# Keep only the rows that have no missing value in any column
df_by_delete <- subset(df, subset = complete.cases(df))
df_by_delete
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 12 11      Banten  30  6400000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

1.2.3 Mean, Median, Modus

# Start again from an untouched copy of the raw data
df1 <- df

# Quantitative column: fill the missing ages with the mean of the observed ages
df1$Age <- replace(df1$Age, is.na(df$Age), mean(df$Age, na.rm = TRUE))

# Return the mode (most frequent value) of a vector.
#
# x      vector of any atomic type.
# na.rm  if TRUE, missing values are dropped before counting; with the
#        default FALSE, NA is counted like any other value and can itself be
#        returned as the mode.
#
# Ties are resolved in favour of the value that appears first in x.
# An empty input yields NA.
hitung_modus <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  nilai_beda <- unique(x)
  # match() maps every element to the index of its distinct value, so
  # tabulate() yields one count per distinct value, in first-seen order
  frekuensi <- tabulate(match(x, nilai_beda))
  nilai_beda[which.max(frekuensi)]
}

# Fill the missing wages with the most frequent observed wage
df1$Wage <- replace(df1$Wage, is.na(df$Wage), hitung_modus(na.omit(df$Wage)))

# Fill the missing ages with the median of the observed ages.
# The positions come from the raw `df`, so this overwrites the mean-based
# fill applied to the same positions earlier in this section.
df1$Age <- replace(df1$Age, is.na(df$Age), median(df$Age, na.rm = TRUE))
df1
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  32  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58  5000000          Yes
## 6   5 DKI Jakarta  32  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32  5000000           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30  5000000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

1.2.4 Interpolasi linier

library(zoo)
# Linearly interpolate the gaps in each numeric column.
# na.rm = FALSE keeps leading/trailing NAs in place (they cannot be
# interpolated), so the result always has nrow(df) elements and the column
# assignment cannot fail on a length mismatch.
df1 <- df
df1$Wage <- na.approx(df$Wage, na.rm = FALSE)
df1$Age <- na.approx(df$Age, na.rm = FALSE)
df1
##     X    Province  Age     Wage Life.insured
## 1   0      Banten 24.0  5000000          Yes
## 2   1 DKI Jakarta 42.0  3400000           No
## 3   2  Jawa Barat 60.0  7350000           No
## 4   3      Banten 34.0  3500000           No
## 5   4  Jawa Barat 58.0  5750000          Yes
## 6   5 DKI Jakarta 39.5  8000000           No
## 7   6      Banten 21.0  5500000           No
## 8   7      Banten 44.0 10000000          Yes
## 9   8  Jawa Barat 40.0  9000000          Yes
## 10  9 DKI Jakarta 51.0 10500000          Yes
## 11 10      Banten 32.0  8450000           No
## 12 11      Banten 30.0  6400000           No
## 13 12  Jawa Barat 30.0  4300000           No
## 14 13 DKI Jakarta 19.0  2200000          Yes
## 15 14 DKI Jakarta 25.0  4500000          Yes

1.2.5 Forward and backward filling

library(tidyr)
df1 <- df

# Forward fill: each gap takes the last observed value above it
fill(df1, Age, Wage, .direction = "down")
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  24  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58  3500000          Yes
## 6   5 DKI Jakarta  58  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32 10500000           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30  6400000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes
# Backward fill: each gap takes the next observed value below it
fill(df1, Age, Wage, .direction = "up")
##     X    Province Age     Wage Life.insured
## 1   0      Banten  24  5000000          Yes
## 2   1 DKI Jakarta  60  3400000           No
## 3   2  Jawa Barat  60  7350000           No
## 4   3      Banten  34  3500000           No
## 5   4  Jawa Barat  58  8000000          Yes
## 6   5 DKI Jakarta  21  8000000           No
## 7   6      Banten  21  5500000           No
## 8   7      Banten  44 10000000          Yes
## 9   8  Jawa Barat  40  9000000          Yes
## 10  9 DKI Jakarta  51 10500000          Yes
## 11 10      Banten  32  6400000           No
## 12 11      Banten  30  6400000           No
## 13 12  Jawa Barat  30  2200000           No
## 14 13 DKI Jakarta  19  2200000          Yes
## 15 14 DKI Jakarta  25  4500000          Yes

1.3 Periksa nilai duplikat

df1 <- df

# Keep the first (topmost) row seen for each province
subset(df1, !duplicated(Province))
##   X    Province Age    Wage Life.insured
## 1 0      Banten  24 5000000          Yes
## 2 1 DKI Jakarta  NA 3400000           No
## 3 2  Jawa Barat  60 7350000           No
# Keep the last (bottommost) row seen for each province
subset(df1, !duplicated(Province, fromLast = TRUE))
##     X    Province Age    Wage Life.insured
## 12 11      Banten  30 6400000           No
## 13 12  Jawa Barat  30      NA           No
## 15 14 DKI Jakarta  25 4500000          Yes

1.4 Memisahkan data numerik dan kategorikal

# Keep only the numeric columns
df_numeric <- Filter(is.numeric, df)
df_numeric
##     X Age     Wage
## 1   0  24  5000000
## 2   1  NA  3400000
## 3   2  60  7350000
## 4   3  34  3500000
## 5   4  58       NA
## 6   5  NA  8000000
## 7   6  21  5500000
## 8   7  44 10000000
## 9   8  40  9000000
## 10  9  51 10500000
## 11 10  32       NA
## 12 11  30  6400000
## 13 12  30       NA
## 14 13  19  2200000
## 15 14  25  4500000
# Keep only the non-numeric (categorical) columns
df_categorical <- Filter(Negate(is.numeric), df)
df_categorical
##       Province Life.insured
## 1       Banten          Yes
## 2  DKI Jakarta           No
## 3   Jawa Barat           No
## 4       Banten           No
## 5   Jawa Barat          Yes
## 6  DKI Jakarta           No
## 7       Banten           No
## 8       Banten          Yes
## 9   Jawa Barat          Yes
## 10 DKI Jakarta          Yes
## 11      Banten           No
## 12      Banten           No
## 13  Jawa Barat           No
## 14 DKI Jakarta          Yes
## 15 DKI Jakarta          Yes

1.5 Menangani data numerik

1.5.1 Standardisasi

# Z-score standardisation: select the numeric columns, forward-fill the gaps,
# then centre and scale every column (the result is a matrix)
df_numeric <- df[sapply(df, is.numeric)] %>%
  fill(Age, Wage) %>%
  scale()
df_numeric
##                X        Age         Wage
##  [1,] -1.5652476 -0.8866371 -0.501226686
##  [2,] -1.3416408 -0.8866371 -1.080958757
##  [3,] -1.1180340  1.6332789  0.350254793
##  [4,] -0.8944272 -0.1866604 -1.044725502
##  [5,] -0.6708204  1.4932836 -1.044725502
##  [6,] -0.4472136  1.4932836  0.585770946
##  [7,] -0.2236068 -1.0966301 -0.320060414
##  [8,]  0.0000000  0.5133162  1.310436035
##  [9,]  0.2236068  0.2333256  0.948103491
## [10,]  0.4472136  1.0032999  1.491602307
## [11,]  0.6708204 -0.3266558  1.491602307
## [12,]  0.8944272 -0.4666511  0.006038876
## [13,]  1.1180340 -0.4666511  0.006038876
## [14,]  1.3416408 -1.2366254 -1.515757810
## [15,]  1.5652476 -0.8166394 -0.682392958
## attr(,"scaled:center")
##            X          Age         Wage 
## 7.000000e+00 3.666667e+01 6.383333e+06 
## attr(,"scaled:scale")
##            X          Age         Wage 
## 4.472136e+00 1.428619e+01 2.759896e+06

1.5.2 Normalisasi Min-Max

# Numeric columns only, with the gaps in Age and Wage forward-filled
df_numeric <- df[sapply(df, is.numeric)] %>%
  fill(Age, Wage)

# Min-max normalisation: rescale x linearly so that its smallest value maps
# to 0 and its largest to 1.
#
# x      numeric vector.
# na.rm  if TRUE (default), missing values are ignored when finding the
#        minimum and maximum and stay NA in the result; if FALSE, any NA in x
#        makes every element of the result NA.
#
# A constant vector has zero range, so the result is NaN for every element.
normalize <- function(x, na.rm = TRUE) {
  batas <- range(x, na.rm = na.rm)
  (x - batas[1]) / (batas[2] - batas[1])
}

df_numeric$Age <- normalize(df_numeric$Age)
# Wage is rescaled from its own values, not from the Age column
df_numeric$Wage <- normalize(df_numeric$Wage)

df_numeric
##     X        Age      Wage
## 1   0 0.12195122 0.3373494
## 2   1 0.12195122 0.1445783
## 3   2 1.00000000 0.6204819
## 4   3 0.36585366 0.1566265
## 5   4 0.95121951 0.1566265
## 6   5 0.95121951 0.6987952
## 7   6 0.04878049 0.3975904
## 8   7 0.60975610 0.9397590
## 9   8 0.51219512 0.8192771
## 10  9 0.78048780 1.0000000
## 11 10 0.31707317 1.0000000
## 12 11 0.26829268 0.5060241
## 13 12 0.26829268 0.5060241
## 14 13 0.00000000 0.0000000
## 15 14 0.14634146 0.2771084

1.5.3 Penskalaan Robust

# Numeric columns only, with the gaps in Age and Wage forward-filled
df_numeric <- df[sapply(df, is.numeric)] %>%
  fill(Age, Wage)

# Quartile-based scaling: subtract the first quartile and divide by the
# interquartile range (Q3 - Q1).
# NOTE(review): robust scaling is conventionally centred on the median; this
# centres on Q1 — confirm that is intended.
robust_scale <- function(x) {
  kuartil <- quantile(x, probs = c(0.25, 0.75))
  q1 <- kuartil[1]
  q3 <- kuartil[2]
  (x - q1) / (q3 - q1)
}

# Apply the quartile-based scaling to both measured columns
df_numeric[c("Age", "Wage")] <- lapply(df_numeric[c("Age", "Wage")], robust_scale)

df_numeric
##     X         Age       Wage
## 1   0 -0.02173913  0.2222222
## 2   1 -0.02173913 -0.1333333
## 3   2  1.54347826  0.7444444
## 4   3  0.41304348 -0.1111111
## 5   4  1.45652174 -0.1111111
## 6   5  1.45652174  0.8888889
## 7   6 -0.15217391  0.3333333
## 8   7  0.84782609  1.3333333
## 9   8  0.67391304  1.1111111
## 10  9  1.15217391  1.4444444
## 11 10  0.32608696  1.4444444
## 12 11  0.23913043  0.5333333
## 13 12  0.23913043  0.5333333
## 14 13 -0.23913043 -0.4000000
## 15 14  0.02173913  0.1111111

1.6 Penanganan data pencilan

# Numeric columns only, with the gaps in Age and Wage forward-filled
df_numeric <- df[sapply(df, is.numeric)] %>%
  fill(Age, Wage)

# Flag the elements of x that lie more than k standard deviations away from
# the mean.
#
# x  numeric vector.
# k  width of the accepted band in standard deviations. The default of 1
#    reproduces the original hard-coded cut-off; 2 or 3 are common, less
#    aggressive choices.
#
# Returns a logical vector the same length as x (logical(0) for empty input),
# TRUE where the element falls outside [mean - k * sd, mean + k * sd].
# If x contains NA the bounds are NA, and so is every element of the result.
outlier <- function(x, k = 1) {
  sample_mean <- mean(x)
  cut_off <- sd(x) * k
  lower_bound <- sample_mean - cut_off
  upper_bound <- sample_mean + cut_off

  # Vectorised comparison; no per-element loop is needed
  x < lower_bound | x > upper_bound
}

# Rows whose Wage is flagged as an outlier
wage_flags <- outlier(df_numeric$Wage)
df_numeric[wage_flags, ]
##     X Age     Wage
## 2   1  24  3400000
## 4   3  34  3500000
## 5   4  58  3500000
## 8   7  44 10000000
## 10  9  51 10500000
## 11 10  32 10500000
## 14 13  19  2200000
# Rows whose Age is flagged as an outlier
age_flags <- outlier(df_numeric$Age)
df_numeric[age_flags, ]
##     X Age     Wage
## 3   2  60  7350000
## 5   4  58  3500000
## 6   5  58  8000000
## 7   6  21  5500000
## 10  9  51 10500000
## 14 13  19  2200000

1.6.1 Boxplot

# Numeric columns only, with the gaps in Age and Wage forward-filled
df_numeric <- df[sapply(df, is.numeric)] %>%
  fill(Age, Wage)

# One horizontal boxplot per measured variable
boxplot(df_numeric$Age, main = "Age", horizontal = TRUE)

boxplot(df_numeric$Wage, main = "Wage", horizontal = TRUE)

1.7 Encoding data kategorikal

1.7.1 Mengubah label menjadi angka

# Categorical columns only
df_categorical <- df[!sapply(df, is.numeric)]

# Relabel the province levels as 1..k, following the factor's sorted level
# order. The label vector is sized from the data instead of being hard-coded
# to three entries, so a dataset with a different number of provinces does
# not fail with an invalid-labels error.
province_factor <- factor(df_categorical$Province)
df_categorical$Province_Labeled <-
  factor(province_factor, labels = seq_len(nlevels(province_factor)))

df_categorical
##       Province Life.insured Province_Labeled
## 1       Banten          Yes                1
## 2  DKI Jakarta           No                2
## 3   Jawa Barat           No                3
## 4       Banten           No                1
## 5   Jawa Barat          Yes                3
## 6  DKI Jakarta           No                2
## 7       Banten           No                1
## 8       Banten          Yes                1
## 9   Jawa Barat          Yes                3
## 10 DKI Jakarta          Yes                2
## 11      Banten           No                1
## 12      Banten           No                1
## 13  Jawa Barat           No                3
## 14 DKI Jakarta          Yes                2
## 15 DKI Jakarta          Yes                2

1.7.2 Mengubah label custom

# Categorical columns only
df_categorical <- Filter(Negate(is.numeric), df)

# Recode Life.insured with custom labels: "Yes" becomes "T", "No" becomes "F"
df_categorical[["Life.insured_Labeled"]] <- factor(
  df_categorical$Life.insured,
  levels = c("Yes", "No"),
  labels = c("T", "F")
)

df_categorical
##       Province Life.insured Life.insured_Labeled
## 1       Banten          Yes                    T
## 2  DKI Jakarta           No                    F
## 3   Jawa Barat           No                    F
## 4       Banten           No                    F
## 5   Jawa Barat          Yes                    T
## 6  DKI Jakarta           No                    F
## 7       Banten           No                    F
## 8       Banten          Yes                    T
## 9   Jawa Barat          Yes                    T
## 10 DKI Jakarta          Yes                    T
## 11      Banten           No                    F
## 12      Banten           No                    F
## 13  Jawa Barat           No                    F
## 14 DKI Jakarta          Yes                    T
## 15 DKI Jakarta          Yes                    T

1.7.3 Variabel Dummy

library(fastDummies)
# Categorical columns only
df_categorical <- Filter(Negate(is.numeric), df)

# One 0/1 indicator column per province. The bare vector is passed in, which
# is why the printed columns are named `.data` and `.data_<province>`.
province_values <- df_categorical$Province
dummy_cols(province_values)
##          .data .data_Banten .data_DKI Jakarta .data_Jawa Barat
## 1       Banten            1                 0                0
## 2  DKI Jakarta            0                 1                0
## 3   Jawa Barat            0                 0                1
## 4       Banten            1                 0                0
## 5   Jawa Barat            0                 0                1
## 6  DKI Jakarta            0                 1                0
## 7       Banten            1                 0                0
## 8       Banten            1                 0                0
## 9   Jawa Barat            0                 0                1
## 10 DKI Jakarta            0                 1                0
## 11      Banten            1                 0                0
## 12      Banten            1                 0                0
## 13  Jawa Barat            0                 0                1
## 14 DKI Jakarta            0                 1                0
## 15 DKI Jakarta            0                 1                0

1.7.4 Cyclic

# Categorical columns only
df_categorical <- df[!sapply(df, is.numeric)]

# Sine component of a cyclic encoding; the period is the largest code in x.
# R's built-in constant is used (explicitly via base::) instead of a truncated
# 3.14, so a full turn lands on sin = 0 up to floating-point error.
sin_fn <- function(x) {
  sin((2 * base::pi * x) / max(x))
}

# Cosine component of the same cyclic encoding
cos_fn <- function(x) {
  cos((2 * base::pi * x) / max(x))
}

# Integer codes 1..k of the province levels. These are the values the encoding
# has always been computed from: unclass() on a factor returns its integer
# codes and ignores any custom labels.
province_code <- as.integer(factor(df_categorical$Province))

df_categorical$Province_sin <- sin_fn(province_code)
df_categorical$Province_cos <- cos_fn(province_code)

df_categorical
##       Province Life.insured  Province_sin Province_cos
## 1       Banten          Yes  8.660254e-01         -0.5
## 2  DKI Jakarta           No -8.660254e-01         -0.5
## 3   Jawa Barat           No -2.449294e-16          1.0
## 4       Banten           No  8.660254e-01         -0.5
## 5   Jawa Barat          Yes -2.449294e-16          1.0
## 6  DKI Jakarta           No -8.660254e-01         -0.5
## 7       Banten           No  8.660254e-01         -0.5
## 8       Banten          Yes  8.660254e-01         -0.5
## 9   Jawa Barat          Yes -2.449294e-16          1.0
## 10 DKI Jakarta          Yes -8.660254e-01         -0.5
## 11      Banten           No  8.660254e-01         -0.5
## 12      Banten           No  8.660254e-01         -0.5
## 13  Jawa Barat           No -2.449294e-16          1.0
## 14 DKI Jakarta          Yes -8.660254e-01         -0.5
## 15 DKI Jakarta          Yes -8.660254e-01         -0.5
