Email          : rizal.andriana@student.matanauniversity.ac.id
RPubs         : https://rpubs.com/rizalandriana
Github        : https://github.com/rizalandriana
Jurusan      : Teknik Informatika
Address     : ARA Center, Matana University Tower
             Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
# Read the raw dataset from the CSV file into a data frame and print it.
df <- read.csv("dataset.csv")
print(df)
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
## 6 5 DKI Jakarta NA 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 NA No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
# Work on a copy so the original data frame stays untouched.
df1 <- df
# Count the missing values in every column. vapply() is used instead of
# sapply() so the result is always a named integer vector, whatever the
# shape of the input.
vapply(df1, function(column) sum(is.na(column)), integer(1))
## X Province Age Wage Life.insured
## 0 0 2 3 0
# Listwise deletion: keep only the rows that contain no missing value.
# drop = FALSE guarantees a data frame is returned even when `df` has a
# single column (otherwise the result would collapse to a plain vector).
df_by_delete <- df[complete.cases(df), , drop = FALSE]
df_by_delete
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 12 11 Banten 30 6400000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
df1 <- df
# Impute missing quantitative values with the mean of the observed values.
# NOTE: the median imputation further below selects the same rows
# (is.na(df$Age)), so it overwrites the values written here.
df1$Age[is.na(df$Age)] <- mean(df$Age, na.rm = TRUE)
# Return the mode (most frequent value) of a vector.
# When several values share the highest frequency, the one that appears
# first in `x` is returned.
hitung_modus <- function(x) {
  candidates <- unique(x)
  positions <- match(x, candidates)
  counts <- tabulate(positions)
  most_frequent <- which.max(counts)
  candidates[most_frequent]
}
# Impute missing wages with the mode of the observed wages.
df1$Wage[is.na(df$Wage)] <- hitung_modus(na.omit(df$Wage))
# Impute missing ages with the median of the observed ages.
df1$Age[is.na(df$Age)] <- median(df$Age, na.rm = TRUE)
df1
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta 32 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 5000000 Yes
## 6 5 DKI Jakarta 32 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 5000000 No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 5000000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
library(zoo)
df1 <- df
# Impute by linear interpolation between the neighbouring observations.
# na.rm = FALSE keeps leading/trailing NAs (which cannot be interpolated)
# in place, so the result always has exactly one value per row. With the
# default na.rm = TRUE those NAs are dropped and the column assignment
# fails on a length mismatch.
df1$Wage <- na.approx(df$Wage, na.rm = FALSE)
df1$Age <- na.approx(df$Age, na.rm = FALSE)
df1
## X Province Age Wage Life.insured
## 1 0 Banten 24.0 5000000 Yes
## 2 1 DKI Jakarta 42.0 3400000 No
## 3 2 Jawa Barat 60.0 7350000 No
## 4 3 Banten 34.0 3500000 No
## 5 4 Jawa Barat 58.0 5750000 Yes
## 6 5 DKI Jakarta 39.5 8000000 No
## 7 6 Banten 21.0 5500000 No
## 8 7 Banten 44.0 10000000 Yes
## 9 8 Jawa Barat 40.0 9000000 Yes
## 10 9 DKI Jakarta 51.0 10500000 Yes
## 11 10 Banten 32.0 8450000 No
## 12 11 Banten 30.0 6400000 No
## 13 12 Jawa Barat 30.0 4300000 No
## 14 13 DKI Jakarta 19.0 2200000 Yes
## 15 14 DKI Jakarta 25.0 4500000 Yes
library(tidyr)
df1 <- df
# Forward fill: carry the last observed value downwards into each gap.
fill(df1, Age, Wage, .direction = "down")
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta 24 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 3500000 Yes
## 6 5 DKI Jakarta 58 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 10500000 No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 6400000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
# Backward fill: pull the next observed value upwards into each gap.
fill(df1, Age, Wage, .direction = "up")
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta 60 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 8000000 Yes
## 6 5 DKI Jakarta 21 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 6400000 No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 2200000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
df1 <- df
# Keep the first (top-most) row of every province.
df1[!duplicated(df1[["Province"]]), ]
## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
# Keep the last (bottom-most) row of every province.
df1[!duplicated(df1$Province, fromLast = TRUE), ]
## X Province Age Wage Life.insured
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 15 14 DKI Jakarta 25 4500000 Yes
# Keep only the numeric columns of the dataset.
df_numeric <- Filter(is.numeric, df)
df_numeric
## X Age Wage
## 1 0 24 5000000
## 2 1 NA 3400000
## 3 2 60 7350000
## 4 3 34 3500000
## 5 4 58 NA
## 6 5 NA 8000000
## 7 6 21 5500000
## 8 7 44 10000000
## 9 8 40 9000000
## 10 9 51 10500000
## 11 10 32 NA
## 12 11 30 6400000
## 13 12 30 NA
## 14 13 19 2200000
## 15 14 25 4500000
# Keep only the non-numeric (categorical) columns of the dataset.
df_categorical <- Filter(Negate(is.numeric), df)
df_categorical
## Province Life.insured
## 1 Banten Yes
## 2 DKI Jakarta No
## 3 Jawa Barat No
## 4 Banten No
## 5 Jawa Barat Yes
## 6 DKI Jakarta No
## 7 Banten No
## 8 Banten Yes
## 9 Jawa Barat Yes
## 10 DKI Jakarta Yes
## 11 Banten No
## 12 Banten No
## 13 Jawa Barat No
## 14 DKI Jakarta Yes
## 15 DKI Jakarta Yes
# Standardise the numeric columns with scale() after forward-filling the
# missing Age and Wage values.
df_numeric <- Filter(is.numeric, df)
df_numeric <- scale(fill(df_numeric, Age, Wage))
df_numeric
## X Age Wage
## [1,] -1.5652476 -0.8866371 -0.501226686
## [2,] -1.3416408 -0.8866371 -1.080958757
## [3,] -1.1180340 1.6332789 0.350254793
## [4,] -0.8944272 -0.1866604 -1.044725502
## [5,] -0.6708204 1.4932836 -1.044725502
## [6,] -0.4472136 1.4932836 0.585770946
## [7,] -0.2236068 -1.0966301 -0.320060414
## [8,] 0.0000000 0.5133162 1.310436035
## [9,] 0.2236068 0.2333256 0.948103491
## [10,] 0.4472136 1.0032999 1.491602307
## [11,] 0.6708204 -0.3266558 1.491602307
## [12,] 0.8944272 -0.4666511 0.006038876
## [13,] 1.1180340 -0.4666511 0.006038876
## [14,] 1.3416408 -1.2366254 -1.515757810
## [15,] 1.5652476 -0.8166394 -0.682392958
## attr(,"scaled:center")
## X Age Wage
## 7.000000e+00 3.666667e+01 6.383333e+06
## attr(,"scaled:scale")
## X Age Wage
## 4.472136e+00 1.428619e+01 2.759896e+06
# Select the numeric columns.
df_numeric <- Filter(is.numeric, df)
# Handle missing data: carry the previous observation forward.
df_numeric <- fill(df_numeric, Age, Wage)
# Min-max normalisation: rescale `x` linearly so that its minimum maps to 0
# and its maximum maps to 1.
#
# x      Numeric vector.
# na.rm  If TRUE (the default), missing values are ignored when finding the
#        minimum and maximum and remain NA in the result. The argument was
#        previously accepted but never used, so a single NA turned the whole
#        result into NA.
# Returns a numeric vector of the same length as `x`.
normalize <- function(x, na.rm = TRUE) {
  bounds <- range(x, na.rm = na.rm)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}
df_numeric$Age <- normalize(df_numeric$Age)
# Normalise Wage from the Wage column itself. It was previously computed
# from the already-normalised Age column, which made Wage a copy of Age.
df_numeric$Wage <- normalize(df_numeric$Wage)
df_numeric
## X Age Wage
## 1 0 0.12195122 0.3373494
## 2 1 0.12195122 0.1445783
## 3 2 1.00000000 0.6204819
## 4 3 0.36585366 0.1566265
## 5 4 0.95121951 0.1566265
## 6 5 0.95121951 0.6987952
## 7 6 0.04878049 0.3975904
## 8 7 0.60975610 0.9397590
## 9 8 0.51219512 0.8192771
## 10 9 0.78048780 1.0000000
## 11 10 0.31707317 1.0000000
## 12 11 0.26829268 0.5060241
## 13 12 0.26829268 0.5060241
## 14 13 0.00000000 0.0000000
## 15 14 0.14634146 0.2771084
# Select the numeric columns.
df_numeric <- Filter(is.numeric, df)
# Handle missing data: carry the previous observation forward.
df_numeric <- fill(df_numeric, Age, Wage)
# Robust scaling: subtract the first quartile from `x` and divide by the
# interquartile range (third quartile minus first quartile).
robust_scale <- function(x) {
  quartiles <- quantile(x)
  q1 <- quartiles[2]
  q3 <- quartiles[4]
  (x - q1) / (q3 - q1)
}
# Apply the robust scaling to both quantitative columns.
df_numeric[["Age"]] <- robust_scale(df_numeric[["Age"]])
df_numeric[["Wage"]] <- robust_scale(df_numeric[["Wage"]])
df_numeric
## X Age Wage
## 1 0 -0.02173913 0.2222222
## 2 1 -0.02173913 -0.1333333
## 3 2 1.54347826 0.7444444
## 4 3 0.41304348 -0.1111111
## 5 4 1.45652174 -0.1111111
## 6 5 1.45652174 0.8888889
## 7 6 -0.15217391 0.3333333
## 8 7 0.84782609 1.3333333
## 9 8 0.67391304 1.1111111
## 10 9 1.15217391 1.4444444
## 11 10 0.32608696 1.4444444
## 12 11 0.23913043 0.5333333
## 13 12 0.23913043 0.5333333
## 14 13 -0.23913043 -0.4000000
## 15 14 0.02173913 0.1111111
# Select the numeric columns.
df_numeric <- Filter(is.numeric, df)
# Handle missing data: carry the previous observation forward.
df_numeric <- fill(df_numeric, Age, Wage)
# Flag the values of `x` that lie more than `k` standard deviations away
# from the mean of `x`.
#
# x  Numeric vector.
# k  Half-width of the accepted band, in standard deviations. Defaults to
#    1, the cut-off that was previously hard-coded.
# Returns a logical vector of the same length as `x`; TRUE marks an outlier.
outlier <- function(x, k = 1) {
  sample_mean <- mean(x)
  cut_off <- sd(x) * k
  lower_bound <- sample_mean - cut_off
  upper_bound <- sample_mean + cut_off
  # Vectorised comparison: no per-element sapply() is needed, and an empty
  # input yields logical(0) instead of an empty list.
  x < lower_bound | x > upper_bound
}
# List the rows whose Wage is flagged by outlier().
df_numeric[outlier(df_numeric$Wage),]
## X Age Wage
## 2 1 24 3400000
## 4 3 34 3500000
## 5 4 58 3500000
## 8 7 44 10000000
## 10 9 51 10500000
## 11 10 32 10500000
## 14 13 19 2200000
# List the rows whose Age is flagged by outlier().
df_numeric[outlier(df_numeric$Age),]
## X Age Wage
## 3 2 60 7350000
## 5 4 58 3500000
## 6 5 58 8000000
## 7 6 21 5500000
## 10 9 51 10500000
## 14 13 19 2200000
# Select the numeric columns.
df_numeric <- df[sapply(df, is.numeric)]
# Handle missing data: carry the previous observation forward.
df_numeric <- df_numeric %>% fill(Age, Wage)
# One statement per line: the two boxplot() calls used to be fused on a
# single line, which R cannot parse.
boxplot(df_numeric$Age, horizontal = TRUE, main = "Age")
boxplot(df_numeric$Wage, horizontal = TRUE, main = "Wage")
# Extract the categorical columns.
df_categorical <- df[!sapply(df, is.numeric)]
# Label encoding: replace every province by the position of its factor
# level. The labels are derived from the levels actually present instead of
# the hard-coded c(1, 2, 3), so the code keeps working when the number of
# distinct provinces changes.
df_categorical$Province_Labeled <- factor(df_categorical$Province)
levels(df_categorical$Province_Labeled) <-
  seq_along(levels(df_categorical$Province_Labeled))
df_categorical
## Province Life.insured Province_Labeled
## 1 Banten Yes 1
## 2 DKI Jakarta No 2
## 3 Jawa Barat No 3
## 4 Banten No 1
## 5 Jawa Barat Yes 3
## 6 DKI Jakarta No 2
## 7 Banten No 1
## 8 Banten Yes 1
## 9 Jawa Barat Yes 3
## 10 DKI Jakarta Yes 2
## 11 Banten No 1
## 12 Banten No 1
## 13 Jawa Barat No 3
## 14 DKI Jakarta Yes 2
## 15 DKI Jakarta Yes 2
# Extract the categorical columns.
df_categorical <- Filter(Negate(is.numeric), df)
# Binary encoding: relabel "Yes" as "T" and "No" as "F".
df_categorical[["Life.insured_Labeled"]] <- factor(
  df_categorical[["Life.insured"]],
  levels = c("Yes", "No"),
  labels = c("T", "F")
)
df_categorical
## Province Life.insured Life.insured_Labeled
## 1 Banten Yes T
## 2 DKI Jakarta No F
## 3 Jawa Barat No F
## 4 Banten No F
## 5 Jawa Barat Yes T
## 6 DKI Jakarta No F
## 7 Banten No F
## 8 Banten Yes T
## 9 Jawa Barat Yes T
## 10 DKI Jakarta Yes T
## 11 Banten No F
## 12 Banten No F
## 13 Jawa Barat No F
## 14 DKI Jakarta Yes T
## 15 DKI Jakarta Yes T
library(fastDummies)
# Extract the categorical columns.
df_categorical <- Filter(Negate(is.numeric), df)
# One-hot encoding: one 0/1 indicator column per distinct province.
dummy_cols(df_categorical[["Province"]])
## .data .data_Banten .data_DKI Jakarta .data_Jawa Barat
## 1 Banten 1 0 0
## 2 DKI Jakarta 0 1 0
## 3 Jawa Barat 0 0 1
## 4 Banten 1 0 0
## 5 Jawa Barat 0 0 1
## 6 DKI Jakarta 0 1 0
## 7 Banten 1 0 0
## 8 Banten 1 0 0
## 9 Jawa Barat 0 0 1
## 10 DKI Jakarta 0 1 0
## 11 Banten 1 0 0
## 12 Banten 1 0 0
## 13 Jawa Barat 0 0 1
## 14 DKI Jakarta 0 1 0
## 15 DKI Jakarta 0 1 0
# Extract the categorical columns.
df_categorical <- df[!sapply(df, is.numeric)]
# Cyclical encoding. R's built-in `pi` constant is used here; it used to be
# shadowed by `pi <- 3.14`, whose rounding error leaked into every encoded
# value (the sine of a full turn came out as -0.0032 instead of ~0).

# Sine component: position of `x` on a circle whose full turn is max(x).
sin_fn <- function(x) {
  sin((2 * pi * x) / max(x))
}
# Cosine component: position of `x` on a circle whose full turn is max(x).
cos_fn <- function(x) {
  cos((2 * pi * x) / max(x))
}
# unclass() yields the integer level codes (1, 2, 3); the `labels` argument
# only renames the levels and does not change those codes.
df_categorical$Province_sin <-
  sin_fn(unclass(factor(df_categorical$Province, labels = c(0, 1, 2))))
df_categorical$Province_cos <-
  cos_fn(unclass(factor(df_categorical$Province, labels = c(0, 1, 2))))
df_categorical
## Province Life.insured Province_sin Province_cos
## 1 Banten Yes 8.660254e-01 -0.5
## 2 DKI Jakarta No -8.660254e-01 -0.5
## 3 Jawa Barat No -2.449294e-16 1.0
## 4 Banten No 8.660254e-01 -0.5
## 5 Jawa Barat Yes -2.449294e-16 1.0
## 6 DKI Jakarta No -8.660254e-01 -0.5
## 7 Banten No 8.660254e-01 -0.5
## 8 Banten Yes 8.660254e-01 -0.5
## 9 Jawa Barat Yes -2.449294e-16 1.0
## 10 DKI Jakarta Yes -8.660254e-01 -0.5
## 11 Banten No 8.660254e-01 -0.5
## 12 Banten No 8.660254e-01 -0.5
## 13 Jawa Barat No -2.449294e-16 1.0
## 14 DKI Jakarta Yes -8.660254e-01 -0.5
## 15 DKI Jakarta Yes -8.660254e-01 -0.5