Email : sabrina.amelia@student.matanauniversity.ac.id
RPubs : https://rpubs.com/sabrinayose
Github : https://github.com/sabrinayose
Jurusan : Teknik Informatika
Address : ARA Center, Matana University Tower
Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
library(zoo)##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the training data from disk.
X_train <- read.csv(file = "dataset.csv")
head(X_train, n = 5) # show the first 5 rows ## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
tail(X_train, n = 5) # show the last 5 rows ## X Province Age Wage Life.insured
## 11 10 Banten 32 NA No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
Menghitung jumlah data yang hilang (NA) pada setiap kolom data frame.
colSums(x = is.na(X_train)) # number of missing values per column ## X Province Age Wage Life.insured
## 0 0 2 3 0
Harus hati-hati karena hasil analisis dapat menjadi tidak relevan apabila salah menghapus data.
# Drop every row that contains at least one missing value.
df_rm <- na.omit(object = X_train)
print(x = colSums(is.na(df_rm))) # confirm that no missing values remain ## X Province Age Wage Life.insured
## 0 0 0 0 0
Nilai numerik yang hilang dapat diisi dengan mean (rata-rata) atau median.
Nilai kategoris yang hilang dapat diisi dengan modus (nilai yang paling sering muncul).
# Fill missing numeric values with the mean of the observed values.
X_train$Age[is.na(X_train$Age)] <- mean(X_train$Age, na.rm = TRUE)
X_train$Age## [1] 24 36 60 34 58 36 21 44 40 51 32 30 30 19 25
X_train$Wage[is.na(X_train$Wage)] <- mean(X_train$Wage, na.rm = TRUE)
X_train$Wage## [1] 5000000 3400000 7350000 3500000 6279167 8000000 5500000 10000000
## [9] 9000000 10500000 6279167 6400000 6279167 2200000 4500000
# Fill missing categorical values with the most frequent value.
# base::mode() returns the storage mode of an object (here "character"), not
# the statistical mode, so the most frequent value is taken from a frequency
# table instead (table() ignores NA by default).
insured_counts <- table(X_train$Life.insured)
insured_mode <- names(insured_counts)[which.max(insured_counts)]
X_train$Life.insured[is.na(X_train$Life.insured)] <- insured_mode
X_train$Life.insured## [1] "Yes" "No" "No" "No" "Yes" "No" "No" "Yes" "Yes" "Yes" "No" "No"
## [13] "No" "Yes" "Yes"
X_train <- read.csv("dataset.csv") # reload the raw training data
# Replace missing wages by linear interpolation between neighbouring rows.
# na.rm = FALSE keeps leading/trailing NAs in place instead of dropping them,
# so the result always has the same length as the column it is assigned to.
X_train$Wage <- na.approx(X_train$Wage, na.rm = FALSE)
X_train$Wage## [1] 5000000 3400000 7350000 3500000 5750000 8000000 5500000 10000000
## [9] 9000000 10500000 8450000 6400000 4300000 2200000 4500000
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(tidyr) # fill()
library(dplyr) # %>%, count(), filter(), select()
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
X_train <- read.csv("dataset.csv") # reload the raw training data
# Replace each missing Age with the last observed value above it.
X_train <- fill(X_train, Age, .direction = "down")
X_train## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta 24 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
## 6 5 DKI Jakarta 58 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 NA No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
# library() errors immediately if a package is missing; require() would only
# return FALSE.
library(tidyr)
library(dplyr)
X_train <- read.csv("dataset.csv") # reload the raw training data
# Replace each missing Wage with the next observed value below it.
X_train <- fill(X_train, Wage, .direction = "up")
X_train## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 8000000 Yes
## 6 5 DKI Jakarta NA 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 6400000 No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 2200000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
X_train <- read.csv("dataset.csv") # reload the raw training data
# List the Age values that occur more than once (NA counts as a value).
# The column is referenced by its bare name: inside dplyr verbs the data frame
# is already in scope, and `X_train$Age` would bypass it and produce an
# awkward `X_train$Age` column name in the result.
X_train %>%
  count(Age) %>%
  filter(n > 1) %>%
  select(-n) ## Age
## 1 30
## 2 NA
X_train[vapply(X_train, is.numeric, logical(1))] # keep only the numeric columns ## X Age Wage
## 1 0 24 5000000
## 2 1 NA 3400000
## 3 2 60 7350000
## 4 3 34 3500000
## 5 4 58 NA
## 6 5 NA 8000000
## 7 6 21 5500000
## 8 7 44 10000000
## 9 8 40 9000000
## 10 9 51 10500000
## 11 10 32 NA
## 12 11 30 6400000
## 13 12 30 NA
## 14 13 19 2200000
## 15 14 25 4500000
X_train[vapply(X_train, is.character, logical(1))] # keep only the character columns ## Province Life.insured
## 1 Banten Yes
## 2 DKI Jakarta No
## 3 Jawa Barat No
## 4 Banten No
## 5 Jawa Barat Yes
## 6 DKI Jakarta No
## 7 Banten No
## 8 Banten Yes
## 9 Jawa Barat Yes
## 10 DKI Jakarta Yes
## 11 Banten No
## 12 Banten No
## 13 Jawa Barat No
## 14 DKI Jakarta Yes
## 15 DKI Jakarta Yes
X_train <- read.csv("dataset.csv") # reload the raw training data
df_rm <- na.omit(X_train) # keep complete rows only
# Standardise Age and Wage to mean 0 and standard deviation 1.
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() strips those so the new columns are plain numeric
# vectors rather than matrix columns inside the data frame.
df_rm$Age_scaled <- as.numeric(scale(df_rm$Age))
df_rm$Wage_scaled <- as.numeric(scale(df_rm$Wage))
df_rm## X Province Age Wage Life.insured Age_scaled Wage_scaled
## 1 0 Banten 24 5000000 Yes -0.78916488 -0.50069606
## 3 2 Jawa Barat 60 7350000 No 1.84138472 0.34277042
## 4 3 Banten 34 3500000 No -0.05845666 -1.03907893
## 7 6 Banten 21 5500000 No -1.00837734 -0.32123511
## 8 7 Banten 44 10000000 Yes 0.67225156 1.29391349
## 9 8 Jawa Barat 40 9000000 Yes 0.37996827 0.93499158
## 10 9 DKI Jakarta 51 10500000 Yes 1.18374732 1.47337444
## 12 11 Banten 30 6400000 No -0.35073995 0.00179461
## 14 13 DKI Jakarta 19 2200000 Yes -1.15451899 -1.50567741
## 15 14 DKI Jakarta 25 4500000 Yes -0.71609406 -0.68015702
X_train <- read.csv(file = "dataset.csv") # reload the raw training data
# Min-max normalisation: rescale a numeric vector to the interval [0, 1].
#
# x:     numeric vector.
# na.rm: if TRUE, NA values are ignored when computing the range and stay NA
#        in the result; if FALSE (default) any NA makes the whole result NA.
#
# Returns a numeric vector of the same length as x. When all values are equal
# the range is zero and the result is NaN.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  # Numerator and denominator must each be parenthesised: `/` binds tighter
  # than `-`, so `x - min(x) / max(x) - min(x)` only shifts x by a constant.
  (x - rng[1]) / (rng[2] - rng[1])
}
# Work on complete rows only, then add the normalised Age and Wage columns.
df_rm <- na.omit(object = X_train)
df_rm[["Age_norm"]] <- normalize(df_rm[["Age"]])
df_rm[["Wage_norm"]] <- normalize(df_rm[["Wage"]])
df_rm ## X Province Age Wage Life.insured Age_norm Wage_norm
## 1 0 Banten 24 5000000 Yes 4.6833333 2.800000e+06
## 3 2 Jawa Barat 60 7350000 No 40.6833333 5.150000e+06
## 4 3 Banten 34 3500000 No 14.6833333 1.300000e+06
## 7 6 Banten 21 5500000 No 1.6833333 3.300000e+06
## 8 7 Banten 44 10000000 Yes 24.6833333 7.800000e+06
## 9 8 Jawa Barat 40 9000000 Yes 20.6833333 6.800000e+06
## 10 9 DKI Jakarta 51 10500000 Yes 31.6833333 8.300000e+06
## 12 11 Banten 30 6400000 No 10.6833333 4.200000e+06
## 14 13 DKI Jakarta 19 2200000 Yes -0.3166667 -2.095238e-01
## 15 14 DKI Jakarta 25 4500000 Yes 5.6833333 2.300000e+06
X_train <- read.csv(file = "dataset.csv") # reload the raw training data
# Robust scaling: shift by the first quartile and divide by the interquartile
# range, so the result is less sensitive to extreme values than min-max or
# z-score scaling.
#
# x: numeric vector without NA (quantile() errors on NA input).
#
# Returns a numeric vector of the same length as x. When Q1 equals Q3 the
# interquartile range is zero and the result is not finite.
# NOTE(review): many robust scalers centre on the median rather than Q1; the
# original centring on Q1 is kept here - confirm which one is wanted.
robust <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), names = FALSE)
  # The subtraction must be parenthesised before dividing: without it only
  # Q1 is divided by the IQR and x is merely shifted by a constant.
  (x - quartiles[1]) / (quartiles[2] - quartiles[1])
}
# Work on complete rows only, then add the robust-scaled Age and Wage columns.
df_rm <- na.omit(object = X_train)
df_rm[["Age_norm"]] <- robust(df_rm[["Age"]])
df_rm[["Wage_norm"]] <- robust(df_rm[["Wage"]])
df_rm ## X Province Age Wage Life.insured Age_norm Wage_norm
## 1 0 Banten 24 5000000 Yes 22.70667 4999999
## 3 2 Jawa Barat 60 7350000 No 58.70667 7349999
## 4 3 Banten 34 3500000 No 32.70667 3499999
## 7 6 Banten 21 5500000 No 19.70667 5499999
## 8 7 Banten 44 10000000 Yes 42.70667 9999999
## 9 8 Jawa Barat 40 9000000 Yes 38.70667 8999999
## 10 9 DKI Jakarta 51 10500000 Yes 49.70667 10499999
## 12 11 Banten 30 6400000 No 28.70667 6399999
## 14 13 DKI Jakarta 19 2200000 Yes 17.70667 2199999
## 15 14 DKI Jakarta 25 4500000 Yes 23.70667 4499999
Distribusi Gaussian
X_train <- read.csv(file = "dataset.csv") # reload the raw training data
# Flag values that lie more than `n_sd` standard deviations from the mean.
#
# x:    numeric vector.
# n_sd: width of the accepted band in standard deviations (default 1, the
#       value that was previously hard-coded).
#
# Returns a logical vector of the same length as x: TRUE marks an outlier.
# NA in x makes the mean and sd NA, so every element of the result is NA.
outliers <- function(x, n_sd = 1) {
  sample_mean <- mean(x)
  cut_off <- sd(x) * n_sd
  lower <- sample_mean - cut_off
  upper <- sample_mean + cut_off
  # Vectorised `|` instead of sapply() with scalar `||`: same result for
  # non-empty input, and an empty input yields logical(0) rather than list().
  x < lower | x > upper
}
# Work on complete rows only and show the rows whose Age is flagged.
df_rm <- na.omit(object = X_train)
df_rm[outliers(df_rm[["Age"]]), ] ## X Province Age Wage Life.insured
## 3 2 Jawa Barat 60 7350000 No
## 7 6 Banten 21 5500000 No
## 10 9 DKI Jakarta 51 10500000 Yes
## 14 13 DKI Jakarta 19 2200000 Yes
Boxplot atau Rentang Interkuartil (IQR)
X_train <- read.csv("dataset.csv") # reload the raw training data
# Draw a boxplot of Wage to inspect its spread visually.
boxplot(X_train$Wage)
# The next statement was fused onto the boxplot() line, which does not parse;
# each statement now sits on its own line.
X_train <- read.csv("dataset.csv")
dim(X_train)## [1] 15 5
X_train <- read.csv(file = "dataset.csv") # reload the raw training data
head(X_train, n = 5) # show the first 5 rows ## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
X_train <- read.csv(file = "dataset.csv") # reload the raw training data
# Keep the character columns only, then count missing values in each of them.
X_train_char <- X_train[vapply(X_train, is.character, logical(1))]
colSums(x = is.na(X_train_char)) ## Province Life.insured
## 0 0
# Label-encode the columns of the training data with superml's LabelEncoder.
X_train <- read.csv("dataset.csv") # import training data X
library(superml)## Loading required package: R6
# A single encoder object is created and then reused for every column below.
X_train_label <- LabelEncoder$new()
# Each fit_transform() call overwrites the column with the codes it returns;
# the printed result shows integer codes starting at 0.
# NOTE(review): Age and Wage are numeric, so encoding them replaces the real
# ages and wages with codes, and (per the message printed by the package)
# their NA values are imputed as 'NA' and receive a code of their own.
# Confirm this is intended rather than encoding only Province and
# Life.insured.
# NOTE(review): presumably fit_transform() refits the shared encoder on each
# call - verify against the superml documentation.
X_train$Age <- X_train_label$fit_transform(X_train$Age)## The data contains NA values. Imputing NA with 'NA'
X_train$Province <- X_train_label$fit_transform(X_train$Province)
X_train$Wage <- X_train_label$fit_transform(X_train$Wage)## The data contains NA values. Imputing NA with 'NA'
X_train$Life.insured <- X_train_label$fit_transform(X_train$Life.insured)
X_train## X Province Age Wage Life.insured
## 1 0 0 0 0 0
## 2 1 1 1 1 1
## 3 2 2 2 2 1
## 4 3 0 3 3 1
## 5 4 2 4 4 0
## 6 5 1 1 5 1
## 7 6 0 5 6 1
## 8 7 0 6 7 0
## 9 8 2 7 8 0
## 10 9 1 8 9 0
## 11 10 0 9 4 1
## 12 11 0 10 10 1
## 13 12 2 10 4 1
## 14 13 1 11 11 0
## 15 14 1 12 12 0
X_train <- read.csv("dataset.csv") # reload the raw training data
# Binary-encode Life.insured as integers: "Yes" -> 1, "No" -> 0.
# Assigning a number into a character column (col[col == "Yes"] <- 1) coerces
# it to the string "1", so the column would stay character. A named lookup
# vector yields real integers; any value other than "Yes"/"No" (including NA)
# becomes NA.
insured_codes <- c(Yes = 1L, No = 0L)
X_train$Life.insured <- unname(insured_codes[X_train$Life.insured])
X_train## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 1
## 2 1 DKI Jakarta NA 3400000 0
## 3 2 Jawa Barat 60 7350000 0
## 4 3 Banten 34 3500000 0
## 5 4 Jawa Barat 58 NA 1
## 6 5 DKI Jakarta NA 8000000 0
## 7 6 Banten 21 5500000 0
## 8 7 Banten 44 10000000 1
## 9 8 Jawa Barat 40 9000000 1
## 10 9 DKI Jakarta 51 10500000 1
## 11 10 Banten 32 NA 0
## 12 11 Banten 30 6400000 0
## 13 12 Jawa Barat 30 NA 0
## 14 13 DKI Jakarta 19 2200000 1
## 15 14 DKI Jakarta 25 4500000 1
library(fastDummies)
# Reload the raw training data and append one 0/1 indicator column per level
# of each character column (see the printed result).
X_train <- read.csv("dataset.csv") %>%
  dummy_cols()
X_train ## X Province Age Wage Life.insured Province_Banten
## 1 0 Banten 24 5000000 Yes 1
## 2 1 DKI Jakarta NA 3400000 No 0
## 3 2 Jawa Barat 60 7350000 No 0
## 4 3 Banten 34 3500000 No 1
## 5 4 Jawa Barat 58 NA Yes 0
## 6 5 DKI Jakarta NA 8000000 No 0
## 7 6 Banten 21 5500000 No 1
## 8 7 Banten 44 10000000 Yes 1
## 9 8 Jawa Barat 40 9000000 Yes 0
## 10 9 DKI Jakarta 51 10500000 Yes 0
## 11 10 Banten 32 NA No 1
## 12 11 Banten 30 6400000 No 1
## 13 12 Jawa Barat 30 NA No 0
## 14 13 DKI Jakarta 19 2200000 Yes 0
## 15 14 DKI Jakarta 25 4500000 Yes 0
## Province_DKI Jakarta Province_Jawa Barat Life.insured_No Life.insured_Yes
## 1 0 0 0 1
## 2 1 0 1 0
## 3 0 1 1 0
## 4 0 0 1 0
## 5 0 1 0 1
## 6 1 0 1 0
## 7 0 0 1 0
## 8 0 0 0 1
## 9 0 1 0 1
## 10 1 0 0 1
## 11 0 0 1 0
## 12 0 0 1 0
## 13 0 1 1 0
## 14 1 0 0 1
## 15 1 0 0 1
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ISLR)
library(lattice)
X_train <- read.csv("dataset.csv") # reload the raw training data
# Recode Life.insured to "1"/"0" (the column stays character).
X_train$Life.insured[X_train$Life.insured == "Yes"] <- 1
X_train$Life.insured[X_train$Life.insured == "No"] <- 0
# Keep complete rows only. set.seed() was fused onto this line, which does
# not parse; it now stands on its own line.
X_train1 <- na.omit(X_train)
set.seed(100) # make the cross-validation folds reproducible
# 10-fold cross-validation settings, keeping the hold-out predictions.
dataset <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# Fit a linear regression of Age on all other columns.
# NOTE(review): despite the `nb_` prefix this is method = "lm".
# NOTE(review): only 10 complete rows remain, so each of the 10 folds holds
# out a single row; that is presumably why Rsquared is reported as NaN and
# the warning below appears.
nb_fit <- train(Age ~ ., data = X_train1, method = "lm", trControl = dataset, tuneLength = 14)## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
nb_fit## Linear Regression
##
## 10 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9, 9, 9, 9, 9, 9, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 11.29568 NaN 11.29568
##
## Tuning parameter 'intercept' was held constant at a value of TRUE