Email          : veronica.ardilla@student.matanauniversity.ac.id
RPubs         : https://rpubs.com/veronicayose/
Github        : https://github.com/veronicayose/
Jurusan      : Teknik Informatika
Address     : ARA Center, Matana University Tower
             Jl. CBD Barat Kav, RT.1, Curug Sangereng, Kelapa Dua, Tangerang, Banten 15810.
x_train<-read.csv("dataset.csv") #Mengimpor data dari dataset.csv
x_train## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
## 6 5 DKI Jakarta NA 8000000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 NA No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
a. Enam Baris Pertama
head(x_train)## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
## 6 5 DKI Jakarta NA 8000000 No
b. Enam Baris Terakhir
tail(x_train)## X Province Age Wage Life.insured
## 10 9 DKI Jakarta 51 10500000 Yes
## 11 10 Banten 32 NA No
## 12 11 Banten 30 6400000 No
## 13 12 Jawa Barat 30 NA No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
Untuk mengecek banyaknya data yang hilang
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 2 3 0
colSums(is.na(na.omit(x_train)))## X Province Age Wage Life.insured
## 0 0 0 0 0
a. Mengisi numerik yang hilang dengan Mean
# Replace every missing Age with the mean of the observed ages.
x_train$Age <- replace(
  x_train$Age,
  is.na(x_train$Age),
  mean(x_train$Age, na.rm = TRUE)
)
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 0 3 0
b. Mengisi numerik yang hilang dengan Modus
# Fill missing Wage values with the statistical mode (most frequent observed
# value; the first one encountered wins ties).
# Base R's mode() is NOT the statistical mode: it returns the storage mode of
# the object (the string "numeric"), which would be written into the NA slots
# and silently turn the whole column into character.
wage_observed <- x_train$Wage[!is.na(x_train$Wage)]
wage_levels <- unique(wage_observed)
wage_mode <- wage_levels[which.max(tabulate(match(wage_observed, wage_levels)))]
x_train$Wage[is.na(x_train$Wage)] <- wage_mode
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 0 0 0
library(zoo)##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
#Mengimpor data dari dataset.csv
x_train<-read.csv("dataset.csv")
#Interpolasi Linear
x_train$Wage<-na.approx(x_train$Wage)
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 2 0 0
#Mengimpor data dari dataset.csv
x_train<-read.csv("dataset.csv")
#forward filling
require(tidyr)## Loading required package: tidyr
require(dplyr)## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
x_train <- x_train %>% fill(Age)
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 0 3 0
# Backward filling: each missing Wage takes the next observed value below it.
# library() is used instead of require(): require() only warns and returns
# FALSE when a package is missing, so the failure would surface later as a
# confusing "could not find function" error.
library(tidyr)
library(dplyr)
x_train <- x_train %>% fill(Wage, .direction = "up")
colSums(is.na(x_train))## X Province Age Wage Life.insured
## 0 0 0 0 0
#Mengimpor data dari dataset.csv
x_train<-read.csv("dataset.csv")
#Memeriksa nilai duplikat pada dataframe x_train
x_train %>% count(x_train$Age) %>% filter(n>1) %>% select(-n)## x_train$Age
## 1 30
## 2 NA
#Untuk memeriksa banyaknya jumlah pada data yang terduplikat
#Catatan: n adalah jumlah yang terduplikat
x_train %>% count(x_train$Age) %>% filter(n>1)## x_train$Age n
## 1 30 2
## 2 NA 2
a. Memilah data numerik
Filter(is.numeric, x_train)## X Age Wage
## 1 0 24 5000000
## 2 1 NA 3400000
## 3 2 60 7350000
## 4 3 34 3500000
## 5 4 58 NA
## 6 5 NA 8000000
## 7 6 21 5500000
## 8 7 44 10000000
## 9 8 40 9000000
## 10 9 51 10500000
## 11 10 32 NA
## 12 11 30 6400000
## 13 12 30 NA
## 14 13 19 2200000
## 15 14 25 4500000
b. Memilah data kategorikal
Filter(is.character, x_train)## Province Life.insured
## 1 Banten Yes
## 2 DKI Jakarta No
## 3 Jawa Barat No
## 4 Banten No
## 5 Jawa Barat Yes
## 6 DKI Jakarta No
## 7 Banten No
## 8 Banten Yes
## 9 Jawa Barat Yes
## 10 DKI Jakarta Yes
## 11 Banten No
## 12 Banten No
## 13 Jawa Barat No
## 14 DKI Jakarta Yes
## 15 DKI Jakarta Yes
x_train <- read.csv("dataset.csv")
x_del <- na.omit(x_train) # menghilangkan na
x_del## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 12 11 Banten 30 6400000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
x_del$Age_stan <- scale(x_del$Age)
x_del$Wage_stan <- scale(x_del$Wage)
x_del## X Province Age Wage Life.insured Age_stan Wage_stan
## 1 0 Banten 24 5000000 Yes -0.78916488 -0.50069606
## 3 2 Jawa Barat 60 7350000 No 1.84138472 0.34277042
## 4 3 Banten 34 3500000 No -0.05845666 -1.03907893
## 7 6 Banten 21 5500000 No -1.00837734 -0.32123511
## 8 7 Banten 44 10000000 Yes 0.67225156 1.29391349
## 9 8 Jawa Barat 40 9000000 Yes 0.37996827 0.93499158
## 10 9 DKI Jakarta 51 10500000 Yes 1.18374732 1.47337444
## 12 11 Banten 30 6400000 No -0.35073995 0.00179461
## 14 13 DKI Jakarta 19 2200000 Yes -1.15451899 -1.50567741
## 15 14 DKI Jakarta 25 4500000 Yes -0.71609406 -0.68015702
x_train <- read.csv("dataset.csv")
x_del <- na.omit(x_train) # menghilangkan na
# Min-max normalisation: linearly rescale `x` so that its smallest value maps
# to 0 and its largest value maps to 1.
#
# x: numeric vector. Missing values are not removed, so any NA makes min() and
#    max() return NA and the whole result NA.
# Returns a numeric vector of the same length as `x`. If all values are equal
# the range is zero and the result is NaN.
#
# NOTE: the tables printed further down in this document were captured with
# the earlier, wrongly parenthesised formula and show unscaled values.
normalize <- function(x) {
  lowest <- min(x)
  # The numerator must be parenthesised: `x - min(x) / (max(x) - min(x))`
  # merely subtracts a constant from x and leaves the data unscaled.
  (x - lowest) / (max(x) - lowest)
}
x_del## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 12 11 Banten 30 6400000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
x_del$Age_norm <- normalize(x_del$Age)
x_del$Wage_norm <- normalize(x_del$Wage)
x_del## X Province Age Wage Life.insured Age_norm Wage_norm
## 1 0 Banten 24 5000000 Yes 23.53659 5000000
## 3 2 Jawa Barat 60 7350000 No 59.53659 7350000
## 4 3 Banten 34 3500000 No 33.53659 3500000
## 7 6 Banten 21 5500000 No 20.53659 5500000
## 8 7 Banten 44 10000000 Yes 43.53659 10000000
## 9 8 Jawa Barat 40 9000000 Yes 39.53659 9000000
## 10 9 DKI Jakarta 51 10500000 Yes 50.53659 10500000
## 12 11 Banten 30 6400000 No 29.53659 6400000
## 14 13 DKI Jakarta 19 2200000 Yes 18.53659 2200000
## 15 14 DKI Jakarta 25 4500000 Yes 24.53659 4500000
x_train <- read.csv("dataset.csv")
x_del <- na.omit(x_train) # menghilangkan na
# Robust scaling: centre `x` on its first quartile and divide by the
# interquartile range (third quartile minus first quartile).
#
# x: numeric vector without missing values (quantile() errors on NA).
# Returns a numeric vector of the same length as `x`.
robust <- function(x) {
  # quantile() with default probs returns 0%, 25%, 50%, 75%, 100%;
  # positions 2 and 4 are therefore the first and third quartiles.
  quartiles <- quantile(x)
  first_quartile <- quartiles[2]
  third_quartile <- quartiles[4]
  (x - first_quartile) / (third_quartile - first_quartile)
}
x_del## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 7 6 Banten 21 5500000 No
## 8 7 Banten 44 10000000 Yes
## 9 8 Jawa Barat 40 9000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 12 11 Banten 30 6400000 No
## 14 13 DKI Jakarta 19 2200000 Yes
## 15 14 DKI Jakarta 25 4500000 Yes
# Apply the quartile-based robust() scaler defined above. Calling normalize()
# here, as before, only repeated the min-max step under a misleading column
# name; the table printed below was captured with that earlier call.
x_del$Age_robus <- robust(x_del$Age)
x_del$Wage_robus <- robust(x_del$Wage)
x_del## X Province Age Wage Life.insured Age_robus Wage_robus
## 1 0 Banten 24 5000000 Yes 23.53659 5000000
## 3 2 Jawa Barat 60 7350000 No 59.53659 7350000
## 4 3 Banten 34 3500000 No 33.53659 3500000
## 7 6 Banten 21 5500000 No 20.53659 5500000
## 8 7 Banten 44 10000000 Yes 43.53659 10000000
## 9 8 Jawa Barat 40 9000000 Yes 39.53659 9000000
## 10 9 DKI Jakarta 51 10500000 Yes 50.53659 10500000
## 12 11 Banten 30 6400000 No 29.53659 6400000
## 14 13 DKI Jakarta 19 2200000 Yes 18.53659 2200000
## 15 14 DKI Jakarta 25 4500000 Yes 24.53659 4500000
Distribusi Gaussian
x_train <- read.csv("dataset.csv")
x_train <- na.omit(x_train) # menghilangkan na
# Flag outliers ("pencilan") under a Gaussian assumption: a value is an
# outlier when it lies more than `k` sample standard deviations away from the
# sample mean.
#
# x: numeric vector without missing values (an NA makes mean()/sd() NA, and
#    every flag NA).
# k: width of the accepted band in standard deviations; defaults to 1, the
#    cut-off that was previously hard-coded.
# Returns a logical vector of the same length as `x` (TRUE = outlier),
# suitable for row indexing. Empty input gives logical(0); the former
# sapply() version returned an empty list, which cannot be used as an index.
pencilan <- function(x, k = 1) {
  centre <- mean(x)
  cut_off <- sd(x) * k
  lower <- centre - cut_off
  upper <- centre + cut_off
  # Element-wise `|` builds the whole mask at once; no per-element loop needed.
  x < lower | x > upper
}
x_train[pencilan(x_train$Age),]## X Province Age Wage Life.insured
## 3 2 Jawa Barat 60 7350000 No
## 7 6 Banten 21 5500000 No
## 10 9 DKI Jakarta 51 10500000 Yes
## 14 13 DKI Jakarta 19 2200000 Yes
x_train[pencilan(x_train$Wage),]## X Province Age Wage Life.insured
## 4 3 Banten 34 3500000 No
## 8 7 Banten 44 10000000 Yes
## 10 9 DKI Jakarta 51 10500000 Yes
## 14 13 DKI Jakarta 19 2200000 Yes
x_train <- read.csv("dataset.csv")
# One statement per line: the three calls were fused together, which R cannot
# parse. Box plots give a visual check for outliers in Age and Wage.
boxplot(x_train$Age)
boxplot(x_train$Wage)
dim(x_train)## [1] 15 5
head(x_train, 5)## X Province Age Wage Life.insured
## 1 0 Banten 24 5000000 Yes
## 2 1 DKI Jakarta NA 3400000 No
## 3 2 Jawa Barat 60 7350000 No
## 4 3 Banten 34 3500000 No
## 5 4 Jawa Barat 58 NA Yes
x_Category<-Filter(is.character, x_train)
colSums(is.na(x_Category))## Province Life.insured
## 0 0
x_train <- read.csv("dataset.csv") #import data training X
library(superml)## Loading required package: R6
x_label <- LabelEncoder$new()
x_train$Province <- x_label$fit_transform(x_train$Province)
x_train$Life.insured <- x_label$fit_transform(x_train$Life.insured)
x_train## X Province Age Wage Life.insured
## 1 0 0 24 5000000 0
## 2 1 1 NA 3400000 1
## 3 2 2 60 7350000 1
## 4 3 0 34 3500000 1
## 5 4 2 58 NA 0
## 6 5 1 NA 8000000 1
## 7 6 0 21 5500000 1
## 8 7 0 44 10000000 0
## 9 8 2 40 9000000 0
## 10 9 1 51 10500000 0
## 11 10 0 32 NA 1
## 12 11 0 30 6400000 1
## 13 12 2 30 NA 1
## 14 13 1 19 2200000 0
## 15 14 1 25 4500000 0
x_train <- read.csv("dataset.csv")
# Encode the categorical columns as integer codes.
# Assigning a number into a character column (x[x == "Banten"] <- 1) stores
# the string "1", so the columns stayed character. match() returns the
# integer position of each value in the lookup vector instead.
# Any value not listed in the lookup becomes NA rather than being kept as text.
x_train$Province <- match(
  x_train$Province,
  c("Banten", "DKI Jakarta", "Jawa Barat")
)
x_train$Life.insured <- match(x_train$Life.insured, c("Yes", "No"))
print(x_train)## X Province Age Wage Life.insured
## 1 0 1 24 5000000 1
## 2 1 2 NA 3400000 2
## 3 2 3 60 7350000 2
## 4 3 1 34 3500000 2
## 5 4 3 58 NA 1
## 6 5 2 NA 8000000 2
## 7 6 1 21 5500000 2
## 8 7 1 44 10000000 1
## 9 8 3 40 9000000 1
## 10 9 2 51 10500000 1
## 11 10 1 32 NA 2
## 12 11 1 30 6400000 2
## 13 12 3 30 NA 2
## 14 13 2 19 2200000 1
## 15 14 2 25 4500000 1
# One statement per line: library() and the assignment were fused together,
# which R cannot parse.
library(fastDummies)
x_train <- read.csv("dataset.csv")
# Append one 0/1 indicator column per category of Province and Life.insured
# (see the printed result below); the original columns are kept.
x_train <- dummy_cols(x_train)
print(x_train)## X Province Age Wage Life.insured Province_Banten
## 1 0 Banten 24 5000000 Yes 1
## 2 1 DKI Jakarta NA 3400000 No 0
## 3 2 Jawa Barat 60 7350000 No 0
## 4 3 Banten 34 3500000 No 1
## 5 4 Jawa Barat 58 NA Yes 0
## 6 5 DKI Jakarta NA 8000000 No 0
## 7 6 Banten 21 5500000 No 1
## 8 7 Banten 44 10000000 Yes 1
## 9 8 Jawa Barat 40 9000000 Yes 0
## 10 9 DKI Jakarta 51 10500000 Yes 0
## 11 10 Banten 32 NA No 1
## 12 11 Banten 30 6400000 No 1
## 13 12 Jawa Barat 30 NA No 0
## 14 13 DKI Jakarta 19 2200000 Yes 0
## 15 14 DKI Jakarta 25 4500000 Yes 0
## Province_DKI Jakarta Province_Jawa Barat Life.insured_No Life.insured_Yes
## 1 0 0 0 1
## 2 1 0 1 0
## 3 0 1 1 0
## 4 0 0 1 0
## 5 0 1 0 1
## 6 1 0 1 0
## 7 0 0 1 0
## 8 0 0 0 1
## 9 0 1 0 1
## 10 1 0 0 1
## 11 0 0 1 0
## 12 0 0 1 0
## 13 0 1 1 0
## 14 1 0 0 1
## 15 1 0 0 1
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ISLR)
library(lattice)
x_train <- read.csv("dataset.csv")
x_train$Life.insured[x_train$Life.insured=="Yes"]<-1
x_train$Life.insured[x_train$Life.insured=="No"]<-2
# One statement per line: the two calls were fused together, which R cannot
# parse. Drop incomplete rows, then fix the RNG seed so the cross-validation
# folds are reproducible.
y_train <- na.omit(x_train)
set.seed(100)
dataset <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
nb_fit <- train(Age ~., data = y_train, method = "lm", trControl =dataset, tuneLength = 14)## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
nb_fit## Linear Regression
##
## 10 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9, 9, 9, 9, 9, 9, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 11.29568 NaN 11.29568
##
## Tuning parameter 'intercept' was held constant at a value of TRUE