# Point R at the CRAN cloud mirror so package installation needs no prompt.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
# Install readxl only when it is not already available. Reinstalling over an
# existing copy can fail with "cannot remove prior installation" /
# "Permission denied" warnings, as the log below shows.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into 'C:/Users/Muhammad Hafizh Ilmi/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Muhammad Hafizh
## Ilmi\AppData\Local\R\win-library\4.5\00LOCK\readxl\libs\x64\readxl.dll to
## C:\Users\Muhammad Hafizh
## Ilmi\AppData\Local\R\win-library\4.5\readxl\libs\x64\readxl.dll: Permission
## denied
## Warning: restored 'readxl'
##
## The downloaded binary packages are in
## C:\Users\Muhammad Hafizh Ilmi\AppData\Local\Temp\Rtmpeav9CD\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Load the insurance workbook and preview its first six rows.
# NOTE(review): the preview below shows bmi and charges typed as <chr> rather
# than numeric -- confirm the workbook's cell formats before doing arithmetic.
insurance <- readxl::read_xlsx(path = "insurance.xlsx")
head(insurance, n = 6L)
## # A tibble: 6 × 7
## age sex bmi children smoker region charges
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 19 female 27.9 0 yes southwest 16884924
## 2 18 male 33.77 1 no southeast 17255523
## 3 28 male 33 3 no southeast 4449462
## 4 33 male 22705 0 no northwest 2198447061
## 5 32 male 28.88 0 no northwest 38668552
## 6 31 female 25.74 0 no southeast 37566216
# Missing-value check ----
# NOTE(review): this reads the same workbook that was already loaded into
# `insurance`; the separate `data` object is kept because later code uses it.
data <- read_excel("insurance.xlsx")
# Build the NA indicator matrix once and reuse it for every summary below.
na_mask <- is.na(data)
# Total number of missing cells in the whole table.
sum(na_mask)
## [1] 0
# Number of missing cells per column.
colSums(na_mask)
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
# Distribution of missing cells per row. Tabulated instead of printing one
# count per row: the full vector was 1338 zeros and unreadable.
table(rowSums(na_mask))
# Rows that contain at least one missing value.
data[!complete.cases(data), ]
## # A tibble: 0 × 7
## # ℹ 7 variables: age <dbl>, sex <chr>, bmi <chr>, children <dbl>, smoker <chr>,
## # region <chr>, charges <chr>
# Percentage of missing cells per column.
colMeans(na_mask) * 100
## age sex bmi children smoker region charges
## 0 0 0 0 0 0 0
# Duplicate check ----
# Flag each row that repeats an earlier row (the first occurrence is not
# flagged).
dup_of_earlier <- duplicated(data)
# Number of fully duplicated rows.
sum(dup_of_earlier)
## [1] 1
# The repeated rows themselves.
data[dup_of_earlier, ]
## # A tibble: 1 × 7
## age sex bmi children smoker region charges
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 19 male 30.59 0 no northwest 16395631
# Every member of a duplicate group, first occurrence included.
dup_of_later <- duplicated(data, fromLast = TRUE)
data[dup_of_earlier | dup_of_later, ]
## # A tibble: 2 × 7
## age sex bmi children smoker region charges
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 19 male 30.59 0 no northwest 16395631
## 2 19 male 30.59 0 no northwest 16395631
# Copy of the data with the repeated rows dropped.
data_no_duplicate <- data[!dup_of_earlier, ]
# Repeated values when looking at age alone ...
sum(duplicated(data[["age"]]))
## [1] 1291
# ... and when looking at the (age, sex) combination.
sum(duplicated(data[c("age", "sex")]))
## [1] 1244
# Outlier check for age (1.5 * IQR rule) ----
# First and third quartiles.
Q1 <- quantile(data$age, 0.25, na.rm = TRUE)
Q3 <- quantile(data$age, 0.75, na.rm = TRUE)
# Interquartile range.
IQR_value <- IQR(data$age, na.rm = TRUE)
# Lower and upper fences.
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Positions of ages outside the fences. which() drops NA comparisons, so a
# missing age is skipped (consistent with na.rm = TRUE above) instead of
# producing NA entries or all-NA rows in the subsets below.
age_outlier_rows <- which(data$age < lower_bound | data$age > upper_bound)
# Outlying age values.
outlier_age <- data$age[age_outlier_rows]
outlier_age
## numeric(0)
# Full records of the outlying rows.
data[age_outlier_rows, ]
## # A tibble: 0 × 7
## # ℹ 7 variables: age <dbl>, sex <chr>, bmi <chr>, children <dbl>, smoker <chr>,
## # region <chr>, charges <chr>
# Cross-check: values that boxplot.stats() reports as lying beyond its whiskers.
boxplot.stats(data$age)$out
## numeric(0)
# Outlier check for every numeric column ----
# Keep only the numeric columns. vapply() always yields a logical vector,
# whereas sapply() returns list() for a zero-column input, which cannot be
# used as a column index.
# NOTE(review): bmi and charges are read as <chr>, so they are excluded here --
# confirm whether they should be converted to numeric before this step.
numeric_data <- data[vapply(data, is.numeric, logical(1))]
# For each numeric column, collect the values boxplot.stats() reports in $out.
outliers <- lapply(numeric_data, function(x) {
  boxplot.stats(x)$out
})
outliers
## $age
## numeric(0)
##
## $children
## numeric(0)
# Boxplot of age.
boxplot(data$age, main = "Boxplot Age")