R Markdown

# Use the CRAN cloud mirror so installation works without an interactive prompt.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
# Install readxl only when it is missing. Reinstalling a package that is
# already present produces "cannot remove prior installation" /
# "Permission denied" warnings like the ones captured below.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into 'C:/Users/Muhammad Hafizh Ilmi/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readxl'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Muhammad Hafizh
## Ilmi\AppData\Local\R\win-library\4.5\00LOCK\readxl\libs\x64\readxl.dll to
## C:\Users\Muhammad Hafizh
## Ilmi\AppData\Local\R\win-library\4.5\readxl\libs\x64\readxl.dll: Permission
## denied
## Warning: restored 'readxl'
## 
## The downloaded binary packages are in
##  C:\Users\Muhammad Hafizh Ilmi\AppData\Local\Temp\Rtmpeav9CD\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Read the insurance workbook from the working directory and preview its
# first six rows.
# NOTE(review): the preview shows bmi and charges parsed as <chr>, not <dbl>;
# confirm whether the workbook stores them as text before relying on them
# as numbers.
insurance <- read_xlsx(path = "insurance.xlsx")
head(insurance, n = 6L)
## # A tibble: 6 × 7
##     age sex    bmi   children smoker region    charges   
##   <dbl> <chr>  <chr>    <dbl> <chr>  <chr>     <chr>     
## 1    19 female 27.9         0 yes    southwest 16884924  
## 2    18 male   33.77        1 no     southeast 17255523  
## 3    28 male   33           3 no     southeast 4449462   
## 4    33 male   22705        0 no     northwest 2198447061
## 5    32 male   28.88        0 no     northwest 38668552  
## 6    31 female 25.74        0 no     southeast 37566216

# Look for missing values

# Read the workbook into `data`, the object the following steps work on.
data <- read_excel(path = "insurance.xlsx")
# Total number of NA cells in the whole table.
data |> is.na() |> sum()
## [1] 0
# Number of NA cells in each column.
data |> is.na() |> colSums()
##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0
# Number of NA cells in each row (one value per record).
data |> is.na() |> rowSums()
##    [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##   [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##   [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [630] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [667] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [704] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [741] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [778] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [815] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [852] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [889] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [926] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [963] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1000] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1037] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1074] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1111] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1148] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1185] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1222] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1259] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1296] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [1333] 0 0 0 0 0 0
# Show every row that contains at least one NA.
data[rowSums(is.na(data)) > 0, ]
## # A tibble: 0 × 7
## # ℹ 7 variables: age <dbl>, sex <chr>, bmi <chr>, children <dbl>, smoker <chr>,
## #   region <chr>, charges <chr>
# Percentage of NA values in each column.
100 * colMeans(is.na(data))
##      age      sex      bmi children   smoker   region  charges 
##        0        0        0        0        0        0        0

# Look for duplicates

# Number of rows that exactly repeat an earlier row.
data |> duplicated() |> sum()
## [1] 1
# The repeated rows themselves (first occurrences are not flagged).
data[duplicated(data), ]
## # A tibble: 1 × 7
##     age sex   bmi   children smoker region    charges 
##   <dbl> <chr> <chr>    <dbl> <chr>  <chr>     <chr>   
## 1    19 male  30.59        0 no     northwest 16395631
# Every copy of a duplicated row, including its first occurrence.
data[duplicated(data, fromLast = TRUE) | duplicated(data), ]
## # A tibble: 2 × 7
##     age sex   bmi   children smoker region    charges 
##   <dbl> <chr> <chr>    <dbl> <chr>  <chr>     <chr>   
## 1    19 male  30.59        0 no     northwest 16395631
## 2    19 male  30.59        0 no     northwest 16395631
# De-duplicated copy of the table; `data` itself is left unchanged.
data_no_duplicate <- data[!duplicated(data), ]
# How many age values repeat an earlier age.
sum(duplicated(data[["age"]]))
## [1] 1291
# How many (age, sex) combinations repeat an earlier combination.
sum(duplicated(data[c("age", "sex")]))
## [1] 1244

# Look for outliers

# First and third quartiles of age
Q1 <- quantile(data$age, 0.25, na.rm = TRUE)
Q3 <- quantile(data$age, 0.75, na.rm = TRUE)

# Interquartile range
IQR_value <- IQR(data$age, na.rm = TRUE)

# Lower and upper fences (1.5 * IQR rule)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Positions of the ages that fall outside the fences. which() discards NA
# comparisons, so a missing age cannot appear as an NA "outlier" or as an
# all-NA row; this keeps the subsetting consistent with na.rm = TRUE above.
outlier_idx <- which(data$age < lower_bound | data$age > upper_bound)

# Show the outliers
outlier_age <- data$age[outlier_idx]

outlier_age
## numeric(0)
data[outlier_idx, ]
## # A tibble: 0 × 7
## # ℹ 7 variables: age <dbl>, sex <chr>, bmi <chr>, children <dbl>, smoker <chr>,
## #   region <chr>, charges <chr>
# Cross-check with the outliers reported by boxplot.stats().
boxplot.stats(data$age)$out
## numeric(0)
# Keep only the numeric columns. vapply() always returns a logical vector
# (one flag per column), unlike sapply(), whose return type depends on the
# input.
# NOTE(review): the result below lists only age and children; bmi and charges
# are <chr> in this data, so they are not checked for outliers. Confirm
# whether they should be converted to numeric first.
numeric_data <- data[vapply(data, is.numeric, logical(1))]

# Outliers of each numeric column, as reported by boxplot.stats()
outliers <- lapply(numeric_data, function(x) {
  boxplot.stats(x)$out
})

outliers
## $age
## numeric(0)
## 
## $children
## numeric(0)
# Box plot of age
boxplot(data$age, main = "Boxplot Age")