library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
# Load the dataset into a tibble from a machine-specific absolute path.
# The first CSV column has no header, so readr auto-names it `...1`
# (see the "New names" message printed below).
clv_datauji <- read_csv("D:/EDA/Week 3/clv_datauji.csv")
## New names:
## Rows: 5000 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): gender, city dbl (6): ...1, id, age, income, days_on_platform, purchases
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first rows to check that the columns were parsed as expected.
head(clv_datauji)
## # A tibble: 6 × 8
## ...1 id age gender income days_on_platform city purchases
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 0 0 NA Male 126895 14 San Francisco 0
## 2 1 1 NA Male 161474 14 Tokyo 0
## 3 2 2 24 Male 104723 34 London 1
## 4 3 3 29 Male 43791 28 London 2
## 5 4 4 18 Female 132181 26 London 2
## 6 5 5 23 Male 12315 14 New York City 0
# Per-column summary (range, quartiles, NA counts) of the raw data.
summary(clv_datauji)
## ...1 id age gender
## Min. : 0 Min. : 0 Min. :10.0 Length:5000
## 1st Qu.:1250 1st Qu.:1250 1st Qu.:19.0 Class :character
## Median :2500 Median :2500 Median :30.0 Mode :character
## Mean :2500 Mean :2500 Mean :30.2
## 3rd Qu.:3749 3rd Qu.:3749 3rd Qu.:41.0
## Max. :4999 Max. :4999 Max. :50.0
## NA's :2446
## income days_on_platform city purchases
## Min. : 4 Min. : 1.00 Length:5000 Min. :0.000
## 1st Qu.: 32753 1st Qu.: 10.00 Class :character 1st Qu.:0.000
## Median : 65989 Median : 21.00 Mode :character Median :1.000
## Mean : 79593 Mean : 24.39 Mean :1.101
## 3rd Qu.:115407 3rd Qu.: 35.00 3rd Qu.:2.000
## Max. :388572 Max. :111.00 Max. :6.000
## NA's :141
# MISSING VALUES ----
# Count the missing values in each column.
colSums(is.na(clv_datauji))
## ...1 id age gender
## 0 0 2446 0
## income days_on_platform city purchases
## 0 141 0 0
# Visualisasi missing values
library(VIM)
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Plot the missing-value pattern with VIM::aggr().
# NOTE(review): numbers = TRUE / prop = FALSE presumably label the plot with
# absolute counts rather than proportions — confirm against the VIM docs.
aggr(clv_datauji, numbers = TRUE, prop = FALSE)

# Replace missing values with the median of the observed values of the same
# column.
# NOTE(review): the NA counts printed earlier show 2446 of 5000 `age` values
# missing; median imputation at that rate concentrates the distribution on a
# single value — confirm this is acceptable for the analysis.
clv_datauji$age <- replace(
  clv_datauji$age,
  is.na(clv_datauji$age),
  median(clv_datauji$age, na.rm = TRUE)
)
clv_datauji$days_on_platform <- replace(
  clv_datauji$days_on_platform,
  is.na(clv_datauji$days_on_platform),
  median(clv_datauji$days_on_platform, na.rm = TRUE)
)
# Print the tibble to inspect the imputed values.
clv_datauji
## # A tibble: 5,000 × 8
## ...1 id age gender income days_on_platform city purchases
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 0 0 30 Male 126895 14 San Francisco 0
## 2 1 1 30 Male 161474 14 Tokyo 0
## 3 2 2 24 Male 104723 34 London 1
## 4 3 3 29 Male 43791 28 London 2
## 5 4 4 18 Female 132181 26 London 2
## 6 5 5 23 Male 12315 14 New York City 0
## 7 6 6 30 Male 51679 30 Tokyo 0
## 8 7 7 30 Female 53341 43 San Francisco 1
## 9 8 8 46 Male 129157 23 New York City 0
## 10 9 9 49 Female 76842 19 Tokyo 2
## # ℹ 4,990 more rows
# Re-count missing values per column to verify that none remain.
colSums(is.na(clv_datauji))
## ...1 id age gender
## 0 0 0 0
## income days_on_platform city purchases
## 0 0 0 0
# Per-column summary again, to see how the distributions look at this point.
summary(clv_datauji)
## ...1 id age gender
## Min. : 0 Min. : 0 Min. :10.0 Length:5000
## 1st Qu.:1250 1st Qu.:1250 1st Qu.:30.0 Class :character
## Median :2500 Median :2500 Median :30.0 Mode :character
## Mean :2500 Mean :2500 Mean :30.1
## 3rd Qu.:3749 3rd Qu.:3749 3rd Qu.:31.0
## Max. :4999 Max. :4999 Max. :50.0
## income days_on_platform city purchases
## Min. : 4 Min. : 1.00 Length:5000 Min. :0.000
## 1st Qu.: 32753 1st Qu.: 10.00 Class :character 1st Qu.:0.000
## Median : 65989 Median : 21.00 Mode :character Median :1.000
## Mean : 79593 Mean : 24.29 Mean :1.101
## 3rd Qu.:115407 3rd Qu.: 34.00 3rd Qu.:2.000
## Max. :388572 Max. :111.00 Max. :6.000
# OUTLIER ----
# 1. Compute fixed fences from the income column before it is modified:
#    upper = Q3 + 1.5 * IQR, lower = Q1 - 1.5 * IQR.
#    `TRUE` is spelled out because the shorthand `T` is an ordinary variable
#    that can be reassigned.
batas_atas <- quantile(clv_datauji$income, 0.75, na.rm = TRUE) +
  1.5 * IQR(clv_datauji$income, na.rm = TRUE)
batas_bawah <- quantile(clv_datauji$income, 0.25, na.rm = TRUE) -
  1.5 * IQR(clv_datauji$income, na.rm = TRUE)
# 2. Capping: clamp every value into [batas_bawah, batas_atas]. Values inside
#    the fences are unchanged and NA stays NA.
clv_datauji$income <- pmin(pmax(clv_datauji$income, batas_bawah), batas_atas)
# Boxplot of income after capping, to check that no outliers remain.
boxplot(clv_datauji$income, main = "Boxplot", col = "lightblue")

# Fences for purchases: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
Q1 <- quantile(clv_datauji$purchases, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(clv_datauji$purchases, probs = 0.75, na.rm = TRUE)
IQR_value <- IQR(clv_datauji$purchases, na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Flag every value that is not inside [lower_bound, upper_bound].
outliers <- !(clv_datauji$purchases >= lower_bound &
  clv_datauji$purchases <= upper_bound)
# Winsorize: pull each flagged value back to the fence it crossed.
clv_datauji$purchases[outliers] <- pmin(
  pmax(clv_datauji$purchases[outliers], lower_bound),
  upper_bound
)
# Boxplot of purchases after winsorizing, to check for remaining outliers.
boxplot(clv_datauji$purchases, main = "Boxplot", col = "lightblue")

# Fences for days_on_platform: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
Q1 <- quantile(clv_datauji$days_on_platform, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(clv_datauji$days_on_platform, probs = 0.75, na.rm = TRUE)
IQR_value <- IQR(clv_datauji$days_on_platform, na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Flag every value that is not inside [lower_bound, upper_bound].
outliers <- !(clv_datauji$days_on_platform >= lower_bound &
  clv_datauji$days_on_platform <= upper_bound)
# Winsorize: pull each flagged value back to the fence it crossed.
clv_datauji$days_on_platform[outliers] <- pmin(
  pmax(clv_datauji$days_on_platform[outliers], lower_bound),
  upper_bound
)
# Boxplot of days_on_platform after winsorizing, to check for remaining
# outliers.
boxplot(clv_datauji$days_on_platform, main = "Boxplot", col = "lightblue")

# DUPLIKASI ----
# Count the rows that are exact duplicates of an earlier row.
# NOTE(review): `...1` and `id` look like per-row identifiers, so a comparison
# across all columns may never flag a duplicate — confirm whether they should
# be excluded from this check.
sum(duplicated(clv_datauji))
## [1] 0
# Drop duplicated rows, keeping the first occurrence of each. This must act on
# clv_datauji, the dataset checked above, not on the built-in `airquality`
# data.
clv_datauji <- clv_datauji[!duplicated(clv_datauji), ]