library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.2
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
# Load the customer dataset from a local CSV file into a tibble.
# NOTE(review): this is an absolute Windows path, so the script only runs on a
# machine that has this exact folder; consider a project-relative path.
# NOTE(review): the CSV's first column has no header and readr renames it to
# `...1`; it looks like a saved row index that mirrors `id` (confirm).
clv_datauji <- read_csv("D:/EDA/Week 3/clv_datauji.csv")
## New names:
## Rows: 5000 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): gender, city dbl (6): ...1, id, age, income, days_on_platform, purchases
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first rows to check the column names and types.
head(clv_datauji)
## # A tibble: 6 × 8
##    ...1    id   age gender income days_on_platform city          purchases
##   <dbl> <dbl> <dbl> <chr>   <dbl>            <dbl> <chr>             <dbl>
## 1     0     0    NA Male   126895               14 San Francisco         0
## 2     1     1    NA Male   161474               14 Tokyo                 0
## 3     2     2    24 Male   104723               34 London                1
## 4     3     3    29 Male    43791               28 London                2
## 5     4     4    18 Female 132181               26 London                2
## 6     5     5    23 Male    12315               14 New York City         0
# Per-column summary statistics of the raw data; the "NA's" entries show which
# numeric columns contain missing values.
summary(clv_datauji)
##       ...1            id            age          gender         
##  Min.   :   0   Min.   :   0   Min.   :10.0   Length:5000       
##  1st Qu.:1250   1st Qu.:1250   1st Qu.:19.0   Class :character  
##  Median :2500   Median :2500   Median :30.0   Mode  :character  
##  Mean   :2500   Mean   :2500   Mean   :30.2                     
##  3rd Qu.:3749   3rd Qu.:3749   3rd Qu.:41.0                     
##  Max.   :4999   Max.   :4999   Max.   :50.0                     
##                                NA's   :2446                     
##      income       days_on_platform     city             purchases    
##  Min.   :     4   Min.   :  1.00   Length:5000        Min.   :0.000  
##  1st Qu.: 32753   1st Qu.: 10.00   Class :character   1st Qu.:0.000  
##  Median : 65989   Median : 21.00   Mode  :character   Median :1.000  
##  Mean   : 79593   Mean   : 24.39                      Mean   :1.101  
##  3rd Qu.:115407   3rd Qu.: 35.00                      3rd Qu.:2.000  
##  Max.   :388572   Max.   :111.00                      Max.   :6.000  
##                   NA's   :141

MISSING VALUES

# Count the missing values in each column.
colSums(is.na(clv_datauji))
##             ...1               id              age           gender 
##                0                0             2446                0 
##           income days_on_platform             city        purchases 
##                0              141                0                0
# Visualize the missing values (the VIM package is loaded for aggr()).
library(VIM) 
## Warning: package 'VIM' was built under R version 4.5.2
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.5.2
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Plot the missingness per column and per combination of columns.
# prop = FALSE requests absolute counts rather than proportions, and
# numbers = TRUE prints the values next to the combinations.
aggr(clv_datauji, numbers = TRUE, prop = FALSE)

# Impute the missing values in age and days_on_platform with each column's own
# median, computed from the observed (non-missing) values.
# NOTE(review): the summary of the raw data reports 2446 NAs in age out of
# 5000 rows, so close to half of the ages end up equal to the median after
# this step; confirm that this is acceptable for the analysis.
clv_datauji$age <- replace(
  clv_datauji$age,
  is.na(clv_datauji$age),
  median(clv_datauji$age, na.rm = TRUE)
)
clv_datauji$days_on_platform <- replace(
  clv_datauji$days_on_platform,
  is.na(clv_datauji$days_on_platform),
  median(clv_datauji$days_on_platform, na.rm = TRUE)
)

# Print the dataset after imputation.
clv_datauji
## # A tibble: 5,000 × 8
##     ...1    id   age gender income days_on_platform city          purchases
##    <dbl> <dbl> <dbl> <chr>   <dbl>            <dbl> <chr>             <dbl>
##  1     0     0    30 Male   126895               14 San Francisco         0
##  2     1     1    30 Male   161474               14 Tokyo                 0
##  3     2     2    24 Male   104723               34 London                1
##  4     3     3    29 Male    43791               28 London                2
##  5     4     4    18 Female 132181               26 London                2
##  6     5     5    23 Male    12315               14 New York City         0
##  7     6     6    30 Male    51679               30 Tokyo                 0
##  8     7     7    30 Female  53341               43 San Francisco         1
##  9     8     8    46 Male   129157               23 New York City         0
## 10     9     9    49 Female  76842               19 Tokyo                 2
## # ℹ 4,990 more rows
# Re-count the missing values per column to confirm none remain.
colSums(is.na(clv_datauji))
##             ...1               id              age           gender 
##                0                0                0                0 
##           income days_on_platform             city        purchases 
##                0                0                0                0
# Summary statistics after imputation, for comparison with the raw summary.
summary(clv_datauji)
##       ...1            id            age          gender         
##  Min.   :   0   Min.   :   0   Min.   :10.0   Length:5000       
##  1st Qu.:1250   1st Qu.:1250   1st Qu.:30.0   Class :character  
##  Median :2500   Median :2500   Median :30.0   Mode  :character  
##  Mean   :2500   Mean   :2500   Mean   :30.1                     
##  3rd Qu.:3749   3rd Qu.:3749   3rd Qu.:31.0                     
##  Max.   :4999   Max.   :4999   Max.   :50.0                     
##      income       days_on_platform     city             purchases    
##  Min.   :     4   Min.   :  1.00   Length:5000        Min.   :0.000  
##  1st Qu.: 32753   1st Qu.: 10.00   Class :character   1st Qu.:0.000  
##  Median : 65989   Median : 21.00   Mode  :character   Median :1.000  
##  Mean   : 79593   Mean   : 24.29                      Mean   :1.101  
##  3rd Qu.:115407   3rd Qu.: 34.00                      3rd Qu.:2.000  
##  Max.   :388572   Max.   :111.00                      Max.   :6.000

OUTLIER

# 1. Compute fixed IQR fences for income (Q1 - 1.5 * IQR and Q3 + 1.5 * IQR)
#    from the data as it stands before capping.
batas_atas <- quantile(clv_datauji$income, 0.75, na.rm = TRUE) +
  1.5 * IQR(clv_datauji$income, na.rm = TRUE)
batas_bawah <- quantile(clv_datauji$income, 0.25, na.rm = TRUE) -
  1.5 * IQR(clv_datauji$income, na.rm = TRUE)

# 2. Cap income at the fences: values above the upper fence become the upper
#    fence and values below the lower fence become the lower fence.
#    pmax()/pmin() clamp the whole vector at once and keep NA entries as NA.
clv_datauji$income <- pmin(pmax(clv_datauji$income, batas_bawah), batas_atas)

# Boxplot of income, drawn after capping, to inspect any remaining outliers.
boxplot(clv_datauji$income, main = "Boxplot", col = "lightblue")

# Compute the IQR fences for purchases.
Q1 <- quantile(clv_datauji$purchases, 0.25, na.rm = TRUE)
Q3 <- quantile(clv_datauji$purchases, 0.75, na.rm = TRUE)
IQR_value <- IQR(clv_datauji$purchases, na.rm = TRUE)

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Flag outliers: values below the lower fence or above the upper fence.
outliers <- clv_datauji$purchases < lower_bound |
  clv_datauji$purchases > upper_bound

# Winsorize: clamp purchases into [lower_bound, upper_bound]. Clamping the
# whole vector with pmax()/pmin() changes only the flagged values and, unlike
# assigning through the logical mask, does not fail when the column has NAs.
clv_datauji$purchases <- pmin(
  pmax(clv_datauji$purchases, lower_bound),
  upper_bound
)

# Boxplot of purchases, drawn after winsorizing, to inspect remaining outliers.
boxplot(clv_datauji$purchases, main = "Boxplot", col = "lightblue")

# Compute the IQR fences for days_on_platform.
Q1 <- quantile(clv_datauji$days_on_platform, 0.25, na.rm = TRUE)
Q3 <- quantile(clv_datauji$days_on_platform, 0.75, na.rm = TRUE)
IQR_value <- IQR(clv_datauji$days_on_platform, na.rm = TRUE)

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Flag outliers: values below the lower fence or above the upper fence.
outliers <- clv_datauji$days_on_platform < lower_bound |
  clv_datauji$days_on_platform > upper_bound

# Winsorize: clamp days_on_platform into [lower_bound, upper_bound]. Clamping
# the whole vector with pmax()/pmin() changes only the flagged values and,
# unlike assigning through the logical mask, does not fail when the column
# still contains NAs.
clv_datauji$days_on_platform <- pmin(
  pmax(clv_datauji$days_on_platform, lower_bound),
  upper_bound
)

# Boxplot of days_on_platform, drawn after winsorizing, to inspect remaining
# outliers.
boxplot(clv_datauji$days_on_platform, main = "Boxplot", col = "lightblue")

DUPLIKASI

# Count the fully duplicated rows in the dataset.
# NOTE(review): the `...1` and `id` columns appear to be unique per row, so a
# whole-row comparison may never flag repeated customers; confirm whether the
# check should ignore those columns.
sum(duplicated(clv_datauji))  # number of duplicated rows
## [1] 0
# Drop duplicated rows, if any, keeping the first occurrence of each.
# This must operate on clv_datauji itself (not on the built-in airquality
# dataset) so that the cleaned data is what gets de-duplicated.
clv_datauji <- clv_datauji[!duplicated(clv_datauji), ]  # keep only unique rows