Load Library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep

Load Dataset Online Retail

library(readr)
retail <- read_csv("D:/EDA PROJECT/retail.csv")
## Rows: 50000 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): payment_method, region, customer_segment
## dbl (11): customer_id, age, annual_income, months_active, avg_monthly_spend,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
retail
## # A tibble: 50,000 × 14
##    customer_id   age annual_income months_active avg_monthly_spend
##          <dbl> <dbl>         <dbl>         <dbl>             <dbl>
##  1       33554    53       100473.            63              121.
##  2        9428    54        54731.            67              573.
##  3         200    44        58268.            57              267.
##  4       12448    54        64830.            40              691.
##  5       39490    28        27431.            15              833.
##  6       42725    67        13511.            49              212.
##  7       10823    50        66877.            60              453.
##  8       49499    64           NA             59              194.
##  9        4145    46        62196.            53              275.
## 10       36959    50        34685.            39              926.
## # ℹ 49,990 more rows
## # ℹ 9 more variables: purchase_frequency <dbl>, avg_order_value <dbl>,
## #   discount_usage_rate <dbl>, return_rate <dbl>, browsing_time_minutes <dbl>,
## #   support_interactions <dbl>, payment_method <chr>, region <chr>,
## #   customer_segment <chr>
summary(retail)
##   customer_id         age     annual_income    months_active  
##  Min.   :    1   Min.   :18   Min.   :  2946   Min.   : 1.00  
##  1st Qu.:12501   1st Qu.:31   1st Qu.: 24373   1st Qu.:18.00  
##  Median :25001   Median :44   Median : 36788   Median :36.00  
##  Mean   :25001   Mean   :44   Mean   : 44543   Mean   :36.48  
##  3rd Qu.:37500   3rd Qu.:57   3rd Qu.: 55497   3rd Qu.:55.00  
##  Max.   :50000   Max.   :70   Max.   :530788   Max.   :72.00  
##                               NA's   :3075                    
##  avg_monthly_spend  purchase_frequency avg_order_value     discount_usage_rate
##  Min.   :   8.072   Min.   : 0.01701   Min.   :   0.9404   Min.   :0.00053    
##  1st Qu.: 192.075   1st Qu.: 2.45687   1st Qu.:  32.0185   1st Qu.:0.16050    
##  Median : 297.477   Median : 4.15867   Median :  56.6003   Median :0.26259    
##  Mean   : 337.294   Mean   : 4.91084   Mean   :  77.3759   Mean   :0.28436    
##  3rd Qu.: 439.112   3rd Qu.: 6.52690   3rd Qu.:  98.8320   3rd Qu.:0.38764    
##  Max.   :3026.342   Max.   :67.68253   Max.   :1289.4273   Max.   :0.93465    
##  NA's   :2520       NA's   :1979                           NA's   :2549       
##   return_rate      browsing_time_minutes support_interactions
##  Min.   :0.00005   Min.   :  0.2361      Min.   :0.000       
##  1st Qu.:0.06394   1st Qu.: 31.0823      1st Qu.:1.000       
##  Median :0.12363   Median : 52.6792      Median :1.000       
##  Mean   :0.14896   Mean   : 61.7813      Mean   :1.495       
##  3rd Qu.:0.20961   3rd Qu.: 82.5496      3rd Qu.:2.000       
##  Max.   :0.83040   Max.   :406.8130      Max.   :9.000       
##  NA's   :2487      NA's   :3934          NA's   :1988        
##  payment_method        region          customer_segment  
##  Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
head(is.na(retail))
##      customer_id   age annual_income months_active avg_monthly_spend
## [1,]       FALSE FALSE         FALSE         FALSE             FALSE
## [2,]       FALSE FALSE         FALSE         FALSE             FALSE
## [3,]       FALSE FALSE         FALSE         FALSE             FALSE
## [4,]       FALSE FALSE         FALSE         FALSE             FALSE
## [5,]       FALSE FALSE         FALSE         FALSE             FALSE
## [6,]       FALSE FALSE         FALSE         FALSE             FALSE
##      purchase_frequency avg_order_value discount_usage_rate return_rate
## [1,]              FALSE           FALSE               FALSE       FALSE
## [2,]              FALSE           FALSE               FALSE       FALSE
## [3,]              FALSE           FALSE               FALSE       FALSE
## [4,]              FALSE           FALSE               FALSE       FALSE
## [5,]              FALSE           FALSE               FALSE       FALSE
## [6,]              FALSE           FALSE               FALSE       FALSE
##      browsing_time_minutes support_interactions payment_method region
## [1,]                 FALSE                FALSE          FALSE  FALSE
## [2,]                 FALSE                FALSE          FALSE  FALSE
## [3,]                 FALSE                FALSE          FALSE  FALSE
## [4,]                 FALSE                FALSE          FALSE  FALSE
## [5,]                 FALSE                FALSE          FALSE  FALSE
## [6,]                 FALSE                FALSE          FALSE  FALSE
##      customer_segment
## [1,]            FALSE
## [2,]            FALSE
## [3,]            FALSE
## [4,]            FALSE
## [5,]            FALSE
## [6,]            FALSE
colSums(is.na(retail))
##           customer_id                   age         annual_income 
##                     0                     0                  3075 
##         months_active     avg_monthly_spend    purchase_frequency 
##                     0                  2520                  1979 
##       avg_order_value   discount_usage_rate           return_rate 
##                     0                  2549                  2487 
## browsing_time_minutes  support_interactions        payment_method 
##                  3934                  1988                     0 
##                region      customer_segment 
##                     0                     0
aggr(retail, numbers = TRUE, prop = FALSE)
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

Imputasi Missing Values

# Mengganti missing values dengan median di setiap kolom
retail$annual_income[is.na(retail$annual_income)] <- median(retail$annual_income, na.rm = TRUE)

retail$avg_monthly_spend[is.na(retail$avg_monthly_spend)] <- median(retail$avg_monthly_spend, na.rm = TRUE)

retail$purchase_frequency [is.na(retail$purchase_frequency )] <- median(retail$purchase_frequency, na.rm = TRUE)

retail$discount_usage_rate[is.na(retail$discount_usage_rate)] <- median(retail$discount_usage_rate, na.rm = TRUE)

retail$return_rate[is.na(retail$return_rate)] <- median(retail$return_rate, na.rm = TRUE)

retail$browsing_time_minutes[is.na(retail$browsing_time_minutes)] <- median(retail$browsing_time_minutes, na.rm = TRUE)

retail$support_interactions[is.na(retail$support_interactions)] <- median(retail$support_interactions, na.rm = TRUE)

retail
## # A tibble: 50,000 × 14
##    customer_id   age annual_income months_active avg_monthly_spend
##          <dbl> <dbl>         <dbl>         <dbl>             <dbl>
##  1       33554    53       100473.            63              121.
##  2        9428    54        54731.            67              573.
##  3         200    44        58268.            57              267.
##  4       12448    54        64830.            40              691.
##  5       39490    28        27431.            15              833.
##  6       42725    67        13511.            49              212.
##  7       10823    50        66877.            60              453.
##  8       49499    64        36788.            59              194.
##  9        4145    46        62196.            53              275.
## 10       36959    50        34685.            39              926.
## # ℹ 49,990 more rows
## # ℹ 9 more variables: purchase_frequency <dbl>, avg_order_value <dbl>,
## #   discount_usage_rate <dbl>, return_rate <dbl>, browsing_time_minutes <dbl>,
## #   support_interactions <dbl>, payment_method <chr>, region <chr>,
## #   customer_segment <chr>
colSums(is.na(retail))
##           customer_id                   age         annual_income 
##                     0                     0                     0 
##         months_active     avg_monthly_spend    purchase_frequency 
##                     0                     0                     0 
##       avg_order_value   discount_usage_rate           return_rate 
##                     0                     0                     0 
## browsing_time_minutes  support_interactions        payment_method 
##                     0                     0                     0 
##                region      customer_segment 
##                     0                     0
summary(retail)
##   customer_id         age     annual_income    months_active  
##  Min.   :    1   Min.   :18   Min.   :  2946   Min.   : 1.00  
##  1st Qu.:12501   1st Qu.:31   1st Qu.: 25113   1st Qu.:18.00  
##  Median :25001   Median :44   Median : 36788   Median :36.00  
##  Mean   :25001   Mean   :44   Mean   : 44066   Mean   :36.48  
##  3rd Qu.:37500   3rd Qu.:57   3rd Qu.: 53830   3rd Qu.:55.00  
##  Max.   :50000   Max.   :70   Max.   :530788   Max.   :72.00  
##  avg_monthly_spend  purchase_frequency avg_order_value     discount_usage_rate
##  Min.   :   8.072   Min.   : 0.01701   Min.   :   0.9404   Min.   :0.000534   
##  1st Qu.: 197.221   1st Qu.: 2.52642   1st Qu.:  32.0185   1st Qu.:0.166100   
##  Median : 297.477   Median : 4.15867   Median :  56.6003   Median :0.262591   
##  Mean   : 335.287   Mean   : 4.88107   Mean   :  77.3759   Mean   :0.283248   
##  3rd Qu.: 429.192   3rd Qu.: 6.39953   3rd Qu.:  98.8320   3rd Qu.:0.380198   
##  Max.   :3026.342   Max.   :67.68253   Max.   :1289.4273   Max.   :0.934655   
##   return_rate        browsing_time_minutes support_interactions
##  Min.   :4.968e-05   Min.   :  0.2361      Min.   :0.000       
##  1st Qu.:6.683e-02   1st Qu.: 32.8365      1st Qu.:1.000       
##  Median :1.236e-01   Median : 52.6792      Median :1.000       
##  Mean   :1.477e-01   Mean   : 61.0652      Mean   :1.475       
##  3rd Qu.:2.039e-01   3rd Qu.: 79.3675      3rd Qu.:2.000       
##  Max.   :8.304e-01   Max.   :406.8130      Max.   :9.000       
##  payment_method        region          customer_segment  
##  Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

Penanganan Outlier pada kolom purchase_frequency

Q1 <- quantile(retail$purchase_frequency, 0.25)
Q3 <- quantile(retail$purchase_frequency, 0.75)
IQR <- Q3 - Q1

# Batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliersa <- retail$purchase_frequency < lower_bound
outliers <- retail$purchase_frequency > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 1799
boxplot(retail$purchase_frequency, main = "Boxplot Purchase Frequency", col = "yellow")

retail$purchase_frequency[outliers] <- ifelse(retail$purchase_frequency[outliers] < lower_bound, lower_bound, upper_bound)
#Boxplot setelah penanganan outlier
boxplot(retail$purchase_frequency, main = "Boxplot Purchase Frequency", col = "yellow")

Duplicate Data

sum(duplicated(retail))
## [1] 0

Mengecek Data Setelah Preprocessing

summary(retail)
##   customer_id         age     annual_income    months_active  
##  Min.   :    1   Min.   :18   Min.   :  2946   Min.   : 1.00  
##  1st Qu.:12501   1st Qu.:31   1st Qu.: 25113   1st Qu.:18.00  
##  Median :25001   Median :44   Median : 36788   Median :36.00  
##  Mean   :25001   Mean   :44   Mean   : 44066   Mean   :36.48  
##  3rd Qu.:37500   3rd Qu.:57   3rd Qu.: 53830   3rd Qu.:55.00  
##  Max.   :50000   Max.   :70   Max.   :530788   Max.   :72.00  
##  avg_monthly_spend  purchase_frequency avg_order_value     discount_usage_rate
##  Min.   :   8.072   Min.   : 0.01701   Min.   :   0.9404   Min.   :0.000534   
##  1st Qu.: 197.221   1st Qu.: 2.52642   1st Qu.:  32.0185   1st Qu.:0.166100   
##  Median : 297.477   Median : 4.15867   Median :  56.6003   Median :0.262591   
##  Mean   : 335.287   Mean   : 4.76856   Mean   :  77.3759   Mean   :0.283248   
##  3rd Qu.: 429.192   3rd Qu.: 6.39953   3rd Qu.:  98.8320   3rd Qu.:0.380198   
##  Max.   :3026.342   Max.   :12.20920   Max.   :1289.4273   Max.   :0.934655   
##   return_rate        browsing_time_minutes support_interactions
##  Min.   :4.968e-05   Min.   :  0.2361      Min.   :0.000       
##  1st Qu.:6.683e-02   1st Qu.: 32.8365      1st Qu.:1.000       
##  Median :1.236e-01   Median : 52.6792      Median :1.000       
##  Mean   :1.477e-01   Mean   : 61.0652      Mean   :1.475       
##  3rd Qu.:2.039e-01   3rd Qu.: 79.3675      3rd Qu.:2.000       
##  Max.   :8.304e-01   Max.   :406.8130      Max.   :9.000       
##  payment_method        region          customer_segment  
##  Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##