Load Library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
Load Dataset Online Retail
library(readr)
retail <- read_csv("D:/EDA PROJECT/retail.csv")
## Rows: 50000 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): payment_method, region, customer_segment
## dbl (11): customer_id, age, annual_income, months_active, avg_monthly_spend,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
retail
## # A tibble: 50,000 × 14
## customer_id age annual_income months_active avg_monthly_spend
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 33554 53 100473. 63 121.
## 2 9428 54 54731. 67 573.
## 3 200 44 58268. 57 267.
## 4 12448 54 64830. 40 691.
## 5 39490 28 27431. 15 833.
## 6 42725 67 13511. 49 212.
## 7 10823 50 66877. 60 453.
## 8 49499 64 NA 59 194.
## 9 4145 46 62196. 53 275.
## 10 36959 50 34685. 39 926.
## # ℹ 49,990 more rows
## # ℹ 9 more variables: purchase_frequency <dbl>, avg_order_value <dbl>,
## # discount_usage_rate <dbl>, return_rate <dbl>, browsing_time_minutes <dbl>,
## # support_interactions <dbl>, payment_method <chr>, region <chr>,
## # customer_segment <chr>
summary(retail)
## customer_id age annual_income months_active
## Min. : 1 Min. :18 Min. : 2946 Min. : 1.00
## 1st Qu.:12501 1st Qu.:31 1st Qu.: 24373 1st Qu.:18.00
## Median :25001 Median :44 Median : 36788 Median :36.00
## Mean :25001 Mean :44 Mean : 44543 Mean :36.48
## 3rd Qu.:37500 3rd Qu.:57 3rd Qu.: 55497 3rd Qu.:55.00
## Max. :50000 Max. :70 Max. :530788 Max. :72.00
## NA's :3075
## avg_monthly_spend purchase_frequency avg_order_value discount_usage_rate
## Min. : 8.072 Min. : 0.01701 Min. : 0.9404 Min. :0.00053
## 1st Qu.: 192.075 1st Qu.: 2.45687 1st Qu.: 32.0185 1st Qu.:0.16050
## Median : 297.477 Median : 4.15867 Median : 56.6003 Median :0.26259
## Mean : 337.294 Mean : 4.91084 Mean : 77.3759 Mean :0.28436
## 3rd Qu.: 439.112 3rd Qu.: 6.52690 3rd Qu.: 98.8320 3rd Qu.:0.38764
## Max. :3026.342 Max. :67.68253 Max. :1289.4273 Max. :0.93465
## NA's :2520 NA's :1979 NA's :2549
## return_rate browsing_time_minutes support_interactions
## Min. :0.00005 Min. : 0.2361 Min. :0.000
## 1st Qu.:0.06394 1st Qu.: 31.0823 1st Qu.:1.000
## Median :0.12363 Median : 52.6792 Median :1.000
## Mean :0.14896 Mean : 61.7813 Mean :1.495
## 3rd Qu.:0.20961 3rd Qu.: 82.5496 3rd Qu.:2.000
## Max. :0.83040 Max. :406.8130 Max. :9.000
## NA's :2487 NA's :3934 NA's :1988
## payment_method region customer_segment
## Length:50000 Length:50000 Length:50000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
head(is.na(retail))
## customer_id age annual_income months_active avg_monthly_spend
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE FALSE
## purchase_frequency avg_order_value discount_usage_rate return_rate
## [1,] FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE
## browsing_time_minutes support_interactions payment_method region
## [1,] FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE FALSE
## customer_segment
## [1,] FALSE
## [2,] FALSE
## [3,] FALSE
## [4,] FALSE
## [5,] FALSE
## [6,] FALSE
colSums(is.na(retail))
## customer_id age annual_income
## 0 0 3075
## months_active avg_monthly_spend purchase_frequency
## 0 2520 1979
## avg_order_value discount_usage_rate return_rate
## 0 2549 2487
## browsing_time_minutes support_interactions payment_method
## 3934 1988 0
## region customer_segment
## 0 0
aggr(retail, numbers = TRUE, prop = FALSE)
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

Imputasi Missing Values
# Mengganti missing values dengan median di setiap kolom
retail$annual_income[is.na(retail$annual_income)] <- median(retail$annual_income, na.rm = TRUE)
retail$avg_monthly_spend[is.na(retail$avg_monthly_spend)] <- median(retail$avg_monthly_spend, na.rm = TRUE)
retail$purchase_frequency [is.na(retail$purchase_frequency )] <- median(retail$purchase_frequency, na.rm = TRUE)
retail$discount_usage_rate[is.na(retail$discount_usage_rate)] <- median(retail$discount_usage_rate, na.rm = TRUE)
retail$return_rate[is.na(retail$return_rate)] <- median(retail$return_rate, na.rm = TRUE)
retail$browsing_time_minutes[is.na(retail$browsing_time_minutes)] <- median(retail$browsing_time_minutes, na.rm = TRUE)
retail$support_interactions[is.na(retail$support_interactions)] <- median(retail$support_interactions, na.rm = TRUE)
retail
## # A tibble: 50,000 × 14
## customer_id age annual_income months_active avg_monthly_spend
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 33554 53 100473. 63 121.
## 2 9428 54 54731. 67 573.
## 3 200 44 58268. 57 267.
## 4 12448 54 64830. 40 691.
## 5 39490 28 27431. 15 833.
## 6 42725 67 13511. 49 212.
## 7 10823 50 66877. 60 453.
## 8 49499 64 36788. 59 194.
## 9 4145 46 62196. 53 275.
## 10 36959 50 34685. 39 926.
## # ℹ 49,990 more rows
## # ℹ 9 more variables: purchase_frequency <dbl>, avg_order_value <dbl>,
## # discount_usage_rate <dbl>, return_rate <dbl>, browsing_time_minutes <dbl>,
## # support_interactions <dbl>, payment_method <chr>, region <chr>,
## # customer_segment <chr>
colSums(is.na(retail))
## customer_id age annual_income
## 0 0 0
## months_active avg_monthly_spend purchase_frequency
## 0 0 0
## avg_order_value discount_usage_rate return_rate
## 0 0 0
## browsing_time_minutes support_interactions payment_method
## 0 0 0
## region customer_segment
## 0 0
summary(retail)
## customer_id age annual_income months_active
## Min. : 1 Min. :18 Min. : 2946 Min. : 1.00
## 1st Qu.:12501 1st Qu.:31 1st Qu.: 25113 1st Qu.:18.00
## Median :25001 Median :44 Median : 36788 Median :36.00
## Mean :25001 Mean :44 Mean : 44066 Mean :36.48
## 3rd Qu.:37500 3rd Qu.:57 3rd Qu.: 53830 3rd Qu.:55.00
## Max. :50000 Max. :70 Max. :530788 Max. :72.00
## avg_monthly_spend purchase_frequency avg_order_value discount_usage_rate
## Min. : 8.072 Min. : 0.01701 Min. : 0.9404 Min. :0.000534
## 1st Qu.: 197.221 1st Qu.: 2.52642 1st Qu.: 32.0185 1st Qu.:0.166100
## Median : 297.477 Median : 4.15867 Median : 56.6003 Median :0.262591
## Mean : 335.287 Mean : 4.88107 Mean : 77.3759 Mean :0.283248
## 3rd Qu.: 429.192 3rd Qu.: 6.39953 3rd Qu.: 98.8320 3rd Qu.:0.380198
## Max. :3026.342 Max. :67.68253 Max. :1289.4273 Max. :0.934655
## return_rate browsing_time_minutes support_interactions
## Min. :4.968e-05 Min. : 0.2361 Min. :0.000
## 1st Qu.:6.683e-02 1st Qu.: 32.8365 1st Qu.:1.000
## Median :1.236e-01 Median : 52.6792 Median :1.000
## Mean :1.477e-01 Mean : 61.0652 Mean :1.475
## 3rd Qu.:2.039e-01 3rd Qu.: 79.3675 3rd Qu.:2.000
## Max. :8.304e-01 Max. :406.8130 Max. :9.000
## payment_method region customer_segment
## Length:50000 Length:50000 Length:50000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
Penanganan Outlier pada kolom purchase_frequency
Q1 <- quantile(retail$purchase_frequency, 0.25)
Q3 <- quantile(retail$purchase_frequency, 0.75)
IQR <- Q3 - Q1
# Batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliersa <- retail$purchase_frequency < lower_bound
outliers <- retail$purchase_frequency > upper_bound
sum(outliersa)
## [1] 0
sum(outliers)
## [1] 1799
boxplot(retail$purchase_frequency, main = "Boxplot Purchase Frequency", col = "yellow")

retail$purchase_frequency[outliers] <- ifelse(retail$purchase_frequency[outliers] < lower_bound, lower_bound, upper_bound)
#Boxplot setelah penanganan outlier
boxplot(retail$purchase_frequency, main = "Boxplot Purchase Frequency", col = "yellow")

Duplicate Data
sum(duplicated(retail))
## [1] 0
Mengecek Data Setelah Preprocessing
summary(retail)
## customer_id age annual_income months_active
## Min. : 1 Min. :18 Min. : 2946 Min. : 1.00
## 1st Qu.:12501 1st Qu.:31 1st Qu.: 25113 1st Qu.:18.00
## Median :25001 Median :44 Median : 36788 Median :36.00
## Mean :25001 Mean :44 Mean : 44066 Mean :36.48
## 3rd Qu.:37500 3rd Qu.:57 3rd Qu.: 53830 3rd Qu.:55.00
## Max. :50000 Max. :70 Max. :530788 Max. :72.00
## avg_monthly_spend purchase_frequency avg_order_value discount_usage_rate
## Min. : 8.072 Min. : 0.01701 Min. : 0.9404 Min. :0.000534
## 1st Qu.: 197.221 1st Qu.: 2.52642 1st Qu.: 32.0185 1st Qu.:0.166100
## Median : 297.477 Median : 4.15867 Median : 56.6003 Median :0.262591
## Mean : 335.287 Mean : 4.76856 Mean : 77.3759 Mean :0.283248
## 3rd Qu.: 429.192 3rd Qu.: 6.39953 3rd Qu.: 98.8320 3rd Qu.:0.380198
## Max. :3026.342 Max. :12.20920 Max. :1289.4273 Max. :0.934655
## return_rate browsing_time_minutes support_interactions
## Min. :4.968e-05 Min. : 0.2361 Min. :0.000
## 1st Qu.:6.683e-02 1st Qu.: 32.8365 1st Qu.:1.000
## Median :1.236e-01 Median : 52.6792 Median :1.000
## Mean :1.477e-01 Mean : 61.0652 Mean :1.475
## 3rd Qu.:2.039e-01 3rd Qu.: 79.3675 3rd Qu.:2.000
## Max. :8.304e-01 Max. :406.8130 Max. :9.000
## payment_method region customer_segment
## Length:50000 Length:50000 Length:50000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##