library(readr)
# Load the raw churn dataset into a tibble.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists; consider a project-relative path.
churn <- read_csv("C:/Users/User/Downloads/churn.csv")
## Rows: 36992 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): gender, security_no, region_category, membership_category, joinin...
## dbl (6): age, days_since_last_login, avg_time_spent, avg_transaction_value...
## time (1): last_visit_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the imported data: column names, parsed types and the first values.
str(churn)
## spc_tbl_ [36,992 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:36992] 18 32 44 37 31 13 21 42 44 45 ...
## $ gender : chr [1:36992] "F" "F" "F" "M" ...
## $ security_no : chr [1:36992] "XW0DQ7H" "5K0N3X1" "1F2TCL3" "VJGJ33N" ...
## $ region_category : chr [1:36992] "Village" "City" "Town" "City" ...
## $ membership_category : chr [1:36992] "Platinum Membership" "Premium Membership" "No Membership" "No Membership" ...
## $ joining_date : chr [1:36992] "17-08-2017" "28-08-2017" "11-11-2016" "29-10-2016" ...
## $ joined_through_referral : chr [1:36992] "No" "?" "Yes" "Yes" ...
## $ referral_id : chr [1:36992] "xxxxxxxx" "CID21329" "CID12313" "CID3793" ...
## $ preferred_offer_types : chr [1:36992] "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" ...
## $ medium_of_operation : chr [1:36992] "?" "Desktop" "Desktop" "Desktop" ...
## $ internet_option : chr [1:36992] "Wi-Fi" "Mobile_Data" "Wi-Fi" "Mobile_Data" ...
## $ last_visit_time : 'hms' num [1:36992] 16:08:02 12:38:13 22:53:21 15:57:50 ...
## ..- attr(*, "units")= chr "secs"
## $ days_since_last_login : num [1:36992] 17 16 14 11 20 23 10 19 15 10 ...
## $ avg_time_spent : num [1:36992] 300.6 306.3 516.2 53.3 113.1 ...
## $ avg_transaction_value : num [1:36992] 53005 12838 21027 25240 24484 ...
## $ avg_frequency_login_days : chr [1:36992] "17" "10" "22" "6" ...
## $ points_in_wallet : num [1:36992] 782 NA 501 568 663 ...
## $ used_special_discount : chr [1:36992] "Yes" "Yes" "No" "No" ...
## $ offer_application_preference: chr [1:36992] "Yes" "No" "Yes" "Yes" ...
## $ past_complaint : chr [1:36992] "No" "Yes" "Yes" "Yes" ...
## $ complaint_status : chr [1:36992] "Not Applicable" "Solved" "Solved in Follow-up" "Unsolved" ...
## $ feedback : chr [1:36992] "Products always in Stock" "Quality Customer Care" "Poor Website" "Poor Website" ...
## $ churn_risk_score : num [1:36992] 0 0 1 1 1 0 0 1 0 1 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. gender = col_character(),
## .. security_no = col_character(),
## .. region_category = col_character(),
## .. membership_category = col_character(),
## .. joining_date = col_character(),
## .. joined_through_referral = col_character(),
## .. referral_id = col_character(),
## .. preferred_offer_types = col_character(),
## .. medium_of_operation = col_character(),
## .. internet_option = col_character(),
## .. last_visit_time = col_time(format = ""),
## .. days_since_last_login = col_double(),
## .. avg_time_spent = col_double(),
## .. avg_transaction_value = col_double(),
## .. avg_frequency_login_days = col_character(),
## .. points_in_wallet = col_double(),
## .. used_special_discount = col_character(),
## .. offer_application_preference = col_character(),
## .. past_complaint = col_character(),
## .. complaint_status = col_character(),
## .. feedback = col_character(),
## .. churn_risk_score = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
### Exploratory Data Analysis
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
# DataExplorer overview plot of the raw data, drawn before any cleaning so it
# can be compared with the same plot after preprocessing.
plot_intro(churn)

library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Percentage of rows in each churn_risk_score class, shown as a pie chart.
# Rows are sorted in descending class order so that the cumulative label
# midpoints (lab.ypos) follow the order in which the bars are stacked.
plotdata <- churn %>%
  count(churn_risk_score) %>%
  arrange(desc(churn_risk_score)) %>%
  mutate(
    prop = round(n * 100 / sum(n), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop
  )
# churn_risk_score is a class label stored as a number; map it as a factor so
# ggplot2 uses a discrete fill scale (one colour per class) instead of a
# continuous gradient legend. The legend title is kept as the column name.
ggplot(plotdata, aes(x = "", y = prop, fill = factor(churn_risk_score))) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.ypos, label = prop), color = "black") +
  theme_void() +
  labs(title = "Persentase churn_risk_score", fill = "churn_risk_score")

# Boxplots of the continuous columns, split by churn_risk_score.
# The 3443 rows reported as removed below equal the number of missing
# points_in_wallet values found by the NA count further down, so the warning
# presumably comes from that column.
plot_boxplot(churn,by='churn_risk_score')
## Warning: Removed 3443 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Count the missing values in every column
colSums(is.na(churn)) ## region_category: > 5000 missing, so recode to an unknown/other category; preferred_offer_types -> mode; points_in_wallet -> median
## age gender
## 0 0
## security_no region_category
## 0 5428
## membership_category joining_date
## 0 0
## joined_through_referral referral_id
## 0 0
## preferred_offer_types medium_of_operation
## 288 0
## internet_option last_visit_time
## 0 0
## days_since_last_login avg_time_spent
## 0 0
## avg_transaction_value avg_frequency_login_days
## 0 0
## points_in_wallet used_special_discount
## 3443 0
## offer_application_preference past_complaint
## 0 0
## complaint_status feedback
## 0 0
## churn_risk_score
## 0
# Count placeholder / invalid values per column to decide how each is handled.
sum(churn$gender == "Unknown") ## only a few rows, so impute with the mode
## [1] 59
sum(churn$joined_through_referral == "?") ## many rows (> 5000), so keep them as a separate "unknown" category
## [1] 5438
sum(churn$medium_of_operation == "?") ## same: > 5000 rows
## [1] 5393
sum(churn$days_since_last_login < 0, na.rm = TRUE) # this is about logins, so negative values are set to zero
## [1] 1999
sum(churn$avg_time_spent < 0, na.rm = TRUE) ## -> median
## [1] 1719
# avg_frequency_login_days is still a character column at this point, so a
# direct `< 0` would be a locale-dependent string comparison. Convert the
# non-"Error" entries to numbers first and compare numerically.
sum(
  as.numeric(
    churn$avg_frequency_login_days[churn$avg_frequency_login_days != "Error"]
  ) < 0,
  na.rm = TRUE
) ## -> median
## [1] 683
sum(churn$avg_frequency_login_days == 'Error') ## -> NA -> median
## [1] 3522
### Preprocessing Data
## Variables that only need their placeholder values relabelled
# Missing regions become their own "Lainnya" (other) category.
churn$region_category[is.na(churn$region_category)] <- "Lainnya"
# "Error" is a placeholder, not a real frequency. Mark it as missing so the
# median imputation further down fills it in; recoding it to 0 would add
# thousands of fake zero observations that are never imputed.
churn$avg_frequency_login_days[churn$avg_frequency_login_days == "Error"] <- NA_character_
# The column was read as character only because of the "Error" entries.
churn$avg_frequency_login_days <- as.numeric(churn$avg_frequency_login_days)
# "?" placeholders become the same "Lainnya" (other) category.
churn$joined_through_referral[churn$joined_through_referral == "?"] <- "Lainnya"
churn$medium_of_operation[churn$medium_of_operation == "?"] <- "Lainnya"
# Negative day counts are clamped to zero.
churn$days_since_last_login[churn$days_since_last_login < 0] <- 0
## Variables imputed with the median or the mode
# Most frequent value of a vector.
#
# x: an atomic vector.
# Returns a length-one vector of the same type as `x` holding the value that
# occurs most often; when several values tie, the one that appears first in
# `x` wins. NA is counted like any other value.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of each element within the distinct values, then a count per
  # position.
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Most frequent *known* gender. The mode must be taken over the gender values
# themselves; taking it over the comparison `gender == "Unknown"` yields the
# logical FALSE, which would then be written into the column as "FALSE".
modus_gender <- Mode(
  churn$gender[!is.na(churn$gender) & churn$gender != "Unknown"]
)
# Most frequent non-missing offer type.
modus_pot <- Mode(churn$preferred_offer_types[!is.na(churn$preferred_offer_types)])
# Replace the placeholder / missing entries with the respective mode.
churn$gender[churn$gender %in% "Unknown"] <- modus_gender
churn$preferred_offer_types[is.na(churn$preferred_offer_types)] <- modus_pot
# Negative durations / frequencies are invalid: mark them as missing first so
# they do not influence the medians computed below.
churn$avg_time_spent <- replace(
  churn$avg_time_spent, which(churn$avg_time_spent < 0), NA
)
churn$avg_frequency_login_days <- replace(
  churn$avg_frequency_login_days, which(churn$avg_frequency_login_days < 0), NA
)

# Median of the remaining valid values, then fill every missing entry with it.
median_ats <- median(churn$avg_time_spent, na.rm = TRUE)
churn$avg_time_spent[is.na(churn$avg_time_spent)] <- median_ats

median_afl <- median(churn$avg_frequency_login_days, na.rm = TRUE)
churn$avg_frequency_login_days[is.na(churn$avg_frequency_login_days)] <- median_afl

median_piw <- median(churn$points_in_wallet, na.rm = TRUE)
churn$points_in_wallet[is.na(churn$points_in_wallet)] <- median_piw
# Redraw the overview plot now that the imputation steps have run.
plot_intro(churn)

plot_boxplot(churn,by='churn_risk_score') # not sure whether this is needed again; it was already plotted above

# Drop the columns considered unimportant for the analysis (identifiers and
# the raw date/time fields).
churn <- churn[, -match(
  c("security_no", "referral_id", "last_visit_time", "joining_date"),
  names(churn)
)]
library(dplyr)
# Convert every categorical column, including the target churn_risk_score,
# to a factor.
churn <- mutate(
  churn,
  across(
    all_of(c(
      "gender", "region_category", "membership_category",
      "joined_through_referral", "preferred_offer_types",
      "medium_of_operation", "internet_option", "used_special_discount",
      "offer_application_preference", "past_complaint", "complaint_status",
      "feedback", "churn_risk_score"
    )),
    as.factor
  )
)
# Confirm the resulting column set and types.
str(churn)
## tibble [36,992 × 19] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:36992] 18 32 44 37 31 13 21 42 44 45 ...
## $ gender : Factor w/ 3 levels "F","FALSE","M": 1 1 1 3 1 3 3 3 3 1 ...
## $ region_category : Factor w/ 4 levels "City","Lainnya",..: 4 1 3 1 1 1 3 2 4 3 ...
## $ membership_category : Factor w/ 6 levels "Basic Membership",..: 4 5 3 3 3 2 2 3 6 3 ...
## $ joined_through_referral : Factor w/ 3 levels "Lainnya","No",..: 2 1 3 3 2 2 3 1 2 2 ...
## $ preferred_offer_types : Factor w/ 3 levels "Credit/Debit Card Offers",..: 2 2 2 2 1 2 2 1 3 2 ...
## $ medium_of_operation : Factor w/ 4 levels "Both","Desktop",..: 3 2 2 2 4 3 2 1 4 3 ...
## $ internet_option : Factor w/ 3 levels "Fiber_Optic",..: 3 2 3 2 2 3 2 1 1 3 ...
## $ days_since_last_login : num [1:36992] 17 16 14 11 20 23 10 19 15 10 ...
## $ avg_time_spent : num [1:36992] 300.6 306.3 516.2 53.3 113.1 ...
## $ avg_transaction_value : num [1:36992] 53005 12838 21027 25240 24484 ...
## $ avg_frequency_login_days : num [1:36992] 17 10 22 6 16 24 28 24 20 28 ...
## $ points_in_wallet : num [1:36992] 782 698 501 568 663 ...
## $ used_special_discount : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
## $ offer_application_preference: Factor w/ 2 levels "No","Yes": 2 1 2 2 2 1 1 2 1 2 ...
## $ past_complaint : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ complaint_status : Factor w/ 5 levels "No Information Available",..: 2 3 4 5 3 5 4 5 4 1 ...
## $ feedback : Factor w/ 9 levels "No reason specified",..: 5 6 4 4 4 1 1 3 2 2 ...
## $ churn_risk_score : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 1 2 ...