library(readr)
# Load the raw churn data set into a tibble.
# NOTE(review): this absolute Windows path only resolves on the original
# author's machine; consider a project-relative path.
churn <- read_csv("C:/Users/User/Downloads/churn.csv")
## Rows: 36992 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (16): gender, security_no, region_category, membership_category, joinin...
## dbl   (6): age, days_since_last_login, avg_time_spent, avg_transaction_value...
## time  (1): last_visit_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the parsed column types and the first values of every column.
str(churn)
## spc_tbl_ [36,992 × 23] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age                         : num [1:36992] 18 32 44 37 31 13 21 42 44 45 ...
##  $ gender                      : chr [1:36992] "F" "F" "F" "M" ...
##  $ security_no                 : chr [1:36992] "XW0DQ7H" "5K0N3X1" "1F2TCL3" "VJGJ33N" ...
##  $ region_category             : chr [1:36992] "Village" "City" "Town" "City" ...
##  $ membership_category         : chr [1:36992] "Platinum Membership" "Premium Membership" "No Membership" "No Membership" ...
##  $ joining_date                : chr [1:36992] "17-08-2017" "28-08-2017" "11-11-2016" "29-10-2016" ...
##  $ joined_through_referral     : chr [1:36992] "No" "?" "Yes" "Yes" ...
##  $ referral_id                 : chr [1:36992] "xxxxxxxx" "CID21329" "CID12313" "CID3793" ...
##  $ preferred_offer_types       : chr [1:36992] "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" "Gift Vouchers/Coupons" ...
##  $ medium_of_operation         : chr [1:36992] "?" "Desktop" "Desktop" "Desktop" ...
##  $ internet_option             : chr [1:36992] "Wi-Fi" "Mobile_Data" "Wi-Fi" "Mobile_Data" ...
##  $ last_visit_time             : 'hms' num [1:36992] 16:08:02 12:38:13 22:53:21 15:57:50 ...
##   ..- attr(*, "units")= chr "secs"
##  $ days_since_last_login       : num [1:36992] 17 16 14 11 20 23 10 19 15 10 ...
##  $ avg_time_spent              : num [1:36992] 300.6 306.3 516.2 53.3 113.1 ...
##  $ avg_transaction_value       : num [1:36992] 53005 12838 21027 25240 24484 ...
##  $ avg_frequency_login_days    : chr [1:36992] "17" "10" "22" "6" ...
##  $ points_in_wallet            : num [1:36992] 782 NA 501 568 663 ...
##  $ used_special_discount       : chr [1:36992] "Yes" "Yes" "No" "No" ...
##  $ offer_application_preference: chr [1:36992] "Yes" "No" "Yes" "Yes" ...
##  $ past_complaint              : chr [1:36992] "No" "Yes" "Yes" "Yes" ...
##  $ complaint_status            : chr [1:36992] "Not Applicable" "Solved" "Solved in Follow-up" "Unsolved" ...
##  $ feedback                    : chr [1:36992] "Products always in Stock" "Quality Customer Care" "Poor Website" "Poor Website" ...
##  $ churn_risk_score            : num [1:36992] 0 0 1 1 1 0 0 1 0 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   gender = col_character(),
##   ..   security_no = col_character(),
##   ..   region_category = col_character(),
##   ..   membership_category = col_character(),
##   ..   joining_date = col_character(),
##   ..   joined_through_referral = col_character(),
##   ..   referral_id = col_character(),
##   ..   preferred_offer_types = col_character(),
##   ..   medium_of_operation = col_character(),
##   ..   internet_option = col_character(),
##   ..   last_visit_time = col_time(format = ""),
##   ..   days_since_last_login = col_double(),
##   ..   avg_time_spent = col_double(),
##   ..   avg_transaction_value = col_double(),
##   ..   avg_frequency_login_days = col_character(),
##   ..   points_in_wallet = col_double(),
##   ..   used_special_discount = col_character(),
##   ..   offer_application_preference = col_character(),
##   ..   past_complaint = col_character(),
##   ..   complaint_status = col_character(),
##   ..   feedback = col_character(),
##   ..   churn_risk_score = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
### Exploratory Data Analysis
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
# DataExplorer overview plot of the raw data, before any cleaning.
plot_intro(churn)

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Share of each churn_risk_score class, in percent, for the pie chart below.
# Rows are sorted by descending score so that the cumulative sums used for
# lab.ypos follow the order in which ggplot2 stacks a discrete fill.
plotdata <- churn %>%
  count(churn_risk_score) %>%
  arrange(desc(churn_risk_score)) %>%
  mutate(
    prop = round(n * 100 / sum(n), 1),
    lab.ypos = cumsum(prop) - 0.5 * prop # midpoint of each slice
  )

# churn_risk_score is parsed as a number (0/1). Map it as a factor so the
# slices get a discrete fill scale and legend instead of a continuous colour
# gradient; the legend title is kept as the plain column name.
ggplot(plotdata, aes(x = "", y = prop, fill = factor(churn_risk_score))) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(y = lab.ypos, label = prop), color = "black") +
  theme_void() +
  labs(title = "Persentase churn_risk_score", fill = "churn_risk_score")

# Box plots of every continuous column, split by the churn class.
plot_boxplot(churn, by = "churn_risk_score")
## Warning: Removed 3443 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Count the missing values in every column.
## Plan: region_category has > 5000 NAs, so recode them as their own
## "unknown/other" category; preferred_offer_types -> mode;
## points_in_wallet -> median.
colSums(is.na(churn))
##                          age                       gender 
##                            0                            0 
##                  security_no              region_category 
##                            0                         5428 
##          membership_category                 joining_date 
##                            0                            0 
##      joined_through_referral                  referral_id 
##                            0                            0 
##        preferred_offer_types          medium_of_operation 
##                          288                            0 
##              internet_option              last_visit_time 
##                            0                            0 
##        days_since_last_login               avg_time_spent 
##                            0                            0 
##        avg_transaction_value     avg_frequency_login_days 
##                            0                            0 
##             points_in_wallet        used_special_discount 
##                         3443                            0 
## offer_application_preference               past_complaint 
##                            0                            0 
##             complaint_status                     feedback 
##                            0                            0 
##             churn_risk_score 
##                            0
# Count placeholder / invalid values per column to decide how to treat them.
sum(churn$gender == "Unknown") ## only a few rows, so impute with the mode
## [1] 59
sum(churn$joined_through_referral == "?") ## many rows (> 5000), so keep them as a separate "unknown" category
## [1] 5438
sum(churn$medium_of_operation == "?") ## same, > 5000
## [1] 5393
sum(churn$days_since_last_login < 0, na.rm = TRUE) # this is about logins, so negative values are set to zero
## [1] 1999
sum(churn$avg_time_spent < 0, na.rm = TRUE) ## -> median
## [1] 1719
# NOTE(review): avg_frequency_login_days is still a character column here, so
# `< 0` is a string comparison, not a numeric one; confirm the count.
sum(churn$avg_frequency_login_days < 0, na.rm = TRUE) ## -> median
## [1] 683
sum(churn$avg_frequency_login_days == 'Error') ## -> NA -> median
## [1] 3522
### Preprocessing Data
## Variables whose placeholder / invalid values only need to be recoded.
# Missing regions become their own "Lainnya" (other) category.
churn$region_category[is.na(churn$region_category)] <- "Lainnya"

# "Error" marks an unreadable login frequency. Turn it into NA (not 0) so the
# median imputation further down treats it as missing; a literal 0 would stay
# in the data as a fake value and drag the median down.
churn$avg_frequency_login_days[churn$avg_frequency_login_days %in% "Error"] <- NA
# The column was read as character because of the "Error" entries.
churn$avg_frequency_login_days <- as.numeric(churn$avg_frequency_login_days)

# "?" placeholders are frequent (> 5000 rows each), so keep them as a category.
churn$joined_through_referral[churn$joined_through_referral %in% "?"] <- "Lainnya"
churn$medium_of_operation[churn$medium_of_operation %in% "?"] <- "Lainnya"

# A negative number of days since the last login is invalid; clamp it to 0.
churn$days_since_last_login[which(churn$days_since_last_login < 0)] <- 0

## Variables imputed with the median / mode
#' Most frequent value of a vector.
#'
#' @param x A vector.
#' @return The value of `x` that occurs most often, with the same type as `x`.
#'   On ties, the value that appears first in `x` is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of every element of x within the distinct values ...
  positions <- match(x, distinct_values)
  # ... tallied to get one count per distinct value, in first-seen order.
  frequencies <- tabulate(positions)
  winner <- which.max(frequencies)
  distinct_values[winner]
}
# Most frequent gender among the known values. The mode must be taken over the
# gender labels themselves, not over the logical test `gender == "Unknown"`:
# that test's mode is FALSE, which would be written into the column as the
# string "FALSE".
known_gender <- churn$gender[!is.na(churn$gender) & churn$gender != "Unknown"]
modus_gender <- Mode(known_gender)
# Most frequent offer type among the non-missing values.
modus_pot <- Mode(churn$preferred_offer_types[!is.na(churn$preferred_offer_types)])

# Replace the "Unknown" genders and the missing offer types with their mode.
churn$gender[churn$gender %in% "Unknown"] <- modus_gender
churn$preferred_offer_types[is.na(churn$preferred_offer_types)] <- modus_pot

# Negative durations / frequencies are invalid: mark them as missing so they
# are imputed together with the values that are already NA.
churn$avg_time_spent <- replace(
  churn$avg_time_spent,
  which(churn$avg_time_spent < 0),
  NA
)
churn$avg_frequency_login_days <- replace(
  churn$avg_frequency_login_days,
  which(churn$avg_frequency_login_days < 0),
  NA
)

# Medians of the remaining valid values.
median_ats <- median(churn$avg_time_spent, na.rm = TRUE)
median_afl <- median(churn$avg_frequency_login_days, na.rm = TRUE)
median_piw <- median(churn$points_in_wallet, na.rm = TRUE)

# Fill every missing entry with the median of its column.
churn$avg_time_spent[is.na(churn$avg_time_spent)] <- median_ats
churn$avg_frequency_login_days[is.na(churn$avg_frequency_login_days)] <- median_afl
churn$points_in_wallet[is.na(churn$points_in_wallet)] <- median_piw

# Overview plot again, after the recoding and imputation steps.
plot_intro(churn)

# NOTE(review): same box plots as in the EDA section above; decide whether this
# repeat is still needed.
plot_boxplot(churn, by = "churn_risk_score")

library(dplyr)

# Drop the columns considered unimportant for modelling, then turn every
# categorical column (including the target) into a factor.
churn <- churn %>%
  select(-c(security_no, referral_id, last_visit_time, joining_date)) %>%
  mutate(
    across(
      all_of(c(
        "gender", "region_category", "membership_category",
        "joined_through_referral", "preferred_offer_types",
        "medium_of_operation", "internet_option", "used_special_discount",
        "offer_application_preference", "past_complaint", "complaint_status",
        "feedback", "churn_risk_score"
      )),
      as.factor
    )
  )
# Inspect the structure again to check the column types and factor levels.
str(churn)
## tibble [36,992 × 19] (S3: tbl_df/tbl/data.frame)
##  $ age                         : num [1:36992] 18 32 44 37 31 13 21 42 44 45 ...
##  $ gender                      : Factor w/ 3 levels "F","FALSE","M": 1 1 1 3 1 3 3 3 3 1 ...
##  $ region_category             : Factor w/ 4 levels "City","Lainnya",..: 4 1 3 1 1 1 3 2 4 3 ...
##  $ membership_category         : Factor w/ 6 levels "Basic Membership",..: 4 5 3 3 3 2 2 3 6 3 ...
##  $ joined_through_referral     : Factor w/ 3 levels "Lainnya","No",..: 2 1 3 3 2 2 3 1 2 2 ...
##  $ preferred_offer_types       : Factor w/ 3 levels "Credit/Debit Card Offers",..: 2 2 2 2 1 2 2 1 3 2 ...
##  $ medium_of_operation         : Factor w/ 4 levels "Both","Desktop",..: 3 2 2 2 4 3 2 1 4 3 ...
##  $ internet_option             : Factor w/ 3 levels "Fiber_Optic",..: 3 2 3 2 2 3 2 1 1 3 ...
##  $ days_since_last_login       : num [1:36992] 17 16 14 11 20 23 10 19 15 10 ...
##  $ avg_time_spent              : num [1:36992] 300.6 306.3 516.2 53.3 113.1 ...
##  $ avg_transaction_value       : num [1:36992] 53005 12838 21027 25240 24484 ...
##  $ avg_frequency_login_days    : num [1:36992] 17 10 22 6 16 24 28 24 20 28 ...
##  $ points_in_wallet            : num [1:36992] 782 698 501 568 663 ...
##  $ used_special_discount       : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 1 2 1 ...
##  $ offer_application_preference: Factor w/ 2 levels "No","Yes": 2 1 2 2 2 1 1 2 1 2 ...
##  $ past_complaint              : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ complaint_status            : Factor w/ 5 levels "No Information Available",..: 2 3 4 5 3 5 4 5 4 1 ...
##  $ feedback                    : Factor w/ 9 levels "No reason specified",..: 5 6 4 4 4 1 1 3 2 2 ...
##  $ churn_risk_score            : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 1 2 ...