library(tidyverse)
library(xlsx)
library(ggmap)
getwd()
## [1] "C:/Users/SK/Desktop/Customer Analytics Project/OneDrive_1_3-2-2020"
setwd("C:/Users/SK/Desktop/Customer Analytics Project/OneDrive_1_3-2-2020")
telco_raw <- read.xlsx("Telco_customer_churn.xlsx", sheetName = "Telco_Churn")
str(telco_raw)
## 'data.frame': 7043 obs. of 33 variables:
## $ CustomerID : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 2565 6512 6552 5605 175 2938 6208 724 4585 6120 ...
## $ Count : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Country : Factor w/ 1 level "United States": 1 1 1 1 1 1 1 1 1 1 ...
## $ State : Factor w/ 1 level "California": 1 1 1 1 1 1 1 1 1 1 ...
## $ City : Factor w/ 1129 levels "Acampo","Acton",..: 563 563 563 563 563 563 563 563 563 563 ...
## $ Zip.Code : num 90003 90005 90006 90010 90015 ...
## $ Lat.Long : Factor w/ 1652 levels "32.555828, -117.040073",..: 328 406 394 411 386 417 368 416 448 441 ...
## $ Latitude : num 34 34.1 34 34.1 34 ...
## $ Longitude : num -118 -118 -118 -118 -118 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 1 1 2 1 2 2 2 2 ...
## $ Senior.Citizen : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 2 2 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 1 2 1 ...
## $ Tenure.Months : num 2 2 8 28 49 10 1 1 47 1 ...
## $ Phone.Service : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 1 ...
## $ Multiple.Lines : Factor w/ 3 levels "No","No phone service",..: 1 1 3 3 3 1 2 1 3 2 ...
## $ Internet.Service : Factor w/ 3 levels "DSL","Fiber optic",..: 1 2 2 2 2 1 1 3 2 1 ...
## $ Online.Security : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 1 1 1 2 1 1 ...
## $ Online.Backup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 1 1 2 3 3 ...
## $ Device.Protection: Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 3 3 2 1 1 ...
## $ Tech.Support : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 3 1 2 1 1 ...
## $ Streaming.TV : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 1 1 2 3 1 ...
## $ Streaming.Movies : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 1 3 2 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Paperless.Billing: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 1 2 1 ...
## $ Payment.Method : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 3 3 3 1 2 3 4 3 3 ...
## $ Monthly.Charges : num 53.9 70.7 99.7 104.8 103.7 ...
## $ Total.Charges : Factor w/ 6531 levels " ","100.2","100.25",..: 158 926 6105 2647 4266 4417 3341 1610 4020 2593 ...
## $ Churn.Label : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Churn.Value : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Churn.Score : num 86 67 86 84 89 78 100 92 77 97 ...
## $ CLTV : num 3239 2701 5372 5003 5340 ...
## $ Churn.Reason : Factor w/ 20 levels "Attitude of service provider",..: 4 14 14 14 3 5 6 4 3 3 ...
summary(telco_raw)
## CustomerID Count Country State
## 0002-ORFBO: 1 Min. :1 United States:7043 California:7043
## 0003-MKNFE: 1 1st Qu.:1
## 0004-TLHLJ: 1 Median :1
## 0011-IGKFF: 1 Mean :1
## 0013-EXCHZ: 1 3rd Qu.:1
## 0013-MHZWF: 1 Max. :1
## (Other) :7037
## City Zip.Code Lat.Long
## Los Angeles : 305 Min. :90001 32.555828, -117.040073: 5
## San Diego : 150 1st Qu.:92102 32.578103, -117.012975: 5
## San Jose : 112 Median :93552 32.579134, -117.119009: 5
## Sacramento : 108 Mean :93522 32.587557, -116.636816: 5
## San Francisco: 104 3rd Qu.:95351 32.605012, -116.97595 : 5
## Fresno : 64 Max. :96161 32.607964, -117.059459: 5
## (Other) :6200 (Other) :7013
## Latitude Longitude Gender Senior.Citizen Partner
## Min. :32.56 Min. :-124.3 Female:3488 No :5901 No :3641
## 1st Qu.:34.03 1st Qu.:-121.8 Male :3555 Yes:1142 Yes:3402
## Median :36.39 Median :-119.7
## Mean :36.28 Mean :-119.8
## 3rd Qu.:38.22 3rd Qu.:-118.0
## Max. :41.96 Max. :-114.2
##
## Dependents Tenure.Months Phone.Service Multiple.Lines
## No :5416 Min. : 0.00 No : 682 No :3390
## Yes:1627 1st Qu.: 9.00 Yes:6361 No phone service: 682
## Median :29.00 Yes :2971
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## Internet.Service Online.Security Online.Backup
## DSL :2421 No :3498 No :3088
## Fiber optic:3096 No internet service:1526 No internet service:1526
## No :1526 Yes :2019 Yes :2429
##
##
##
##
## Device.Protection Tech.Support
## No :3095 No :3473
## No internet service:1526 No internet service:1526
## Yes :2422 Yes :2044
##
##
##
##
## Streaming.TV Streaming.Movies
## No :2810 No :2785
## No internet service:1526 No internet service:1526
## Yes :2707 Yes :2732
##
##
##
##
## Contract Paperless.Billing Payment.Method
## Month-to-month:3875 No :2872 Bank transfer (automatic):1544
## One year :1473 Yes:4171 Credit card (automatic) :1522
## Two year :1695 Electronic check :2365
## Mailed check :1612
##
##
##
## Monthly.Charges Total.Charges Churn.Label Churn.Value
## Min. : 18.25 : 11 No :5174 Min. :0.0000
## 1st Qu.: 35.50 20.2 : 11 Yes:1869 1st Qu.:0.0000
## Median : 70.35 19.75 : 9 Median :0.0000
## Mean : 64.76 19.65 : 8 Mean :0.2654
## 3rd Qu.: 89.85 19.9 : 8 3rd Qu.:1.0000
## Max. :118.75 20.05 : 8 Max. :1.0000
## (Other):6988
## Churn.Score CLTV
## Min. : 5.0 Min. :2003
## 1st Qu.: 40.0 1st Qu.:3469
## Median : 61.0 Median :4527
## Mean : 58.7 Mean :4400
## 3rd Qu.: 75.0 3rd Qu.:5380
## Max. :100.0 Max. :6500
##
## Churn.Reason
## Attitude of support person : 192
## Competitor offered higher download speeds: 189
## Competitor offered more data : 162
## Don't know : 154
## Competitor made better offer : 140
## (Other) :1032
## NA's :5174
sapply(telco_raw, class)
## CustomerID Count Country State
## "factor" "numeric" "factor" "factor"
## City Zip.Code Lat.Long Latitude
## "factor" "numeric" "factor" "numeric"
## Longitude Gender Senior.Citizen Partner
## "numeric" "factor" "factor" "factor"
## Dependents Tenure.Months Phone.Service Multiple.Lines
## "factor" "numeric" "factor" "factor"
## Internet.Service Online.Security Online.Backup Device.Protection
## "factor" "factor" "factor" "factor"
## Tech.Support Streaming.TV Streaming.Movies Contract
## "factor" "factor" "factor" "factor"
## Paperless.Billing Payment.Method Monthly.Charges Total.Charges
## "factor" "factor" "numeric" "factor"
## Churn.Label Churn.Value Churn.Score CLTV
## "factor" "numeric" "numeric" "numeric"
## Churn.Reason
## "factor"
table(telco_raw$Churn.Label)
##
## No Yes
## 5174 1869
(1869/5174)*100
## [1] 36.12292
telco_raw_churn <- telco_raw %>% filter(Churn.Label == "Yes")
register_google(key= "AIzaSyBmXB5S5_NIqo6lAGH-_U-TbhrQjhOsplU")
summary(telco_raw_churn$Latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.56 34.04 36.30 36.27 38.20 41.96
summary(telco_raw_churn$Longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -124.3 -121.8 -119.7 -119.8 -118.0 -114.2
house_loc = c(-125, 33, -120, 42)
our_map=get_map(location = house_loc,maptype="roadmap", source="google")
ggmap(our_map) + geom_point(aes(telco_raw_churn$Longitude, telco_raw_churn$Latitude), data = telco_raw_churn)
telco_raw_churn %>%
group_by(City) %>%
summarise(total = n()) %>%
top_n(10, total) %>%
ggplot(aes(x = reorder(City, -desc(total)), y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by City") + xlab("City") + ylab("Total")
table(telco_raw_churn$Gender)
##
## Female Male
## 939 930
telco_raw_churn %>%
group_by(Senior.Citizen) %>%
summarise(total = n()) %>%
ggplot(aes(x = Senior.Citizen, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Senior Citizen") + xlab("Senior Citizen") + ylab("Total")
telco_raw_churn %>%
group_by(Partner) %>%
summarise(total = n()) %>%
ggplot(aes(x = Partner, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Partner") + xlab("Partner") + ylab("Total")
telco_raw_churn %>%
group_by(Dependents) %>%
summarise(total = n()) %>%
ggplot(aes(x = Dependents, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Dependents") + xlab("Dependents") + ylab("Total")
summary(telco_raw_churn$Tenure.Months)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 10.00 17.98 29.00 72.00
class(telco_raw_churn$Tenure.Months)
## [1] "numeric"
telco_raw_churn %>%
ggplot(aes(Tenure.Months)) +
geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Tenure(Months)") +
xlab("Tenure Months")
## Check phone service contribute to churn customer
table(telco_raw_churn$Phone.Service)
##
## No Yes
## 170 1699
table(telco_raw_churn$Multiple.Lines)
##
## No No phone service Yes
## 849 170 850
telco_raw_churn %>%
group_by(Internet.Service) %>%
summarise(total = n()) %>%
ggplot(aes(x = Internet.Service, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Internet Service") + xlab("Internet Service") + ylab("Total")
telco_raw_churn %>%
group_by(Online.Security) %>%
summarise(total = n()) %>%
ggplot(aes(x = Online.Security, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Online Security") + xlab("Online Security") + ylab("Total")
## Check the online backup contribute to churn customer
telco_raw_churn %>%
group_by(Online.Backup) %>%
summarise(total = n()) %>%
ggplot(aes(x = Online.Backup, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Online Backup") + xlab("Online Backup") + ylab("Total")
## Check Device Protection contribute to churn customer
telco_raw_churn %>%
group_by(Device.Protection) %>%
summarise(total = n()) %>%
ggplot(aes(x = Device.Protection, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Device Protection") + xlab("Device Protection") + ylab("Total")
## Check the tech support contribute to churn customer
telco_raw_churn %>%
group_by(Tech.Support) %>%
summarise(total = n()) %>%
ggplot(aes(x = Tech.Support, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Tech Support") + xlab("Tech Support") + ylab("Total")
## Check the streaming TV contribute to churn customer
telco_raw_churn %>%
group_by(Streaming.TV) %>%
summarise(total = n()) %>%
ggplot(aes(x = Streaming.TV, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Streaming TV") + xlab("Streaming TV") + ylab("Total")
## Check the streaming movies contribute to churn customer
telco_raw_churn %>%
group_by(Streaming.Movies) %>%
summarise(total = n()) %>%
ggplot(aes(x = Streaming.Movies, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Streaming Movies") + xlab("Streaming Movies") + ylab("Total")
## Check type of contract contribute to churn customer
telco_raw_churn %>%
group_by(Contract) %>%
summarise(total = n()) %>%
ggplot(aes(x = Contract, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Contract Type") + xlab("Contract Type") + ylab("Total")
## Check the paperless billing contribute to churn customer
telco_raw_churn %>%
group_by(Paperless.Billing) %>%
summarise(total = n()) %>%
ggplot(aes(x = Paperless.Billing, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Paperless Billing") + xlab("Paperless Billing") + ylab("Total")
## Check Payment method contribute churn customer
telco_raw_churn %>%
group_by(Payment.Method) %>%
summarise(total = n()) %>%
ggplot(aes(x = Payment.Method, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Payment Method") + xlab("Payment Method") + ylab("Total")
## Check the monthly charge distribution of churn customer
telco_raw_churn %>%
ggplot(aes(Monthly.Charges)) +
geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Monthly Charges") +
xlab("Monthly Charges")
## Check the total charge distribution of churn customer
telco_raw_churn %>%
ggplot(aes(as.numeric(Total.Charges))) +
geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Total Charge") +
xlab("Total Charge")
## Check the CLTV distribution of churn customer
telco_raw_churn %>%
ggplot(aes(CLTV)) +
geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Customer Lifetime Value") +
xlab("Customer Lifetime Value")
## Check the Reason of churn customer
telco_raw_churn %>%
group_by(Churn.Reason) %>%
filter(Churn.Reason != "Don't know") %>%
summarise(total = n()) %>%
ggplot(aes(x = reorder(Churn.Reason, total), y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
coord_flip() + ggtitle("Total Churn by Churn Reason") + xlab("Churn Reason") + ylab("Total")