library(tidyverse)
library(xlsx)
library(ggmap)
getwd()
## [1] "C:/Users/SK/Desktop/Customer Analytics Project/OneDrive_1_3-2-2020"
setwd("C:/Users/SK/Desktop/Customer Analytics Project/OneDrive_1_3-2-2020")

Read the xlsx file

telco_raw <- read.xlsx("Telco_customer_churn.xlsx", sheetName  = "Telco_Churn")

Exploratory Data Analysis

Check the variables

str(telco_raw)
## 'data.frame':    7043 obs. of  33 variables:
##  $ CustomerID       : Factor w/ 7043 levels "0002-ORFBO","0003-MKNFE",..: 2565 6512 6552 5605 175 2938 6208 724 4585 6120 ...
##  $ Count            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Country          : Factor w/ 1 level "United States": 1 1 1 1 1 1 1 1 1 1 ...
##  $ State            : Factor w/ 1 level "California": 1 1 1 1 1 1 1 1 1 1 ...
##  $ City             : Factor w/ 1129 levels "Acampo","Acton",..: 563 563 563 563 563 563 563 563 563 563 ...
##  $ Zip.Code         : num  90003 90005 90006 90010 90015 ...
##  $ Lat.Long         : Factor w/ 1652 levels "32.555828, -117.040073",..: 328 406 394 411 386 417 368 416 448 441 ...
##  $ Latitude         : num  34 34.1 34 34.1 34 ...
##  $ Longitude        : num  -118 -118 -118 -118 -118 ...
##  $ Gender           : Factor w/ 2 levels "Female","Male": 2 1 1 1 2 1 2 2 2 2 ...
##  $ Senior.Citizen   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ Partner          : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 1 1 2 2 ...
##  $ Dependents       : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 1 2 1 ...
##  $ Tenure.Months    : num  2 2 8 28 49 10 1 1 47 1 ...
##  $ Phone.Service    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 1 ...
##  $ Multiple.Lines   : Factor w/ 3 levels "No","No phone service",..: 1 1 3 3 3 1 2 1 3 2 ...
##  $ Internet.Service : Factor w/ 3 levels "DSL","Fiber optic",..: 1 2 2 2 2 1 1 3 2 1 ...
##  $ Online.Security  : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 1 1 1 2 1 1 ...
##  $ Online.Backup    : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 1 1 2 3 3 ...
##  $ Device.Protection: Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 3 3 2 1 1 ...
##  $ Tech.Support     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 3 1 2 1 1 ...
##  $ Streaming.TV     : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 1 1 2 3 1 ...
##  $ Streaming.Movies : Factor w/ 3 levels "No","No internet service",..: 1 1 3 3 3 1 3 2 3 1 ...
##  $ Contract         : Factor w/ 3 levels "Month-to-month",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Paperless.Billing: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 1 2 1 ...
##  $ Payment.Method   : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 3 3 3 1 2 3 4 3 3 ...
##  $ Monthly.Charges  : num  53.9 70.7 99.7 104.8 103.7 ...
##  $ Total.Charges    : Factor w/ 6531 levels " ","100.2","100.25",..: 158 926 6105 2647 4266 4417 3341 1610 4020 2593 ...
##  $ Churn.Label      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Churn.Value      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Churn.Score      : num  86 67 86 84 89 78 100 92 77 97 ...
##  $ CLTV             : num  3239 2701 5372 5003 5340 ...
##  $ Churn.Reason     : Factor w/ 20 levels "Attitude of service provider",..: 4 14 14 14 3 5 6 4 3 3 ...
summary(telco_raw)
##       CustomerID       Count            Country            State     
##  0002-ORFBO:   1   Min.   :1   United States:7043   California:7043  
##  0003-MKNFE:   1   1st Qu.:1                                         
##  0004-TLHLJ:   1   Median :1                                         
##  0011-IGKFF:   1   Mean   :1                                         
##  0013-EXCHZ:   1   3rd Qu.:1                                         
##  0013-MHZWF:   1   Max.   :1                                         
##  (Other)   :7037                                                     
##             City         Zip.Code                       Lat.Long   
##  Los Angeles  : 305   Min.   :90001   32.555828, -117.040073:   5  
##  San Diego    : 150   1st Qu.:92102   32.578103, -117.012975:   5  
##  San Jose     : 112   Median :93552   32.579134, -117.119009:   5  
##  Sacramento   : 108   Mean   :93522   32.587557, -116.636816:   5  
##  San Francisco: 104   3rd Qu.:95351   32.605012, -116.97595 :   5  
##  Fresno       :  64   Max.   :96161   32.607964, -117.059459:   5  
##  (Other)      :6200                   (Other)               :7013  
##     Latitude       Longitude         Gender     Senior.Citizen Partner   
##  Min.   :32.56   Min.   :-124.3   Female:3488   No :5901       No :3641  
##  1st Qu.:34.03   1st Qu.:-121.8   Male  :3555   Yes:1142       Yes:3402  
##  Median :36.39   Median :-119.7                                          
##  Mean   :36.28   Mean   :-119.8                                          
##  3rd Qu.:38.22   3rd Qu.:-118.0                                          
##  Max.   :41.96   Max.   :-114.2                                          
##                                                                          
##  Dependents Tenure.Months   Phone.Service          Multiple.Lines
##  No :5416   Min.   : 0.00   No : 682      No              :3390  
##  Yes:1627   1st Qu.: 9.00   Yes:6361      No phone service: 682  
##             Median :29.00                 Yes             :2971  
##             Mean   :32.37                                        
##             3rd Qu.:55.00                                        
##             Max.   :72.00                                        
##                                                                  
##     Internet.Service            Online.Security             Online.Backup 
##  DSL        :2421    No                 :3498   No                 :3088  
##  Fiber optic:3096    No internet service:1526   No internet service:1526  
##  No         :1526    Yes                :2019   Yes                :2429  
##                                                                           
##                                                                           
##                                                                           
##                                                                           
##            Device.Protection              Tech.Support 
##  No                 :3095    No                 :3473  
##  No internet service:1526    No internet service:1526  
##  Yes                :2422    Yes                :2044  
##                                                        
##                                                        
##                                                        
##                                                        
##               Streaming.TV             Streaming.Movies
##  No                 :2810   No                 :2785   
##  No internet service:1526   No internet service:1526   
##  Yes                :2707   Yes                :2732   
##                                                        
##                                                        
##                                                        
##                                                        
##            Contract    Paperless.Billing                   Payment.Method
##  Month-to-month:3875   No :2872          Bank transfer (automatic):1544  
##  One year      :1473   Yes:4171          Credit card (automatic)  :1522  
##  Two year      :1695                     Electronic check         :2365  
##                                          Mailed check             :1612  
##                                                                          
##                                                                          
##                                                                          
##  Monthly.Charges  Total.Charges  Churn.Label  Churn.Value    
##  Min.   : 18.25          :  11   No :5174    Min.   :0.0000  
##  1st Qu.: 35.50   20.2   :  11   Yes:1869    1st Qu.:0.0000  
##  Median : 70.35   19.75  :   9               Median :0.0000  
##  Mean   : 64.76   19.65  :   8               Mean   :0.2654  
##  3rd Qu.: 89.85   19.9   :   8               3rd Qu.:1.0000  
##  Max.   :118.75   20.05  :   8               Max.   :1.0000  
##                   (Other):6988                               
##   Churn.Score         CLTV     
##  Min.   :  5.0   Min.   :2003  
##  1st Qu.: 40.0   1st Qu.:3469  
##  Median : 61.0   Median :4527  
##  Mean   : 58.7   Mean   :4400  
##  3rd Qu.: 75.0   3rd Qu.:5380  
##  Max.   :100.0   Max.   :6500  
##                                
##                                     Churn.Reason 
##  Attitude of support person               : 192  
##  Competitor offered higher download speeds: 189  
##  Competitor offered more data             : 162  
##  Don't know                               : 154  
##  Competitor made better offer             : 140  
##  (Other)                                  :1032  
##  NA's                                     :5174
sapply(telco_raw, class)
##        CustomerID             Count           Country             State 
##          "factor"         "numeric"          "factor"          "factor" 
##              City          Zip.Code          Lat.Long          Latitude 
##          "factor"         "numeric"          "factor"         "numeric" 
##         Longitude            Gender    Senior.Citizen           Partner 
##         "numeric"          "factor"          "factor"          "factor" 
##        Dependents     Tenure.Months     Phone.Service    Multiple.Lines 
##          "factor"         "numeric"          "factor"          "factor" 
##  Internet.Service   Online.Security     Online.Backup Device.Protection 
##          "factor"          "factor"          "factor"          "factor" 
##      Tech.Support      Streaming.TV  Streaming.Movies          Contract 
##          "factor"          "factor"          "factor"          "factor" 
## Paperless.Billing    Payment.Method   Monthly.Charges     Total.Charges 
##          "factor"          "factor"         "numeric"          "factor" 
##       Churn.Label       Churn.Value       Churn.Score              CLTV 
##          "factor"         "numeric"         "numeric"         "numeric" 
##      Churn.Reason 
##          "factor"

Check the churn ratio

table(telco_raw$Churn.Label)
## 
##   No  Yes 
## 5174 1869

Churn percentage ~ 36.12%

(1869/5174)*100
## [1] 36.12292

Understand the churn based on each variables

telco_raw_churn <- telco_raw %>% filter(Churn.Label == "Yes")

Check the location of churn

register_google(key= "AIzaSyBmXB5S5_NIqo6lAGH-_U-TbhrQjhOsplU")
summary(telco_raw_churn$Latitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.56   34.04   36.30   36.27   38.20   41.96
summary(telco_raw_churn$Longitude)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -124.3  -121.8  -119.7  -119.8  -118.0  -114.2
house_loc = c(-125, 33, -120, 42)
our_map=get_map(location = house_loc,maptype="roadmap", source="google")
ggmap(our_map) + geom_point(aes(telco_raw_churn$Longitude, telco_raw_churn$Latitude), data = telco_raw_churn)

Check Top_10 city contribute the highest churn

telco_raw_churn %>%
  group_by(City) %>%
  summarise(total = n()) %>%
  top_n(10, total) %>% 
  ggplot(aes(x = reorder(City, -desc(total)), y = total)) + geom_bar(stat = 'identity', fill = 'pink') + 
  coord_flip() + ggtitle("Total Churn by City") + xlab("City") + ylab("Total")

Check the gender contribute to churn

No much difference between male and female

table(telco_raw_churn$Gender)
## 
## Female   Male 
##    939    930

Check the senior citizen contribute to Churn

telco_raw_churn %>% 
  group_by(Senior.Citizen) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Senior.Citizen, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Senior Citizen") + xlab("Senior Citizen") + ylab("Total")

Check the partner contribute to Churn

telco_raw_churn %>% 
  group_by(Partner) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Partner, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Partner") + xlab("Partner") + ylab("Total")

Check the dependent contribute to Churn

telco_raw_churn %>% 
  group_by(Dependents) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Dependents, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Dependents") + xlab("Dependents") + ylab("Total")

Check the tenure month of the churn customer

summary(telco_raw_churn$Tenure.Months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00   10.00   17.98   29.00   72.00
class(telco_raw_churn$Tenure.Months)
## [1] "numeric"
telco_raw_churn %>%
  ggplot(aes(Tenure.Months)) +
  geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Tenure(Months)") +
  xlab("Tenure Months") 

## Check phone service contribute to churn customer

table(telco_raw_churn$Phone.Service)
## 
##   No  Yes 
##  170 1699

Check multtple line contribute to churn customer

Almost same between have and do not have phone service

table(telco_raw_churn$Multiple.Lines)
## 
##               No No phone service              Yes 
##              849              170              850

Check the internet service contribute to churn customer

telco_raw_churn %>% 
  group_by(Internet.Service) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Internet.Service, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Internet Service") + xlab("Internet Service") + ylab("Total")

Check Onlice security contribute to churn customer

telco_raw_churn %>% 
  group_by(Online.Security) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Online.Security, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Online Security") + xlab("Online Security") + ylab("Total")

## Check the online backup contribute to churn customer

telco_raw_churn %>% 
  group_by(Online.Backup) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Online.Backup, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Online Backup") + xlab("Online Backup") + ylab("Total")

## Check Device Protection contribute to churn customer

telco_raw_churn %>% 
  group_by(Device.Protection) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Device.Protection, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Device Protection") + xlab("Device Protection") + ylab("Total")

## Check the tech support contribute to churn customer

telco_raw_churn %>% 
  group_by(Tech.Support) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Tech.Support, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Tech Support") + xlab("Tech Support") + ylab("Total")

## Check the streaming TV contribute to churn customer

telco_raw_churn %>% 
  group_by(Streaming.TV) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Streaming.TV, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Streaming TV") + xlab("Streaming TV") + ylab("Total")

## Check the streaming movies contribute to churn customer

telco_raw_churn %>% 
  group_by(Streaming.Movies) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Streaming.Movies, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Streaming Movies") + xlab("Streaming Movies") + ylab("Total")

## Check type of contract contribute to churn customer

telco_raw_churn %>% 
  group_by(Contract) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Contract, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Contract Type") + xlab("Contract Type") + ylab("Total")

## Check the paperless billing contribute to churn customer

telco_raw_churn %>% 
  group_by(Paperless.Billing) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Paperless.Billing, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Paperless Billing") + xlab("Paperless Billing") + ylab("Total")

## Check Payment method contribute churn customer

telco_raw_churn %>% 
  group_by(Payment.Method) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = Payment.Method, y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Payment Method") + xlab("Payment Method") + ylab("Total")

## Check the monthly charge distribution of churn customer

telco_raw_churn %>%
  ggplot(aes(Monthly.Charges)) +
  geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Monthly Charges") +
  xlab("Monthly Charges") 

## Check the total charge distribution of churn customer

telco_raw_churn %>% 
  ggplot(aes(as.numeric(Total.Charges))) +
  geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Total Charge") +
  xlab("Total Charge") 

## Check the CLTV distribution of churn customer

telco_raw_churn %>% 
  ggplot(aes(CLTV)) +
  geom_histogram(bins = 20) + ggtitle("Distribution of Churn by Customer Lifetime Value") +
  xlab("Customer Lifetime Value") 

## Check the Reason of churn customer

telco_raw_churn %>% 
  group_by(Churn.Reason) %>%
  filter(Churn.Reason != "Don't know") %>%
  summarise(total = n()) %>%
  ggplot(aes(x = reorder(Churn.Reason, total), y = total)) + geom_bar(stat = 'identity', fill = 'pink') +
  coord_flip() + ggtitle("Total Churn by Churn Reason") + xlab("Churn Reason") + ylab("Total")