1 Importing Libraries and Dataset

1.1 Importing Libraries

library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(reshape2)

1.2 Setting the path

# Point the session at the project folder so relative file names resolve.
# Straight quotes are required: typographic quotes are not valid R string
# delimiters.
# NOTE(review): the directory string looks mangled (path separators appear to
# be missing) - confirm the intended folder before running.
setwd("C:_homeworks_2023")

1.3 Importing the dataset

1.3.1 First 5 rows

##   customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1 7590-VHVEG Female             0     Yes         No      1           No
## 2 5575-GNVDE   Male             0      No         No     34          Yes
## 3 3668-QPYBK   Male             0      No         No      2          Yes
## 4 7795-CFOCW   Male             0      No         No     45           No
## 5 9237-HQITU Female             0      No         No      2          Yes
## 6 9305-CDSKC Female             0      No         No      8          Yes
##      MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No phone service             DSL             No          Yes               No
## 2               No             DSL            Yes           No              Yes
## 3               No             DSL            Yes          Yes               No
## 4 No phone service             DSL            Yes           No              Yes
## 5               No     Fiber optic             No           No               No
## 6              Yes     Fiber optic             No           No              Yes
##   TechSupport StreamingTV StreamingMovies       Contract PaperlessBilling
## 1          No          No              No Month-to-month              Yes
## 2          No          No              No       One year               No
## 3          No          No              No Month-to-month              Yes
## 4         Yes          No              No       One year               No
## 5          No          No              No Month-to-month              Yes
## 6          No         Yes             Yes Month-to-month              Yes
##               PaymentMethod MonthlyCharges TotalCharges Churn
## 1          Electronic check          29.85        29.85    No
## 2              Mailed check          56.95      1889.50    No
## 3              Mailed check          53.85       108.15   Yes
## 4 Bank transfer (automatic)          42.30      1840.75    No
## 5          Electronic check          70.70       151.65   Yes
## 6          Electronic check          99.65       820.50   Yes
# Render the first five rows as a styled table with row highlighting on hover.
# head() is used instead of 1:5 indexing so a shorter data frame would not be
# padded with NA rows; FALSE is spelled out because F can be reassigned.
kbl(head(df, 5)) %>%
  kable_paper("hover", full_width = FALSE)
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 No
3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
7795-CFOCW Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
# Print the structure of the data frame: dimensions, column types and a
# preview of the first values in each column.
str(df)
## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...
# Per-column summary: quantiles and mean for numeric columns (plus the NA
# count where present), length/class/mode for character columns.
summary(df)
##   customerID           gender          SeniorCitizen      Partner         
##  Length:7043        Length:7043        Min.   :0.0000   Length:7043       
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :0.0000   Mode  :character  
##                                        Mean   :0.1621                     
##                                        3rd Qu.:0.0000                     
##                                        Max.   :1.0000                     
##                                                                           
##   Dependents            tenure      PhoneService       MultipleLines     
##  Length:7043        Min.   : 0.00   Length:7043        Length:7043       
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :29.00   Mode  :character   Mode  :character  
##                     Mean   :32.37                                        
##                     3rd Qu.:55.00                                        
##                     Max.   :72.00                                        
##                                                                          
##  InternetService    OnlineSecurity     OnlineBackup       DeviceProtection  
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  TechSupport        StreamingTV        StreamingMovies      Contract        
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PaperlessBilling   PaymentMethod      MonthlyCharges    TotalCharges   
##  Length:7043        Length:7043        Min.   : 18.25   Min.   :  18.8  
##  Class :character   Class :character   1st Qu.: 35.50   1st Qu.: 401.4  
##  Mode  :character   Mode  :character   Median : 70.35   Median :1397.5  
##                                        Mean   : 64.76   Mean   :2283.3  
##                                        3rd Qu.: 89.85   3rd Qu.:3794.7  
##                                        Max.   :118.75   Max.   :8684.8  
##                                                         NA's   :11      
##     Churn          
##  Length:7043       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

2 About Dataset

The dataset contains the data of the customers in telecommunication industry. Therefore the unit of observation is a customer. The dataset includes 21 variables and data of 7043 customers.

2.1 Description of variables

The variable description is as below.

# Load the data dictionary (variable name + description).
# The file has no header row: with the default header = TRUE the first entry
# (customerID / "ID of the Customer") is consumed as the column names and is
# missing from the table, so the header is disabled and names are supplied.
# NOTE(review): the "\x92" bytes in the descriptions look like Windows-1252
# apostrophes, hence fileEncoding = "CP1252" - confirm the file's encoding.
variables <- read.csv(
  "variables.csv",
  header = FALSE,
  col.names = c("Variable", "Description"),
  fileEncoding = "CP1252"
)
variables
##          customerID
## 1            gender
## 2     SeniorCitizen
## 3           Partner
## 4        Dependents
## 5            tenure
## 6      PhoneService
## 7     MultipleLines
## 8   InternetService
## 9    OnlineSecurity
## 10     OnlineBackup
## 11 DeviceProtection
## 12      TechSupport
## 13      StreamingTV
## 14  StreamingMovies
## 15         Contract
## 16 PaperlessBilling
## 17    PaymentMethod
## 18   MonthlyCharges
## 19     TotalCharges
## 20            Churn
##                                                                                                       ID.of.the.Customer
## 1                                                                             Whether the customer is a male or a female
## 2                                                                 Whether the customer is a senior citizen or not (1, 0)
## 3                                                                    Whether the customer has a partner or not (Yes, No)
## 4                                                                   Whether the customer has dependents or not (Yes, No)
## 5                                                              Number of months the customer has stayed with the company
## 6                                                              Whether the customer has a phone service or not (Yes, No)
## 7                                             Whether the customer has multiple lines or not (Yes, No, No phone service)
## 8                                                         Customer\x92s internet service provider (DSL, Fiber optic, No)
## 9                                         Whether the customer has online security or not (Yes, No, No internet service)
## 10                                          Whether the customer has online backup or not (Yes, No, No internet service)
## 11                                      Whether the customer has device protection or not (Yes, No, No internet service)
## 12                                           Whether the customer has tech support or not (Yes, No, No internet service)
## 13                                           Whether the customer has streaming TV or not (Yes, No, No internet service)
## 14                                       Whether the customer has streaming movies or not (Yes, No, No internet service)
## 15                                                The contract term of the customer (Month-to-month, One year, Two year)
## 16                                                           Whether the customer has paperless billing or not (Yes, No)
## 17 The customer\x92s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
## 18                                                                        The amount charged to the customer monthly ($)
## 19                                                                          The total amount charged to the customer ($)
## 20                                                                       Whether the customer churned or not (Yes or No)

3 Main Goal

The main goal of this study is to build a model for predicting the customers who are likely to churn from the network. The second goal of the study is to conduct a descriptive analysis to identify trends and factors associated with customer churn.

4 Data Manipulation

4.1 Dealing with Missing Values

# Count the missing values in every column.
# vapply() pins the result to one integer per column.
vapply(df, function(column) sum(is.na(column)), integer(1))
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0

Only the TotalCharges variable contains missing values (11 of them). Since the sample size is large enough, these 11 observations will be removed from the dataset.

# Drop every row that contains at least one missing value, then re-inspect
# the structure to confirm the new row count.
df <- na.omit(df)
str(df)
## 'data.frame':    7032 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...
##  - attr(*, "na.action")= 'omit' Named int [1:11] 489 754 937 1083 1341 3332 3827 4381 5219 6671 ...
##   ..- attr(*, "names")= chr [1:11] "489" "754" "937" "1083" ...
# Re-count missing values per column after the removal step.
vapply(df, function(column) sum(is.na(column)), integer(1))
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0                0 
##            Churn 
##                0

4.2 Dealing with Categorical Variables (Feature Encoding)

Renaming the Senior Citizen variable

# Turn the 0/1 indicator into a factor labelled "No"/"Yes".
# Assigning levels() to the integer column before as.factor() does not relabel
# anything (the factor still ends up with levels "0" and "1"), so the labels
# are supplied to factor() directly.
df$SeniorCitizen <- factor(
  df$SeniorCitizen,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

4.3 Changing the type of the variables

All the categorical variables in the dataset are in ‘char’ format. Therefore they will be converted to factors before moving to the descriptive analysis.

# Character columns that hold categorical answers.
categorical_cols <- c(
  "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
  "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
  "TechSupport", "StreamingTV", "StreamingMovies", "Contract",
  "PaperlessBilling", "PaymentMethod", "Churn"
)

# Convert all of them to factors in a single pass.
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)

# Confirm the new column types.
str(df)
## 'data.frame':    7032 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:11] 489 754 937 1083 1341 3332 3827 4381 5219 6671 ...
##   ..- attr(*, "names")= chr [1:11] "489" "754" "937" "1083" ...

5 Descriptive Analysis

5.1 Univariate Analysis

5.1.1 Categorical Variables

# gridExtra supplies grid.arrange(); library() is used instead of require()
# so a missing package stops here with an error instead of returning FALSE.
library(gridExtra)

# Churn (response variable): number of customers per churn status.
plot1 <- ggplot(df, aes(Churn)) +
  geom_bar() +
  labs(title = "Customer Churn Status",
       x = "Whether the customer churned or not (Yes or No)",
       y = "Count")

# Gender
plot2 <- ggplot(df, aes(gender)) +
  geom_bar() +
  labs(title = "Customer Gender",
       x = "Whether the customer Male/Female",
       y = "Count")

# Senior Citizen
plot3 <- ggplot(df, aes(SeniorCitizen)) +
  geom_bar() +
  labs(title = "Senior Citizen Customers",
       x = "Whether the customer Senior Citizen/not",
       y = "Count")

# Dependents
plot4 <- ggplot(df, aes(Dependents)) +
  geom_bar() +
  labs(title = "Dependents",
       x = "Whether the customer have dependent/not",
       y = "Count")

# Partner
plot5 <- ggplot(df, aes(Partner)) +
  geom_bar() +
  labs(title = "Partners",
       x = "Whether the customer having a partner/not",
       y = "Count")

# Show the first three charts side by side; plot4 and plot5 are drawn later.
grid.arrange(plot1, plot2, plot3, ncol = 3)

  • Churn: The response variable of the analysis is the churn status of the customer. Of the 7032 customers in the sample, about 1850 have churned from the network, i.e. about 26% of the customers. The churn percentage is considerably high; therefore, the objective of the study is clear: the factors associated with customer churn should be identified in order to prevent it.

  • Gender: The percentages of males and females are approximately same according to the bar chart. Therefore, it can be observed that both males and females use this network equally.

  • Senior Citizen Customer: From the bar chart it is observed that about 1100 of the customers out of 7032, are senior citizens in this sample. That is about 15% from the sample.

# Draw the Dependents and Partner bar charts side by side.
# library() (not require()) so a missing gridExtra fails immediately.
library(gridExtra)
grid.arrange(plot4, plot5, ncol = 2)

About 2100 (30%) of the sample have dependents while 3400 (50%) customers have partners. Here a great opportunity can be identified that the network can be promoted to the dependents and partners via the customer.

  • OnlineSecurity
  • OnlineBackup
  • DeviceProtection
  • TechSupport
  • StreamingTV
  • StreamingMovies

All six variables have three categories: ‘Yes’, ‘No’ and ‘No internet service’. Therefore, the customers who do not have an internet connection are excluded and the percentage of ‘Yes’ is obtained for each variable.

# Collect the six internet add-on columns in their own data frame.
# The dotted column names are the ones data.frame() produces from the
# space-separated labels, written out explicitly.
Internet_Addons_df <- data.frame(
  Online.Security = df$OnlineSecurity,
  Online.Backup = df$OnlineBackup,
  Device.Protection = df$DeviceProtection,
  Tech.Support = df$TechSupport,
  Streaming.TV = df$StreamingTV,
  Streaming.Movies = df$StreamingMovies
)
# Percentage of elements in `x` whose text contains "Yes".
#   x: character vector or factor of answers.
# Returns a single number between 0 and 100 (NaN for an empty input).
perc <- function(x) {
  n_total <- length(x)
  n_yes <- sum(grepl("Yes", x))
  n_yes / n_total * 100
}

# Share of "Yes" answers for each add-on among customers with internet.
# Rows are filtered on Online.Security only. across() and pivot_longer()
# replace the deprecated funs() and the superseded summarise_all() / gather().
# Result: one row per add-on, with columns `metric` and `percent`.
sumer <- Internet_Addons_df %>%
  filter(Online.Security != "No internet service") %>%
  summarise(across(everything(), perc)) %>%
  pivot_longer(everything(), names_to = "metric", values_to = "percent")

# Horizontal bar chart of add-on popularity, bars ordered by percentage.
ggplot(sumer, aes(x = reorder(metric, percent), y = percent)) +
  geom_col() +
  coord_flip() +
  ggtitle("Popularity of Internet Add-ons",
          subtitle = "Only considering customers who have internet") +
  xlab("") +
  ylab("Percentage of Internet Subscribers with Service")

  • From the customers who have net connections about 50% of Customers have subscribed for streaming movies and TV.
  • About 44% of the customers have subscribed for online back up and device protection services.
  • Approximately 37% of the customers are using tech support and online security services.
  • From these insights it can be seen that there’s more opportunity to promote these services among the customers to get more profit from the additional services.

5.1.2 Continuous Variables

# Histogram of the monthly charge, 20 bins.
plot6 <- ggplot(data = df, mapping = aes(x = MonthlyCharges)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of MonthlyCharges") +
  xlab("MonthlyCharges") +
  ylab("Count")

# Histogram of the total charge, 20 bins.
plot7 <- ggplot(data = df, mapping = aes(x = TotalCharges)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of TotalCharges") +
  xlab("TotalCharges") +
  ylab("Count")

# Place the two histograms next to each other.
grid.arrange(plot6, plot7, ncol = 2)

# Tenure: number of customers for each tenure value (in months).
ggplot(df, aes(tenure)) +
  geom_bar() +
  ggtitle("Distribution of tenure") +
  xlab("Tenure (No of months)") +
  ylab("No of customers")

Mode of the customers’ monthly charge is below 25. Both the monthly charges and the total charges histograms are skewed to the right and do not follow normal distributions.

5.2 Bivariate Analysis

5.2.1 Categorical Variable Vs Categorical Variable

# Partner vs Churn: stacked customer counts.
plot8 <- ggplot(df, aes(Partner, fill = Churn)) +
  geom_bar() +
  labs(title = "Partner Vs Churn",
       x = "Does the Customer have a Partner(Yes or No)?",
       y = "Count")

# Dependents vs Churn: position = "fill" scales every bar to 1, so the y-axis
# shows a fraction, not a count; the axis label is corrected to say so.
plot9 <- ggplot(df, aes(Dependents, fill = Churn)) +
  geom_bar(position = "fill") +
  labs(title = "Dependents Status Vs Churn",
       x = "Does the Customer have Dependents (Yes or No)?",
       y = "Fraction")

# Gender vs Churn: stacked customer counts.
plot10 <- ggplot(df, aes(gender, fill = Churn)) +
  geom_bar() +
  labs(title = "Gender Vs Churn",
       x = "Gender",
       y = "Count")

# Display the Partner chart.
plot8

The number of churned customers is higher in the customer segment that does not have a partner.

# Display the Dependents vs Churn chart (bars are filled to equal height).
plot9

The share of churned customers is higher in the customer segment that does not have dependents.

# Display the Gender vs Churn chart.
plot10

Churn rates are pretty much similar for both genders.

#grid.arrange(plot8, plot9, plot10, ncol=3)

# Stacked counts of churned / retained customers by paperless-billing choice.
plot11 <- ggplot(data = df, mapping = aes(x = PaperlessBilling, fill = Churn)) +
  geom_bar() +
  ggtitle("Paperless Billing Status") +
  xlab("Does the Customer Use Paperless Billing?") +
  ylab("Count")

# Same breakdown by payment method; x-axis labels are rotated by 90 degrees.
plot12 <- ggplot(data = df, mapping = aes(x = PaymentMethod, fill = Churn)) +
  geom_bar() +
  ggtitle("Payment Method") +
  xlab("What Payment Method does the Customer Use?") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Display the paperless-billing chart.
plot11

Overall, more than 4000 customers have subscribed to the paperless billing service, and the higher count of churned customers is also found in this category.

# Display the Payment Method chart.
plot12

#grid.arrange(plot11, plot12, ncol=2)

Most people tend to use electronic check method as their payment method. The percentages are approximately similar for other three categories. The highest churn count is also observed in the customers who use electronic checking.

# Payment-method mix for senior vs non-senior customers; each bar is scaled
# to 1 so the segments read as fractions.
ggplot(data = df, mapping = aes(x = SeniorCitizen, fill = PaymentMethod)) +
  geom_bar(position = "fill") +
  ggtitle("Payment Method of Seniors Citizen") +
  xlab("Is the Customer a Senior Citizen?") +
  ylab("Fraction")

  • From the stacked bar chart, it can be seen that senior citizens tend to use the electronic check method more than non-senior customers.
  • The mailed check method is used more by non-senior customers than by senior customers. The other two methods are approximately equally popular in both categories.

5.2.2 Categorical Variable Vs Continuous Variable

# Monthly charge vs churn: one frequency polygon per churn status, 5-unit bins.
plot13 <- ggplot(df, aes(x = MonthlyCharges, colour = Churn)) +
  geom_freqpoly(binwidth = 5, size = 1) +
  ggtitle("Churn Count vs MonthlyCharges") +
  xlab("Monthly Charges") +
  ylab("Churn Count")

# Total charge vs churn: 200-unit bins.
plot14 <- ggplot(df, aes(x = TotalCharges, colour = Churn)) +
  geom_freqpoly(binwidth = 200, size = 1) +
  ggtitle("Churn Count vs TotalCharges") +
  xlab("Total Charges") +
  ylab("Churn Count")

# Tenure vs churn: 5-unit bins.
plot15 <- ggplot(df, aes(x = tenure, colour = Churn)) +
  geom_freqpoly(binwidth = 5, size = 1) +
  ggtitle("Churn Count vs tenure") +
  xlab("tenure") +
  ylab("Churn Count")

# Display the monthly-charge chart.
plot13

The univariate histogram of monthly charges already showed that the highest frequency of customers fell in the below-25 category. The same pattern is observed here for the customers who are staying on the network, but the pattern is different for the customers who have churned: most churned customers had monthly charges between 75 and 100.

# Display the total-charge frequency polygons, split by churn status.
plot14

The total charge two-line charts follow the same pattern for both churned and current customers. Although the count of churned customers is low in the dataset, still the count who use below about 250, are same for both categories.

# Display the tenure frequency polygons, split by churn status.
plot15

#grid.arrange(plot13, plot14, plot15, ncol=3)

# Box plot of the total charge for each gender.
ggplot(data = df, mapping = aes(x = gender, y = TotalCharges)) +
  geom_boxplot() +
  ggtitle("Gender vs Total Charges") +
  xlab("Gender") +
  ylab("Total Charge($)")

The distributions of the total charges are similar for both males and females. The median total charge is about 1350 while 25% of the customers use more than total charge of 3750 for both categories.

# Box plot of the monthly charge for each gender.
ggplot(data = df, mapping = aes(x = gender, y = MonthlyCharges)) +
  geom_boxplot() +
  ggtitle("Gender vs Monthly Charges") +
  xlab("Gender") +
  ylab("Monthly Charge($)")

The distributions of the monthly charges are similar for both males and females. The median monthly charge is about 71, while 25% of the customers have a monthly charge above about 90 in both categories.

5.2.3 Continuous Variable Vs Continuous Variable

# Scatter plot: total charge against monthly charge, one point per customer.
ggplot(data = df, mapping = aes(x = MonthlyCharges, y = TotalCharges)) +
  geom_point(size = 1) +
  ggtitle("MonthlyCharges vs TotalCharges") +
  xlab("MonthlyCharges") +
  ylab("TotalCharges")

As the monthly charge increases, the total charge also increases, as expected. A moderately strong positive relationship can be identified between the two variables.

# Scatter plot: total charge against tenure.
ggplot(data = df, mapping = aes(x = tenure, y = TotalCharges)) +
  geom_point(size = 1) +
  ggtitle("TotalCharges vs tenure") +
  xlab("tenure") +
  ylab("TotalCharges")

As the tenure increases, the total charge also increases according to the scatterplot. A moderately strong positive relationship can be identified between the two variables.

# Scatter plot: monthly charge against tenure.
ggplot(data = df, mapping = aes(x = tenure, y = MonthlyCharges)) +
  geom_point(size = 1) +
  ggtitle("MonthlyCharges vs tenure") +
  xlab("tenure") +
  ylab("MonthlyCharges")

No relationship can be identified between the monthly charge and the tenure from the scatterplot.

5.3 Multivariate Analysis

5.3.1 Two Continuous Variables and Categorical Variable

# Monthly vs total charge, points coloured by churn status.
ggplot(
  data = df,
  mapping = aes(x = MonthlyCharges, y = TotalCharges, colour = Churn)
) +
  geom_point()

The same moderately strong positive relationship can be identified for both churned and current customers.

# Monthly vs total charge, one panel per churn status, black-and-white theme.
ggplot(df, aes(MonthlyCharges, TotalCharges)) +
  geom_point() +
  facet_wrap(vars(Churn)) +
  theme_bw()

The same moderately strong positive relationship can be identified for both churned and current customers.

5.3.2 Heatmap

# Correlation matrix of the numeric columns; vapply() guarantees a logical
# selector of one value per column.
data <- cor(df[vapply(df, is.numeric, logical(1))])

# Reshape the matrix to long form (Var1, Var2, value) for plotting.
data1 <- melt(data)

# Draw one tile per variable pair, filled by the correlation value.
# The plot title typo ("Heatmp") is corrected.
ggplot(data1, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  labs(title = "Heatmap",
       x = "",
       y = "")