setwd("~/R training")
library(rio)
library(tidyverse)
library(readxl)
library(dplyr)
library(psych)
df=read.csv("Telcom-data.csv")
head(df)
library(tidyverse)
missing_value <- colSums(is.na(df))
data.frame(missing_value)
colSums(is.na(df))
customerID gender SeniorCitizen Partner Dependents tenure PhoneService
0 0 0 0 0 0 0
MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
0 0 0 0 0 0 0
StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 0 0 0 0 11 0
##handling missing variables
df$TotalCharges[is.na(df$TotalCharges)] <- median(df$TotalCharges, na.rm = TRUE)
head(df)
colSums(is.na(df))
customerID gender SeniorCitizen Partner Dependents tenure PhoneService
0 0 0 0 0 0 0
MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
0 0 0 0 0 0 0
StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 0 0 0 0 0 0
df[]<-lapply(df,function(x){
if (is.character(x))
as.numeric(factor(x)) else x
})
head(df)
find_outliers_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
return(which(x < lower_bound | x > upper_bound))
}
numeric_columns <- sapply(df, is.numeric)
outlier_indices_list <- lapply(df[, numeric_columns], find_outliers_iqr)
# Print summary of outliers
for (col in names(outlier_indices_list)) {
cat("Variable:", col, " - Outliers found:", length(outlier_indices_list[[col]]), "\n")
}
Variable: customerID - Outliers found: 0
Variable: gender - Outliers found: 0
Variable: SeniorCitizen - Outliers found: 1142
Variable: Partner - Outliers found: 0
Variable: Dependents - Outliers found: 0
Variable: tenure - Outliers found: 0
Variable: PhoneService - Outliers found: 682
Variable: MultipleLines - Outliers found: 0
Variable: InternetService - Outliers found: 0
Variable: OnlineSecurity - Outliers found: 0
Variable: OnlineBackup - Outliers found: 0
Variable: DeviceProtection - Outliers found: 0
Variable: TechSupport - Outliers found: 0
Variable: StreamingTV - Outliers found: 0
Variable: StreamingMovies - Outliers found: 0
Variable: Contract - Outliers found: 0
Variable: PaperlessBilling - Outliers found: 0
Variable: PaymentMethod - Outliers found: 0
Variable: MonthlyCharges - Outliers found: 0
Variable: TotalCharges - Outliers found: 0
Variable: Churn - Outliers found: 0
Q1 <- quantile(df$SeniorCitizen, 0.25)
Q3 <- quantile(df$SeniorCitizen, 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q1 +1.5 * IQR
df_clean <- df[df$SeniorCitizen >= lower_bound & df$SeniorCitizen <= upper_bound, ]
df_clean
Q1 <- quantile(df_clean$SeniorCitizen, 0.25, na.rm = TRUE)
Q3 <- quantile(df_clean$SeniorCitizen, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
sum(df_clean$SeniorCitizen < lower | df_clean$SeniorCitizen > upper, na.rm = TRUE)
[1] 0
• Cleaned dataset stored as a new CSV file.
write.csv(df_clean,"df_clean.csv",row.names = FALSE)
getwd()
[1] "C:/Users/PC/Documents/R training"
summary(df)
customerID gender SeniorCitizen Partner Dependents tenure PhoneService
Min. : 1 Min. :1.000 Min. :0.0000 Min. :1.000 Min. :1.0 Min. : 0.00 Min. :1.000
1st Qu.:1762 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.0 1st Qu.: 9.00 1st Qu.:2.000
Median :3522 Median :2.000 Median :0.0000 Median :1.000 Median :1.0 Median :29.00 Median :2.000
Mean :3522 Mean :1.505 Mean :0.1621 Mean :1.483 Mean :1.3 Mean :32.37 Mean :1.903
3rd Qu.:5282 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:55.00 3rd Qu.:2.000
Max. :7043 Max. :2.000 Max. :1.0000 Max. :2.000 Max. :2.0 Max. :72.00 Max. :2.000
MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
Median :2.000 Median :2.000 Median :2.00 Median :2.000 Median :2.000 Median :2.000 Median :2.000
Mean :1.941 Mean :1.873 Mean :1.79 Mean :1.906 Mean :1.904 Mean :1.797 Mean :1.985
3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
Max. :3.000 Max. :3.000 Max. :3.00 Max. :3.000 Max. :3.000 Max. :3.000 Max. :3.000
StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. : 18.25 Min. : 18.8 Min. :1.000
1st Qu.:1.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 35.50 1st Qu.: 402.2 1st Qu.:1.000
Median :2.000 Median :1.00 Median :2.000 Median :3.000 Median : 70.35 Median :1397.5 Median :1.000
Mean :1.992 Mean :1.69 Mean :1.592 Mean :2.574 Mean : 64.76 Mean :2281.9 Mean :1.265
3rd Qu.:3.000 3rd Qu.:2.00 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 89.85 3rd Qu.:3786.6 3rd Qu.:2.000
Max. :3.000 Max. :3.00 Max. :2.000 Max. :4.000 Max. :118.75 Max. :8684.8 Max. :2.000
Explanation
1.correlation matrix
numeric_df<-df[sapply(df,is.numeric)]
cor_matrix<-cor(numeric_df,use="complete.obs")
print(cor_matrix)
customerID gender SeniorCitizen Partner Dependents tenure PhoneService
customerID 1.000000e+00 6.287560e-03 -0.002074491 -0.0267293673 -0.012822980 0.008035245 -0.006483114
gender 6.287560e-03 1.000000e+00 -0.001873712 -0.0018080797 0.010516612 0.005106223 -0.006487672
SeniorCitizen -2.074491e-03 -1.873712e-03 1.000000000 0.0164786576 -0.211185088 0.016566878 0.008576401
Partner -2.672937e-02 -1.808080e-03 0.016478658 1.0000000000 0.452676283 0.379697461 0.017705663
Dependents -1.282298e-02 1.051661e-02 -0.211185088 0.4526762829 1.000000000 0.159712331 -0.001761679
tenure 8.035245e-03 5.106223e-03 0.016566878 0.3796974612 0.159712331 1.000000000 0.008448207
PhoneService -6.483114e-03 -6.487672e-03 0.008576401 0.0177056632 -0.001761679 0.008448207 1.000000000
MultipleLines 4.315642e-03 -6.738743e-03 0.146184870 0.1424104914 -0.024990642 0.343031854 -0.020537831
InternetService -1.240661e-02 -8.633825e-04 -0.032310150 0.0008913469 0.044590441 -0.030359034 0.387436022
OnlineSecurity 1.329223e-02 -1.501689e-02 -0.128221307 0.1508282814 0.152166383 0.325467840 -0.015198344
OnlineBackup -3.333703e-03 -1.205675e-02 -0.013632137 0.1531301336 0.091014768 0.370876123 0.024104791
DeviceProtection -6.918004e-03 5.490359e-04 -0.021397649 0.1663303505 0.080537163 0.371105436 0.003727316
TechSupport 1.139834e-03 -6.824559e-03 -0.151267697 0.1267333219 0.133523531 0.322941583 -0.019157686
StreamingTV -7.776878e-03 -6.421366e-03 0.030776349 0.1373414370 0.046884886 0.289373218 0.055353254
StreamingMovies -1.674614e-02 -8.743154e-03 0.047266359 0.1295738935 0.021321016 0.296866273 0.043869619
Contract 1.502778e-02 1.257477e-04 -0.142554080 0.2948062244 0.243187473 0.671606549 0.002247374
PaperlessBilling -1.945042e-03 -1.175369e-02 0.156529559 -0.0148766223 -0.111377229 0.006152482 0.016504806
PaymentMethod 1.160439e-02 1.735226e-02 -0.038550813 -0.1547975288 -0.040291899 -0.370436118 -0.004184054
MonthlyCharges -3.916417e-03 -1.456932e-02 0.220173339 0.0968479363 -0.113890230 0.247899856 0.247397963
TotalCharges -1.903837e-05 -1.921911e-06 0.102651590 0.3183638954 0.063593305 0.825464086 0.113012809
Churn -1.744721e-02 -8.612095e-03 0.150889328 -0.1504475450 -0.164221402 -0.352228670 0.011941980
MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
customerID 0.004315642 -0.0124066064 0.013292235 -0.003333703 -0.0069180043 0.001139834 -0.007776878
gender -0.006738743 -0.0008633825 -0.015016887 -0.012056754 0.0005490359 -0.006824559 -0.006421366
SeniorCitizen 0.146184870 -0.0323101502 -0.128221307 -0.013632137 -0.0213976486 -0.151267697 0.030776349
Partner 0.142410491 0.0008913469 0.150828281 0.153130134 0.1663303505 0.126733322 0.137341437
Dependents -0.024990642 0.0445904407 0.152166383 0.091014768 0.0805371627 0.133523531 0.046884886
tenure 0.343031854 -0.0303590343 0.325467840 0.370876123 0.3711054358 0.322941583 0.289373218
PhoneService -0.020537831 0.3874360220 -0.015198344 0.024104791 0.0037273158 -0.019157686 0.055353254
MultipleLines 1.000000000 -0.1092157984 0.007140508 0.117327302 0.1223180267 0.011466091 0.175059025
InternetService -0.109215798 1.0000000000 -0.028416181 0.036137937 0.0449442685 -0.026046946 0.107416772
OnlineSecurity 0.007140508 -0.0284161810 1.000000000 0.185125586 0.1759853288 0.285028465 0.044669272
OnlineBackup 0.117327302 0.0361379374 0.185125586 1.000000000 0.1877568644 0.195748148 0.147185665
DeviceProtection 0.122318027 0.0449442685 0.175985329 0.187756864 1.0000000000 0.240593121 0.276651547
TechSupport 0.011466091 -0.0260469460 0.285028465 0.195748148 0.2405931212 1.000000000 0.161305024
StreamingTV 0.175059025 0.1074167724 0.044669272 0.147185665 0.2766515471 0.161305024 1.000000000
StreamingMovies 0.180957159 0.0983496062 0.055953610 0.136721965 0.2887993891 0.161315802 0.434772353
Contract 0.110842245 0.0997207826 0.374415538 0.280980144 0.3502770893 0.425366672 0.227115572
PaperlessBilling 0.165145894 -0.1386247786 -0.157641192 -0.013370380 -0.0382340313 -0.113599641 0.096641567
PaymentMethod -0.176793410 0.0861398581 -0.096725947 -0.124847381 -0.1357499660 -0.104669714 -0.104233699
MonthlyCharges 0.433576010 -0.3232601923 -0.053878094 0.119777245 0.1636518655 -0.008681602 0.336706239
TotalCharges 0.452849367 -0.1755884904 0.253935041 0.375062556 0.3885615563 0.276342769 0.392046136
Churn 0.038036886 -0.0472913877 -0.289308996 -0.195525014 -0.1781338611 -0.282491608 -0.036580993
StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
customerID -0.016746145 0.0150277787 -0.001945042 0.011604392 -0.003916417 -1.903837e-05
gender -0.008743154 0.0001257477 -0.011753691 0.017352264 -0.014569324 -1.921911e-06
SeniorCitizen 0.047266359 -0.1425540803 0.156529559 -0.038550813 0.220173339 1.026516e-01
Partner 0.129573893 0.2948062244 -0.014876622 -0.154797529 0.096847936 3.183639e-01
Dependents 0.021321016 0.2431874726 -0.111377229 -0.040291899 -0.113890230 6.359330e-02
tenure 0.296866273 0.6716065492 0.006152482 -0.370436118 0.247899856 8.254641e-01
PhoneService 0.043869619 0.0022473744 0.016504806 -0.004184054 0.247397963 1.130128e-01
MultipleLines 0.180957159 0.1108422447 0.165145894 -0.176793410 0.433576010 4.528494e-01
InternetService 0.098349606 0.0997207826 -0.138624779 0.086139858 -0.323260192 -1.755885e-01
OnlineSecurity 0.055953610 0.3744155384 -0.157641192 -0.096725947 -0.053878094 2.539350e-01
OnlineBackup 0.136721965 0.2809801444 -0.013370380 -0.124847381 0.119777245 3.750626e-01
DeviceProtection 0.288799389 0.3502770893 -0.038234031 -0.135749966 0.163651866 3.885616e-01
TechSupport 0.161315802 0.4253666716 -0.113599641 -0.104669714 -0.008681602 2.763428e-01
StreamingTV 0.434772353 0.2271155725 0.096641567 -0.104233699 0.336706239 3.920461e-01
StreamingMovies 1.000000000 0.2312255160 0.083700449 -0.111240576 0.335459482 3.980449e-01
Contract 0.231225516 1.0000000000 -0.176733078 -0.227543497 -0.074194667 4.485643e-01
PaperlessBilling 0.083700449 -0.1767330778 1.000000000 -0.062903951 0.352149968 1.580546e-01
PaymentMethod -0.111240576 -0.2275434968 -0.062903951 1.000000000 -0.193407294 -3.305106e-01
MonthlyCharges 0.335459482 -0.0741946672 0.352149968 -0.193407294 1.000000000 6.508643e-01
TotalCharges 0.398044852 0.4485642564 0.158054596 -0.330510561 0.650864350 1.000000e+00
Churn -0.038491629 -0.3967126292 0.191825332 0.107062006 0.193356422 -1.990368e-01
Churn
customerID -0.017447211
gender -0.008612095
SeniorCitizen 0.150889328
Partner -0.150447545
Dependents -0.164221402
tenure -0.352228670
PhoneService 0.011941980
MultipleLines 0.038036886
InternetService -0.047291388
OnlineSecurity -0.289308996
OnlineBackup -0.195525014
DeviceProtection -0.178133861
TechSupport -0.282491608
StreamingTV -0.036580993
StreamingMovies -0.038491629
Contract -0.396712629
PaperlessBilling 0.191825332
PaymentMethod 0.107062006
MonthlyCharges 0.193356422
TotalCharges -0.199036832
Churn 1.000000000
library(corrplot)
corrplot(cor_matrix,method="color")
inspecting contract column
table(df$Contract)
1 2 3
3875 1473 1695
##tabled df
table(df$Contract)
1 2 3
3875 1473 1695
barplot(
table(df$Contract),
main="customer distribution by contract type",
xlab="Contract type",
ylab="Number of Customers",
,col="blue")
library(dplyr)
library(ggplot2)
### assigning column names
colnames(df)<-c("customerId","gender","SeniorCitizen","Partner","Dependents","tenure",
"PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
"DeviceProtection","TechSupport","StreamingTv","StreamingMovies",
"Contract","PaperlessBilling","PaymentMethod","MonthlyCharges",
"TotalCharges","Churn")
## ensuring Monthly Charges And Total Charges are numeric
df$MonthlyCharges<-as.numeric(as.character(df$MonthlyCharges))
df$TotalCharges<-as.numeric(gsub("[^0-9.]","",as.character(df$TotalCharges)))
## Group by Contract Type
contract_summary <- df %>%
group_by(Contract) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
)
print(contract_summary)
conclusion longer contracts have least monthly charges but higher average totaldue to higher lifetime value
churn_summary <- df %>%
group_by(Churn) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
count=n()
)
print(churn_summary)
churned customers pay more monthly charges but less total charges(mostlikely they stay for a shorter time)
internet_summary <- df %>%
group_by(InternetService) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
count=n()
)
print(internet_summary)
conclusion Fiber optic pay the most monthly and also the most in total charges ## group by PaymentMethod
payment_summary <- df %>%
group_by(PaymentMethod) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
count=n()
)
print(payment_summary)
gender_summary <- df %>%
group_by(gender) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
count=n()
)
print(gender_summary)
conclusion no significant difference but males tend to pay slightly more
##Visualizing monthly charges and total charges across different Customer Groups
library(tidyr)
library(dplyr)
library(ggplot2)
### assigning column names
colnames(df)<-c("customerId","gender","SeniorCitizen","Partner","Dependents","tenure",
"PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
"DeviceProtection","TechSupport","StreamingTv","StreamingMovies",
"Contract","PaperlessBilling","PaymentMethod","MonthlyCharges",
"TotalCharges","Churn")
## ensuring Monthly Charges And Total Charges are numeric
df$MonthlyCharges<-as.numeric(as.character(df$MonthlyCharges))
df$TotalCharges<-as.numeric(gsub("[^0-9.]","",as.character(df$TotalCharges)))
## Grouped Summary
contract_summary <- df %>%
group_by(Contract) %>%
summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
)
## reshaping data for plotting
contract_plot_data<-contract_summary%>%pivot_longer(cols=c(AvgMonthly,AvgTotal),
names_to="ChargeType",
values_to="Average")
## plotting Comparison of Monthly Charges and Total Charges by contract
ggplot(contract_plot_data,aes(x=Contract,y=Average,fill=ChargeType))+
geom_bar(stat="identity",position = "dodge")+
labs(title = "Average Monthly Charges vs Total Charges by Contract Type",
x="Contract Type",
y="Average Charge($)",
fill="ChargeType")+
theme_minimal()
Conclusion month to month customers pay more per month but stay shorter two year contracts pay the least but have higher total revenue
library(tidyr)
library(dplyr)
library(ggplot2)
## Grouped Summary
churn_summary<-df%>%group_by(Churn)%>%summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
)
## reshaping data for plotting
churn_summary%>%pivot_longer(cols=c(AvgMonthly,AvgTotal),
names_to="ChargeType",
values_to="Average")%>%
ggplot(aes(x=Churn,y=Average,fill=ChargeType))+
geom_bar(stat="identity",position = "dodge")+
labs(title = "Average Monthly Charges vs Total Charges by Churn status",
x="Churn",
y="Average Charge($)",
fill="ChargeType")+
theme_minimal()
Conclusion A significant number of c
library(tidyr)
library(dplyr)
library(ggplot2)
## Grouped Summary
internet_summary<-df%>%group_by(InternetService)%>%summarise(
AvgMonthly=mean(MonthlyCharges,na.rm=TRUE),
AvgTotal=mean(TotalCharges,na.rm=TRUE),
)
## reshaping data for plotting
internet_summary%>%pivot_longer(cols=c(AvgMonthly,AvgTotal),
names_to="ChargeType",
values_to="Average")%>%
ggplot(aes(x=InternetService,y=Average,fill=ChargeType))+
geom_bar(stat="identity",position = "dodge")+
labs(title = "Average Monthly Charges vs Total Charges by InternetService",
x="InternetService",
y="Average Charge($)",
fill="ChargeType")+
theme_minimal()
##Scatterplot of monthly vs total charges
ggplot(df,aes(x=MonthlyCharges,y=TotalCharges,color=Churn))+
geom_point(alpha=0.6)+
labs(title = "Monthly Charges vs Total Charges(coloredby Churn)",
x="MonthlyCharge($))",y="TotalCharges($)")+
theme_minimal()
# converting numeric columns to correct type
df$tenure<-as.numeric(as.character(df$tenure))
df$MonthlyCharges<-as.numeric(as.character(df$MonthlyCharges))
df$TotalCharges<-as.numeric(as.character(df$TotalCharges))
## create a data frame with only numerical variables
numeric_df<-df%>%
select(tenure,MonthlyCharges,TotalCharges)
#compute correlation matrix
cor_matrix<-cor(numeric_df,use="complete.obs")
print(cor_matrix)
tenure MonthlyCharges TotalCharges
tenure 1.0000000 0.2478999 0.8254641
MonthlyCharges 0.2478999 1.0000000 0.6508643
TotalCharges 0.8254641 0.6508643 1.0000000
Explanation Tenure vs total charges have a strong positive correlation(cor=0.8258805),meaning the longer the customer stays,the higher their charges. Tenure vs monthly charges have a week positive relationship(cor=0.2468618),meaning customers who stay longer maybe slightly on a cheaper plan. **Monthly charges vs Total Charges have an average positive correlation of 0.65,meaning the more the monthly charges,the higher the total charges
##pearson Rank
library(corrplot)
df<-df[,c("tenure","MonthlyCharges","TotalCharges")]
pearson_matrix<-cor(df,method="pearson")
pearson_matrix
tenure MonthlyCharges TotalCharges
tenure 1.0000000 0.2478999 0.8254641
MonthlyCharges 0.2478999 1.0000000 0.6508643
TotalCharges 0.8254641 0.6508643 1.0000000
##spearman rank correlation
library(corrplot)
df<-df[,c("tenure","MonthlyCharges","TotalCharges")]
spearman_matrix<-cor(df,method="spearman")
spearman_matrix
tenure MonthlyCharges TotalCharges
tenure 1.0000000 0.2764168 0.8869979
MonthlyCharges 0.2764168 1.0000000 0.6374752
TotalCharges 0.8869979 0.6374752 1.0000000
library(ggplot2)
ggplot(df,aes(x="",y=MonthlyCharges))+
geom_boxplot(fill="blue",width=0.5)+
labs(title="Boxplot of Monthly Charges",
y="MonthlyCharges")+
theme_minimal()
explanation **most customers pay around 70$ monthly charges,but some have higher monthly charges,maybe they are using premium services
library(ggplot2)
ggplot(df,aes(x="",y=TotalCharges))+
geom_boxplot(fill="green",width=0.5)+
labs(title="Boxplot of Total Charges",
y="TotalCharges")+
theme_minimal()
Explanation ** Wide spread indicating varying tenure and monthly spending,likely long term customers or those with high monthly charges
library(ggplot2)
df<-read.csv("Telcom-data.csv")
### assigning column names
colnames(df)<-c("customerId","gender","SeniorCitizen","Partner","Dependents","tenure",
"PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
"DeviceProtection","TechSupport","StreamingTv","StreamingMovies",
"Contract","PaperlessBilling","PaymentMethod","MonthlyCharges",
"TotalCharges","Churn")
## ensuring Monthly Charges And Total Charges are numeric
df$MonthlyCharges<-as.numeric(as.character(df$MonthlyCharges))
## plotting Boxplot of Montly charges
ggplot(df,aes(x=Churn,y=MonthlyCharges,fill=Churn))+
geom_boxplot()+
labs(title="Boxplot of monthly Charges by churn",
y="MonthlyCharges")+
theme_minimal()
#Total charges by churn
ggplot(df,aes(x=Churn,y=TotalCharges,fill=Churn))+
geom_boxplot()+
labs(title="Boxplot of Total Charges by churn",
y="TotalCharges")+
theme_minimal()
Explanation churned customers paid more monthly charges than those not churned churned customers have much lower total spending,indicating short tenures.
library(ggplot2)
ggplot(df,aes(x=MonthlyCharges))+
geom_histogram(bins=30,fill="red",color="black")+
labs(title="Histogram of Monthly Charges Distribution",
x="MonthlyCharges",
y="frequency")+
theme_minimal()
#Explanation Monthly charges cloud at certain price points,suggesting tiered pricing or bundled packages,
library(ggplot2)
ggplot(df,aes(x=TotalCharges))+
geom_histogram(bins=30,fill="red",color="black")+
labs(title="Histogram of Total Charges Distribution",
x="TotalCharges",
y="frequency")+
theme_minimal()
#Explanation most customers have low to moderate total charges,likely due to short tenure,few customers indicate high total spending,indicating long-term loyalty.
Hypothesis 1: Do customers with long-term contracts have higher total spending? • Null Hypothesis (H₀): No difference in TotalCharges among contract types. • Alternative Hypothesis (H₁): Customers with longer contracts have significantly higher TotalCharges. #Descriptive statistics by Contract Type
summary_by_contract<-df%>%
group_by(Contract)%>%
summarise(
count=n(),
meanTotalCharge=mean(TotalCharges,na.rm=TRUE),
sdTotalCharge=sd(TotalCharges,na.rm=TRUE)
)
print(summary_by_contract)
NA
library(ggplot2)
ggplot(df,aes(x=TotalCharges))+
geom_boxplot(fill="red")+
labs(title="Total Charges by Contract",
x="Contract Type",
y="Total Charges")+
theme_minimal()
# fitting a linear model
anova_model<-aov(TotalCharges ~ Contract,data=df)
summary(anova_model)
Df Sum Sq Mean Sq F value Pr(>F)
Contract 2 7.590e+09 3.795e+09 934.7 <2e-16 ***
Residuals 7029 2.854e+10 4.060e+06
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
11 observations deleted due to missingness
#Conclusion p-value<0.05,reject null hypothesis, and conclude that there is statistically significant difference in Total Charges between Contract Types. #Recommendation customers with longer contracts spend significantly more compared to those with shorter contracts,hence encourage the customers to move to longer contracts
TukeyTest<-TukeyHSD(anova_model)
plot(TukeyTest,las=1)
print(TukeyTest)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = TotalCharges ~ Contract, data = df)
$Contract
diff lwr upr p adj
One year-Month-to-month 1665.4285 1520.8120 1810.0450 0
Two year-Month-to-month 2359.6794 2221.8461 2497.5126 0
Two year-One year 694.2509 525.7369 862.7648 0
• Null Hypothesis (H₀): No difference in MonthlyCharges between male and female customers. • Alternative Hypothesis (H₁): Males and females have different spending behaviors. • Test to use: Independent t-test.
t_test_result <- t.test(MonthlyCharges~gender, data=df)
# 7. Print the result
print(t_test_result)
Welch Two Sample t-test
data: MonthlyCharges by gender
t = 1.2227, df = 7038.9, p-value = 0.2215
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.5289426 2.2824640
sample estimates:
mean in group Female mean in group Male
65.20424 64.32748
p_value(0.2215)>0.05,hence do not reject the null hypothesis, and conclude that there is no difference between male and female spending.
#5. Regression Analysis Building a Linear Regression Model to Predict MonthlyCharges • Define MonthlyCharges as the dependent variable. • Use Tenure, Contract, and Payment Method as independent variables.
model<-lm(MonthlyCharges~tenure+Contract+PaymentMethod,data=df)
summary(model)
Call:
lm(formula = MonthlyCharges ~ tenure + Contract + PaymentMethod,
data = df)
Residuals:
Min 1Q Median 3Q Max
-72.154 -20.211 3.787 19.016 76.913
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 54.445739 0.894149 60.891 <2e-16 ***
tenure 0.551490 0.018054 30.547 <2e-16 ***
ContractOne year -11.885419 0.912707 -13.022 <2e-16 ***
ContractTwo year -22.775408 1.047304 -21.747 <2e-16 ***
PaymentMethodCredit card (automatic) 0.006174 0.935446 0.007 0.995
PaymentMethodElectronic check 11.288247 0.896915 12.586 <2e-16 ***
PaymentMethodMailed check -14.685846 0.973991 -15.078 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 25.89 on 7036 degrees of freedom
Multiple R-squared: 0.2605, Adjusted R-squared: 0.2599
F-statistic: 413.1 on 6 and 7036 DF, p-value: < 2.2e-16
library(car)
G2;H2;Warningh: package ‘car’ was built under R version 4.4.3g
G3;Loading required package: carData
gG2;H2;Warningh: package ‘carData’ was built under R version 4.4.3g
G3;
Attaching package: ‘car’
gG3;The following object is masked from ‘package:psych’:
logit
gG3;The following object is masked from ‘package:dplyr’:
recode
gG3;The following object is masked from ‘package:purrr’:
some
g
vif_values=vif(model)
print(vif_values)
GVIF Df GVIF^(1/(2*Df))
tenure 2.065973 1 1.437349
Contract 2.023452 2 1.192678
PaymentMethod 1.308307 3 1.045807
no multicollinearity
# Load performance package
library(performance)
# Get model performance metrics
model_performance(model)
# R²: Measures model goodness-of-fit.
# Cook’s Distance Plot
plot(cooks.distance(model),
type = "h",
main = "Cook’s Distance",
ylab = "Cook’s Distance")
##conclusion no need for further investigations
#Residual diagnostics
# Histogram of residuals
ggplot(data = data.frame(residuals = resid(model)), aes(x = residuals)) +
geom_histogram(color = "black", fill = "blue", bins = 10) +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Count")
Bell shaped curve,Residuals are normal