# AE 5
#Step 1: Set working directory
setwd("C:/Users/beckercw/Downloads")
dir()
## [1] "Company Sales Dashboard.pdf"
## [2] "company sales data dashboard (1).pdf"
## [3] "company sales data dashboard.pdf"
## [4] "company_sales_data.csv"
## [5] "company_sales_data.xlsx"
## [6] "CRM_Data_ActualVersionA_300.csv"
## [7] "desktop.ini"
## [8] "R-4.4.3-win.exe"
## [9] "RStudio-2024.12.1-563 (1).exe"
## [10] "RStudio-2024.12.1-563.exe"
## [11] "TelecommunicationCustomerData_TestedVersionB.csv"
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
######
### T-test: MonthlyCharges ~ Dependents
# Load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Convert 'Dependents' to a factor if needed
df$Dependents <- as.factor(df$Dependents)
# Split data into two groups based on 'Dependents'
group_with_dependents <- df$MonthlyCharges[df$Dependents == 1]
group_without_dependents <- df$MonthlyCharges[df$Dependents == 0]
# Perform an independent t-test
t_test_result <- t.test(group_with_dependents, group_without_dependents, var.equal = FALSE)
# Compute group averages
avg_with_dependents <- mean(group_with_dependents, na.rm = TRUE)
avg_without_dependents <- mean(group_without_dependents, na.rm = TRUE)
# Print results
print(paste("Average Monthly Charges (With Dependents):", round(avg_with_dependents, 2)))
## [1] "Average Monthly Charges (With Dependents): 59.89"
print(paste("Average Monthly Charges (Without Dependents):", round(avg_without_dependents, 2)))
## [1] "Average Monthly Charges (Without Dependents): 67.51"
print(paste("P-value:", round(t_test_result$p.value, 5)))
## [1] "P-value: 0.00043"
# Visualize group averages with a bar graph
ggplot(df, aes(x = Dependents, y = MonthlyCharges, fill = Dependents)) +
stat_summary(fun = mean, geom = "bar", position = "dodge") +
labs(title = "Comparison of Monthly Charges by Dependents Status",
x = "Dependents",
y = "Average Monthly Charges") +
theme_minimal()

#####
# ANOVA: MonthlyCharges ~ Contracts
# Load necessary libraries
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Perform ANOVA test
anova_result <- aov(MonthlyCharges ~ Contract, data = df)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Contract 2 6928 3464 3.754 0.0238 *
## Residuals 940 867480 923
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute group averages
contract_averages <- df %>%
group_by(Contract) %>%
summarise(Average_Monthly_Charges = mean(MonthlyCharges, na.rm = TRUE))
# Print averages
print(contract_averages)
## # A tibble: 3 × 2
## Contract Average_Monthly_Charges
## <chr> <dbl>
## 1 Month-to-month 66.4
## 2 One year 67.4
## 3 Two year 60.4
# Visualize group averages with a bar graph
ggplot(contract_averages, aes(x = Contract, y = Average_Monthly_Charges, fill = Contract)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of Monthly Charges by Contract Type",
x = "Contract Type",
y = "Average Monthly Charges") +
theme_minimal()

####
#Chi-Square Test: SeniorCitizen & StreamingTV
# Load necessary libraries
library(ggplot2)
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Compute proportions of senior and non-senior customers that stream TV
senior_streaming <- mean(df$StreamingTV[df$SeniorCitizen == 1], na.rm = TRUE)
non_senior_streaming <- mean(df$StreamingTV[df$SeniorCitizen == 0], na.rm = TRUE)
# Create a contingency table
contingency_table <- table(df$SeniorCitizen, df$StreamingTV)
# Perform chi-square test
chi_test <- chisq.test(contingency_table)
# Print results
print(paste("Proportion of Senior Customers that Stream TV:", round(senior_streaming, 4)))
## [1] "Proportion of Senior Customers that Stream TV: 0.5366"
print(paste("Proportion of Non-Senior Customers that Stream TV:", round(non_senior_streaming, 4)))
## [1] "Proportion of Non-Senior Customers that Stream TV: 0.3415"
print(paste("P-value:", round(chi_test$p.value, 8)))
## [1] "P-value: 4.19e-06"
# Visualize proportions with a bar graph
proportions <- data.frame(
CustomerType = c("Senior", "Non-Senior"),
Proportion = c(senior_streaming, non_senior_streaming)
)
ggplot(proportions, aes(x = CustomerType, y = Proportion, fill = CustomerType)) +
geom_bar(stat = "identity") +
labs(title = "Proportion of Seniors and Non-Seniors Who Stream TV",
x = "Customer Type",
y = "Proportion Streaming TV") +
ylim(0, 1) +
theme_minimal()

###
#Correlation Analysis: tenure & MonthlyCharge
# Load necessary libraries
library(ggplot2)
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Perform Pearson correlation test
correlation_test <- cor.test(df$tenure, df$MonthlyCharges, method = "pearson")
# Print correlation coefficient and p-value
print(paste("Correlation Coefficient:", round(correlation_test$estimate, 3)))
## [1] "Correlation Coefficient: 0.243"
print(paste("P-value:", formatC(correlation_test$p.value, format = "e", digits = 5)))
## [1] "P-value: 3.88918e-14"
# Visualize correlation with a scatter plot
ggplot(df, aes(x = tenure, y = MonthlyCharges)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "Correlation between Tenure and Monthly Charges",
x = "Tenure (Months)",
y = "Monthly Charges ($)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#####
#Linear Regression: MonthlyCharge ~ gender + Dependent + tenure
# Load necessary libraries
library(ggplot2)
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Encode 'gender' as numeric (Male = 1, Female = 0)
df$gender <- ifelse(df$gender == "Male", 1, 0)
# Fit the linear regression model
model <- lm(MonthlyCharges ~ gender + Dependents + tenure, data = df)
# Print regression summary
summary(model)
##
## Call:
## lm(formula = MonthlyCharges ~ gender + Dependents + tenure, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.633 -27.568 6.925 24.437 51.724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.76542 1.93719 29.819 < 2e-16 ***
## gender -1.00704 1.90827 -0.528 0.598
## Dependents -9.54023 2.09628 -4.551 6.04e-06 ***
## tenure 0.32316 0.03936 8.210 7.28e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.27 on 939 degrees of freedom
## Multiple R-squared: 0.07974, Adjusted R-squared: 0.0768
## F-statistic: 27.12 on 3 and 939 DF, p-value: < 2.2e-16
# Predict Monthly Charges for a new customer (Male, Dependents=1, Tenure=10)
new_customer <- data.frame(gender = 1, Dependents = 1, tenure = 10)
predicted_bill <- predict(model, new_customer)
# Print the predicted monthly bill
print(paste("Predicted Monthly Bill:", round(predicted_bill, 2)))
## [1] "Predicted Monthly Bill: 50.45"
######
# Logistic Regression: Churn ~ Contract + tenure + InternetService
# Load necessary libraries
library(nnet)
## Warning: package 'nnet' was built under R version 4.2.3
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")
# Convert categorical variables to factors
df$Contract <- as.factor(df$Contract)
df$InternetService <- as.factor(df$InternetService)
# Fit logistic regression model
model <- glm(Churn ~ Contract + tenure + InternetService, data = df, family = binomial)
# Print regression summary
summary(model)
##
## Call:
## glm(formula = Churn ~ Contract + tenure + InternetService, family = binomial,
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6016 -0.7331 -0.3528 0.8159 2.7775
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.27610 0.17179 -1.607 0.108010
## ContractOne year -0.64364 0.25054 -2.569 0.010197 *
## ContractTwo year -1.66877 0.44667 -3.736 0.000187 ***
## tenure -0.02865 0.00506 -5.663 1.49e-08 ***
## InternetServiceFiber optic 1.26247 0.19397 6.509 7.58e-11 ***
## InternetServiceNo -1.31030 0.33473 -3.914 9.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1125.71 on 942 degrees of freedom
## Residual deviance: 841.82 on 937 degrees of freedom
## AIC: 853.82
##
## Number of Fisher Scoring iterations: 6
# Predict churn probabilities for new customers
new_customer_1 <- data.frame(Contract = "Month-to-month", tenure = 6, InternetService = "Fiber optic")
new_customer_2 <- data.frame(Contract = "Two year", tenure = 2, InternetService = "DSL")
prob_1 <- predict(model, new_customer_1, type = "response")
prob_2 <- predict(model, new_customer_2, type = "response")
# Print predicted churn probabilities
print(paste("Churn Probability (Customer 1):", round(prob_1 * 100, 2), "%"))
## [1] "Churn Probability (Customer 1): 69.31 %"
print(paste("Churn Probability (Customer 2):", round(prob_2 * 100, 2), "%"))
## [1] "Churn Probability (Customer 2): 11.9 %"