# AE 5


#Step 1: Set working directory
setwd("C:/Users/beckercw/Downloads")
dir()
##  [1] "Company Sales Dashboard.pdf"                     
##  [2] "company sales data dashboard (1).pdf"            
##  [3] "company sales data dashboard.pdf"                
##  [4] "company_sales_data.csv"                          
##  [5] "company_sales_data.xlsx"                         
##  [6] "CRM_Data_ActualVersionA_300.csv"                 
##  [7] "desktop.ini"                                     
##  [8] "R-4.4.3-win.exe"                                 
##  [9] "RStudio-2024.12.1-563 (1).exe"                   
## [10] "RStudio-2024.12.1-563.exe"                       
## [11] "TelecommunicationCustomerData_TestedVersionB.csv"
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

######
### T-test: MonthlyCharges ~ Dependents
# Load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Convert 'Dependents' to a factor if needed
df$Dependents <- as.factor(df$Dependents)

# Split data into two groups based on 'Dependents'
group_with_dependents <- df$MonthlyCharges[df$Dependents == 1]
group_without_dependents <- df$MonthlyCharges[df$Dependents == 0]

# Perform an independent t-test
t_test_result <- t.test(group_with_dependents, group_without_dependents, var.equal = FALSE)

# Compute group averages
avg_with_dependents <- mean(group_with_dependents, na.rm = TRUE)
avg_without_dependents <- mean(group_without_dependents, na.rm = TRUE)

# Print results
print(paste("Average Monthly Charges (With Dependents):", round(avg_with_dependents, 2)))
## [1] "Average Monthly Charges (With Dependents): 59.89"
print(paste("Average Monthly Charges (Without Dependents):", round(avg_without_dependents, 2)))
## [1] "Average Monthly Charges (Without Dependents): 67.51"
print(paste("P-value:", round(t_test_result$p.value, 5)))
## [1] "P-value: 0.00043"
# Visualize group averages with a bar graph
ggplot(df, aes(x = Dependents, y = MonthlyCharges, fill = Dependents)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  labs(title = "Comparison of Monthly Charges by Dependents Status",
       x = "Dependents",
       y = "Average Monthly Charges") +
  theme_minimal()

#####
# ANOVA: MonthlyCharges ~ Contracts
# Load necessary libraries
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Perform ANOVA test
anova_result <- aov(MonthlyCharges ~ Contract, data = df)
summary(anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Contract      2   6928    3464   3.754 0.0238 *
## Residuals   940 867480     923                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute group averages
contract_averages <- df %>%
  group_by(Contract) %>%
  summarise(Average_Monthly_Charges = mean(MonthlyCharges, na.rm = TRUE))

# Print averages
print(contract_averages)
## # A tibble: 3 × 2
##   Contract       Average_Monthly_Charges
##   <chr>                            <dbl>
## 1 Month-to-month                    66.4
## 2 One year                          67.4
## 3 Two year                          60.4
# Visualize group averages with a bar graph
ggplot(contract_averages, aes(x = Contract, y = Average_Monthly_Charges, fill = Contract)) +
  geom_bar(stat = "identity") +
  labs(title = "Comparison of Monthly Charges by Contract Type",
       x = "Contract Type",
       y = "Average Monthly Charges") +
  theme_minimal()

####
#Chi-Square Test: SeniorCitizen & StreamingTV
# Load necessary libraries
library(ggplot2)

# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Compute proportions of senior and non-senior customers that stream TV
senior_streaming <- mean(df$StreamingTV[df$SeniorCitizen == 1], na.rm = TRUE)
non_senior_streaming <- mean(df$StreamingTV[df$SeniorCitizen == 0], na.rm = TRUE)

# Create a contingency table
contingency_table <- table(df$SeniorCitizen, df$StreamingTV)

# Perform chi-square test
chi_test <- chisq.test(contingency_table)

# Print results
print(paste("Proportion of Senior Customers that Stream TV:", round(senior_streaming, 4)))
## [1] "Proportion of Senior Customers that Stream TV: 0.5366"
print(paste("Proportion of Non-Senior Customers that Stream TV:", round(non_senior_streaming, 4)))
## [1] "Proportion of Non-Senior Customers that Stream TV: 0.3415"
print(paste("P-value:", round(chi_test$p.value, 8)))
## [1] "P-value: 4.19e-06"
# Visualize proportions with a bar graph
proportions <- data.frame(
  CustomerType = c("Senior", "Non-Senior"),
  Proportion = c(senior_streaming, non_senior_streaming)
)

ggplot(proportions, aes(x = CustomerType, y = Proportion, fill = CustomerType)) +
  geom_bar(stat = "identity") +
  labs(title = "Proportion of Seniors and Non-Seniors Who Stream TV",
       x = "Customer Type",
       y = "Proportion Streaming TV") +
  ylim(0, 1) +
  theme_minimal()

###
#Correlation Analysis: tenure & MonthlyCharge
# Load necessary libraries
library(ggplot2)

# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Perform Pearson correlation test
correlation_test <- cor.test(df$tenure, df$MonthlyCharges, method = "pearson")

# Print correlation coefficient and p-value
print(paste("Correlation Coefficient:", round(correlation_test$estimate, 3)))
## [1] "Correlation Coefficient: 0.243"
print(paste("P-value:", formatC(correlation_test$p.value, format = "e", digits = 5)))
## [1] "P-value: 3.88918e-14"
# Visualize correlation with a scatter plot
ggplot(df, aes(x = tenure, y = MonthlyCharges)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Correlation between Tenure and Monthly Charges",
       x = "Tenure (Months)",
       y = "Monthly Charges ($)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#####
#Linear Regression: MonthlyCharge ~ gender + Dependent + tenure
# Load necessary libraries
library(ggplot2)

# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Encode 'gender' as numeric (Male = 1, Female = 0)
df$gender <- ifelse(df$gender == "Male", 1, 0)

# Fit the linear regression model
model <- lm(MonthlyCharges ~ gender + Dependents + tenure, data = df)

# Print regression summary
summary(model)
## 
## Call:
## lm(formula = MonthlyCharges ~ gender + Dependents + tenure, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.633 -27.568   6.925  24.437  51.724 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57.76542    1.93719  29.819  < 2e-16 ***
## gender      -1.00704    1.90827  -0.528    0.598    
## Dependents  -9.54023    2.09628  -4.551 6.04e-06 ***
## tenure       0.32316    0.03936   8.210 7.28e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.27 on 939 degrees of freedom
## Multiple R-squared:  0.07974,    Adjusted R-squared:  0.0768 
## F-statistic: 27.12 on 3 and 939 DF,  p-value: < 2.2e-16
# Predict Monthly Charges for a new customer (Male, Dependents=1, Tenure=10)
new_customer <- data.frame(gender = 1, Dependents = 1, tenure = 10)
predicted_bill <- predict(model, new_customer)

# Print the predicted monthly bill
print(paste("Predicted Monthly Bill:", round(predicted_bill, 2)))
## [1] "Predicted Monthly Bill: 50.45"
######
# Logistic Regression: Churn ~ Contract + tenure + InternetService
# Load necessary libraries
library(nnet)
## Warning: package 'nnet' was built under R version 4.2.3
# Load the dataset
df <- read.csv("TelecommunicationCustomerData_TestedVersionB.csv")

# Convert categorical variables to factors
df$Contract <- as.factor(df$Contract)
df$InternetService <- as.factor(df$InternetService)

# Fit logistic regression model
model <- glm(Churn ~ Contract + tenure + InternetService, data = df, family = binomial)

# Print regression summary
summary(model)
## 
## Call:
## glm(formula = Churn ~ Contract + tenure + InternetService, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6016  -0.7331  -0.3528   0.8159   2.7775  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.27610    0.17179  -1.607 0.108010    
## ContractOne year           -0.64364    0.25054  -2.569 0.010197 *  
## ContractTwo year           -1.66877    0.44667  -3.736 0.000187 ***
## tenure                     -0.02865    0.00506  -5.663 1.49e-08 ***
## InternetServiceFiber optic  1.26247    0.19397   6.509 7.58e-11 ***
## InternetServiceNo          -1.31030    0.33473  -3.914 9.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1125.71  on 942  degrees of freedom
## Residual deviance:  841.82  on 937  degrees of freedom
## AIC: 853.82
## 
## Number of Fisher Scoring iterations: 6
# Predict churn probabilities for new customers
new_customer_1 <- data.frame(Contract = "Month-to-month", tenure = 6, InternetService = "Fiber optic")
new_customer_2 <- data.frame(Contract = "Two year", tenure = 2, InternetService = "DSL")

prob_1 <- predict(model, new_customer_1, type = "response")
prob_2 <- predict(model, new_customer_2, type = "response")

# Print predicted churn probabilities
print(paste("Churn Probability (Customer 1):", round(prob_1 * 100, 2), "%"))
## [1] "Churn Probability (Customer 1): 69.31 %"
print(paste("Churn Probability (Customer 2):", round(prob_2 * 100, 2), "%"))
## [1] "Churn Probability (Customer 2): 11.9 %"