library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret)
## Loading required package: lattice
library(broom)
data <- read_csv("Book1_cleaned.csv")
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): Respondent_ID, Q1_Use_MS365, Q2_Use_GoogleMore, Q3_Used_Amazon_Appl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- data[complete.cases(data), ]
We’ll treat Q6_Switched_Platform as the binary response variable to predict.
data$Q6_Switched_Platform <- as.factor(data$Q6_Switched_Platform)
model <- glm(Q6_Switched_Platform ~ ., data = data[,-1], family = binomial)
summary(model)
##
## Call:
## glm(formula = Q6_Switched_Platform ~ ., family = binomial, data = data[,
## -1])
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.357e+01 1.376e+05 0 1
## Q1_Use_MS365 5.630e-11 1.124e+05 0 1
## Q2_Use_GoogleMore 4.713e+01 1.124e+05 0 1
## Q3_Used_Amazon_Apple NA NA NA NA
## Q4_Pref_GoogleDocs NA NA NA NA
## Q5_Pref_Teams NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3.8191e+00 on 2 degrees of freedom
## Residual deviance: 3.4957e-10 on 0 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 22
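The "(3 not defined because of singularities)" note means those predictors are perfectly collinear with the others given only three complete rows. A quick way to list the dropped terms (a sketch; output not shown):
# List terms whose coefficients were dropped (NA) because of singularities
names(coef(model))[is.na(coef(model))]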
tidy(model)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.36e+ 1 137632. -1.71e- 4 1.00
## 2 Q1_Use_MS365 5.63e-11 112376. 5.01e-16 1
## 3 Q2_Use_GoogleMore 4.71e+ 1 112376. 4.19e- 4 1.00
## 4 Q3_Used_Amazon_Apple NA NA NA NA
## 5 Q4_Pref_GoogleDocs NA NA NA NA
## 6 Q5_Pref_Teams NA NA NA NA
# Predict probabilities and classes
predicted_probs <- predict(model, type = "response")
predicted_class <- ifelse(predicted_probs > 0.5, 1, 0)
# Confusion matrix
confusionMatrix(as.factor(predicted_class), data$Q6_Switched_Platform)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1 0
## 1 0 2
##
## Accuracy : 1
## 95% CI : (0.2924, 1)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.2963
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.3333
## Detection Rate : 0.3333
## Detection Prevalence : 0.3333
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
coefficients <- tidy(model)
coefficients <- coefficients[order(abs(coefficients$estimate), decreasing = TRUE), ]
ggplot(coefficients, aes(x = reorder(term, estimate), y = estimate)) +
  geom_col(fill = "#336699") +
  coord_flip() +
  labs(title = "Regression Coefficients", x = "Predictors", y = "Estimate") +
  theme_minimal()
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).
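The warning comes from the three NA estimates for the aliased terms; dropping them before plotting (a sketch, using a hypothetical coefficients_plot name) avoids it:
# Drop aliased terms (NA estimates) before plotting to avoid the warning
coefficients_plot <- coefficients[!is.na(coefficients$estimate), ]
Passing coefficients_plot to the same ggplot() call then produces the plot without the warning.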
# Remove constant (zero-variance) columns before PCA
predictors <- data[,-c(1, ncol(data))] # exclude ID and response
constant_cols <- sapply(predictors, function(x) var(x) == 0)
predictors_clean <- predictors[, !constant_cols]
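Since caret is loaded, its nearZeroVar() helper is a common alternative for this step (a sketch; output not shown, and predictors_nzv is a hypothetical name so the original predictors_clean is left untouched):
# Alternative: flag zero- and near-zero-variance columns with caret
nzv_cols <- nearZeroVar(predictors)
predictors_nzv <- if (length(nzv_cols) > 0) predictors[, -nzv_cols] else predictors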
# Run PCA
pca <- prcomp(predictors_clean, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.732 1.00 1.979e-16
## Proportion of Variance 0.750 0.25 0.000e+00
## Cumulative Proportion 0.750 1.00 1.000e+00
plot(pca, type = "l", main = "Scree Plot")
biplot(pca, scale = 0)
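factoextra is loaded above but unused; its helpers give labelled versions of the same two plots (a sketch; output not shown):
# factoextra versions of the scree plot and biplot
fviz_eig(pca, addlabels = TRUE)
fviz_pca_biplot(pca, repel = TRUE)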
Answer: The logistic regression identifies which survey responses are significantly associated with the likelihood of someone switching platforms. This can inform marketing or UX changes.
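To read the estimates on the odds scale, the coefficients can be exponentiated (a sketch; output not shown, and with this degenerate fit the odds ratios are not meaningful):
# Exponentiate coefficients to odds ratios
tidy(model, exponentiate = TRUE)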
Answer: Check the tidy(model) output: terms with p-values below 0.05 are conventionally treated as statistically significant. In this run every estimated p-value is essentially 1 and three coefficients are NA because of singularities, so no predictor is significant.
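A short check of that criterion directly from the tidy output (a sketch; here it returns no rows):
# Keep only terms with p-values below 0.05
tidy(model) %>% filter(!is.na(p.value), p.value < 0.05)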
Answer: The confusion matrix and accuracy values summarize model performance; a good model has high sensitivity and specificity. Here accuracy, sensitivity, and specificity are all 1, but these are in-sample predictions on only three observations (95% CI for accuracy: 0.29 to 1), so the perfect score says little about how the model would generalize.
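The individual metrics can also be pulled out of the confusionMatrix object rather than read from the printout (a sketch; output not shown):
# Store the confusion matrix object and extract key metrics
cm <- confusionMatrix(as.factor(predicted_class), data$Q6_Switched_Platform)
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity")]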
Answer: Logistic regression assumes a linear relationship between the predictors and the log-odds of the outcome. Multicollinearity can also make coefficient estimates unreliable; here it is severe enough that three coefficients are dropped (reported as NA) because of singularities.
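A quick multicollinearity check is to look at pairwise correlations among the non-constant predictors (a sketch; for a non-aliased model, car::vif() is another common option, though the car package is not loaded above):
# Pairwise correlations among the remaining predictors
cor(predictors_clean)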