# Install any required packages that are not already available.
# Installing unconditionally would re-download every package on each run,
# which is slow and fails when offline, so only missing packages are fetched.
required_packages <- c("readxl", "cluster", "factoextra", "tidyverse")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load libraries
library(readxl)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the survey responses from the Excel workbook
library(readxl)
data <- read_excel(path = "netflix_survey_data.xlsx")

# List the columns available in the workbook
colnames(data)
## [1] "perceived_value"     "cancel_likelihood"   "pricing_competitive"
## [4] "device"              "subscription_length"

# Keep only the three variables used in the clustering and regressions below
data_selected <- select(
  data,
  perceived_value, cancel_likelihood, pricing_competitive
)

# Discard incomplete responses (rows containing any NA)
data_clean <- data_selected %>% drop_na()

# Standardise every column (centre, then divide by its standard deviation) so
# that each variable contributes on the same scale to the clustering
data_scaled <- data_clean %>% scale()
# K-means segmentation ----

# Step 1: Fit k-means with 3 clusters on the standardised data.
# The seed is fixed because k-means starts from random centres; nstart = 25
# tries 25 random starts and keeps the best solution.
set.seed(123)
kmeans_result <- kmeans(x = data_scaled, centers = 3, nstart = 25)

# Step 2: Inspect the centre of each cluster (in standardised units)
kmeans_result[["centers"]]
##   perceived_value cancel_likelihood pricing_competitive
## 1       0.1786626         0.7968191          0.62604273
## 2       1.3578358        -1.1952286          0.09631427
## 3      -1.2625491        -0.7968191         -1.31629497

# Step 3: Inspect the cluster label assigned to each row
kmeans_result[["cluster"]]
## [1] 3 3 3 1 1 1 1 1 1 2 2

# Step 4: Plot the clusters
library(factoextra)
fviz_cluster(object = kmeans_result, data = data_scaled)

# Attach the cluster label to the unscaled responses; rows of data_clean and
# data_scaled are in the same order, so the labels line up one-to-one
data_with_clusters <- mutate(data_clean, cluster = kmeans_result[["cluster"]])
# View(data_with_clusters)
# Simple linear regression ----
# Question: does perceived value on its own explain cancel likelihood?
model1 <- lm(formula = cancel_likelihood ~ perceived_value, data = data_clean)

# Print coefficients, fit statistics and residual summary.
# In the output below the slope is not significant (p = 0.58) and
# R-squared is about 0.035.
summary(model1)
##
## Call:
## lm(formula = cancel_likelihood ~ perceived_value, data = data_clean)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -2.42697 -1.06180  0.06742  1.44382  1.82022
##
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)       2.1910     1.5031   1.458    0.179
## perceived_value   0.2472     0.4307   0.574    0.580
##
## Residual standard error: 1.732 on 9 degrees of freedom
## Multiple R-squared:  0.03531, Adjusted R-squared:  -0.07187
## F-statistic: 0.3295 on 1 and 9 DF,  p-value: 0.58

# Scatter plot of the raw responses with the fitted regression line on top
with(
  data_clean,
  plot(
    x = perceived_value,
    y = cancel_likelihood,
    xlab = "Perceived Value",
    ylab = "Likelihood to Cancel",
    main = "Effect of Perceived Value on Cancellation Likelihood",
    pch = 19,
    col = "steelblue"
  )
)
abline(model1, col = "red", lwd = 2)

# Multiple linear regression ----
# Cancel likelihood explained by perceived value and pricing competitiveness
model2 <- lm(
  formula = cancel_likelihood ~ perceived_value + pricing_competitive,
  data = data_clean
)

# Print coefficients and fit statistics.
# In the output below both predictors are significant and R-squared is 0.97.
# NOTE(review): the fit uses only 11 rows (8 residual df + 3 coefficients), so
# treat these estimates with caution — confirm with more data.
summary(model2)
##
## Call:
## lm(formula = cancel_likelihood ~ perceived_value + pricing_competitive,
##     data = data_clean)
##
## Residuals:
##  Min   1Q Median   3Q  Max
## -0.6 -0.1    0.0  0.1  0.4
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          -0.6000     0.3240  -1.852    0.101
## perceived_value      -1.3000     0.1237 -10.506 5.87e-06 ***
## pricing_competitive   2.7000     0.1668  16.190 2.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3162 on 8 degrees of freedom
## Multiple R-squared:  0.9714, Adjusted R-squared:  0.9643
## F-statistic: 136 on 2 and 8 DF,  p-value: 6.664e-07
# Predicted cancellation likelihood ----

# Build every combination of the two predictors over a 1-5 grid (25 rows).
# NOTE(review): assumes both survey items are answered on a 1-5 scale —
# confirm against the questionnaire.
new_data <- expand.grid(
  perceived_value = 1:5,
  pricing_competitive = 1:5
)

# Predict cancellation likelihood for each grid point using the second model.
# NOTE(review): a linear model is unbounded, so grid points far from the
# observed responses may get predictions outside the response scale.
new_data$predicted_cancel <- predict(model2, newdata = new_data)

# Load ggplot2 for cleaner plotting (already included in tidyverse)
library(ggplot2)

# One line per pricing level, showing how the prediction changes with
# perceived value. `linewidth` replaces `size` for line thickness, which was
# deprecated for lines in ggplot2 3.4.0.
ggplot(
  new_data,
  aes(
    x = perceived_value,
    y = predicted_cancel,
    color = as.factor(pricing_competitive)
  )
) +
  geom_line(linewidth = 1.2) +
  geom_point() +
  labs(
    title = "Predicted Cancellation Likelihood",
    subtitle = "Based on Perceived Value and Pricing Competitiveness",
    x = "Perceived Value",
    y = "Predicted Likelihood to Cancel",
    color = "Pricing Competitive"
  ) +
  theme_minimal()
