# Install any required packages that are not already available.
# Calling install.packages() unconditionally re-downloads and re-installs
# every package each time the script runs, so only the missing ones are
# installed here.
required_packages <- c("readxl", "cluster", "factoextra", "tidyverse")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load libraries
library(readxl)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the survey responses from the Excel workbook. readxl is already
# attached with the other packages above, so it is not loaded a second time.
data <- read_excel("netflix_survey_data.xlsx")

# List the columns that were read
colnames(data)
## [1] "perceived_value"     "cancel_likelihood"   "pricing_competitive"
## [4] "device"              "subscription_length"
# Keep only the three survey score columns used for clustering and regression
data_selected <- select(
  data,
  perceived_value, cancel_likelihood, pricing_competitive
)
# Drop every row that has a missing value in any of the selected columns
data_clean <- data_selected %>% drop_na()
# Standardize each column with scale() defaults (center, then scale) so the
# columns contribute comparably to the clustering distances
data_scaled <- data_clean %>% scale()
# Step 1: K-means clustering on the standardized scores.
# Fix the RNG seed first so the random starting centers, and therefore the
# cluster labels, are reproducible from run to run.
set.seed(123)

# Three clusters; nstart = 25 tries 25 random starts
kmeans_result <- kmeans(x = data_scaled, centers = 3, nstart = 25)
# Step 2: Cluster centers, in standardized units (one row per cluster)
kmeans_result[["centers"]]
##   perceived_value cancel_likelihood pricing_competitive
## 1       0.1786626         0.7968191          0.62604273
## 2       1.3578358        -1.1952286          0.09631427
## 3      -1.2625491        -0.7968191         -1.31629497
# Step 3: Cluster label assigned to each row of data_scaled
kmeans_result[["cluster"]]
##  [1] 3 3 3 1 1 1 1 1 1 2 2
# Step 4: Visualize the clusters.
# factoextra is already attached by the library() calls at the top of the
# script, so it is not loaded again here.
fviz_cluster(kmeans_result, data = data_scaled)

# Attach each row's k-means cluster label to the unscaled scores.
# data_scaled was built row-for-row from data_clean, so the labels line up.
data_with_clusters <- mutate(
  data_clean,
  cluster = kmeans_result[["cluster"]]
)

# View(data_with_clusters)

# Simple linear regression: cancel_likelihood modelled on perceived_value alone
model1 <- lm(
  formula = cancel_likelihood ~ perceived_value,
  data = data_clean
)

# Coefficient table and fit statistics for model1
summary(model1)
## 
## Call:
## lm(formula = cancel_likelihood ~ perceived_value, data = data_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.42697 -1.06180  0.06742  1.44382  1.82022 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)
## (Intercept)       2.1910     1.5031   1.458    0.179
## perceived_value   0.2472     0.4307   0.574    0.580
## 
## Residual standard error: 1.732 on 9 degrees of freedom
## Multiple R-squared:  0.03531,    Adjusted R-squared:  -0.07187 
## F-statistic: 0.3295 on 1 and 9 DF,  p-value: 0.58
# Reading the output above: the perceived_value slope is not statistically
# significant (p = 0.58) and the model explains about 3.5% of the variance.
# Scatter plot of cancel_likelihood against perceived_value
plot(
  x = data_clean[["perceived_value"]],
  y = data_clean[["cancel_likelihood"]],
  main = "Effect of Perceived Value on Cancellation Likelihood",
  xlab = "Perceived Value",
  ylab = "Likelihood to Cancel",
  col = "steelblue",
  pch = 19
)
# Overlay the fitted regression line from model1
abline(reg = model1, col = "red", lwd = 2)

# Multiple regression: cancel_likelihood modelled on perceived_value and
# pricing_competitive together
model2 <- lm(
  formula = cancel_likelihood ~ perceived_value + pricing_competitive,
  data = data_clean
)

# Coefficient table and fit statistics for model2
summary(model2)
## 
## Call:
## lm(formula = cancel_likelihood ~ perceived_value + pricing_competitive, 
##     data = data_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -0.6   -0.1    0.0    0.1    0.4 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.6000     0.3240  -1.852    0.101    
## perceived_value      -1.3000     0.1237 -10.506 5.87e-06 ***
## pricing_competitive   2.7000     0.1668  16.190 2.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3162 on 8 degrees of freedom
## Multiple R-squared:  0.9714, Adjusted R-squared:  0.9643 
## F-statistic:   136 on 2 and 8 DF,  p-value: 6.664e-07
# Reading the output above: both slopes are significant at the 0.001 level
# and adjusted R-squared is 0.964, but the fit rests on only 11 observations
# (8 residual degrees of freedom plus 3 coefficients).
# Prediction grid: every combination of the two predictors over 1 to 5
# (presumably the survey's response scale -- confirm against the data)
new_data <- expand.grid(
  perceived_value = seq_len(5),
  pricing_competitive = seq_len(5)
)

# Add model2's predicted cancellation likelihood for each grid row
new_data[["predicted_cancel"]] <- predict(object = model2, newdata = new_data)

# Plot predicted cancellation likelihood against perceived value, drawing one
# coloured line per pricing_competitive level.
# ggplot2 is already attached via library(tidyverse) at the top of the
# script, so no separate library(ggplot2) call is needed here.
# Line thickness is set with `linewidth`: using `size` for lines was
# deprecated in ggplot2 3.4.0 and triggers a deprecation warning.
ggplot(
  new_data,
  aes(
    x = perceived_value,
    y = predicted_cancel,
    color = as.factor(pricing_competitive)
  )
) +
  geom_line(linewidth = 1.2) +
  geom_point() +
  labs(
    title = "Predicted Cancellation Likelihood",
    subtitle = "Based on Perceived Value and Pricing Competitiveness",
    x = "Perceived Value",
    y = "Predicted Likelihood to Cancel",
    color = "Pricing Competitive"
  ) +
  theme_minimal()