# Load required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.2
# Import the Hulu catalogue and discard every row that has a missing value
hulu_data <- read.csv(file = "hulu_dataset.csv")
hulu_data_clean <- hulu_data %>%
  na.omit()
# Keep only the two IMDb columns that drive the clustering
clustering_data <- select(hulu_data_clean, imdbAverageRating, imdbNumVotes)
# Standardise both columns: subtract the mean, divide by the std. deviation
clustering_data_normalized <- scale(
  clustering_data,
  center = TRUE,
  scale = TRUE
)
# Calculate within-cluster sum of squares for different k values.
# kmeans() picks its starting centres at random, so seed the generator first;
# otherwise the elbow curve changes from run to run.
set.seed(123)
# vapply() (rather than sapply()) guarantees exactly one numeric value per k.
wss <- vapply(1:10, function(k) {
  kmeans(clustering_data_normalized, centers = k, nstart = 10)$tot.withinss
}, numeric(1))
# Draw total within-cluster sum of squares against k (the elbow curve)
data.frame(k = seq_len(10), wss = wss) %>%
  ggplot(aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  ggtitle("Elbow Method for Optimal k") +
  xlab("Number of Clusters (k)") +
  ylab("Within-cluster Sum of Squares")

# Fit the final 3-cluster k-means model on the standardised features
set.seed(123)  # fixed seed so the random starts are repeatable
kmeans_result <- kmeans(
  x = clustering_data_normalized,
  centers = 3,
  nstart = 25
)
# Store each row's cluster label on the cleaned data as a factor column
hulu_data_clean[["cluster"]] <- as.factor(kmeans_result$cluster)
# Plot rating against vote count, one colour per cluster
hulu_data_clean %>%
  ggplot(aes(x = imdbAverageRating, y = imdbNumVotes, color = cluster)) +
  geom_point(alpha = 0.6) +
  scale_y_log10() +  # vote counts are drawn on a log10 axis
  ggtitle("K-means Clustering of Hulu Content") +
  xlab("IMDB Average Rating") +
  ylab("IMDB Number of Votes (log scale)") +
  labs(color = "Cluster") +
  theme_minimal()

# Cluster centres as returned by kmeans(); these are in the standardised
# units of clustering_data_normalized, not raw ratings or vote counts
cluster_centers <- kmeans_result$centers

# Return the `n_top` most frequent genres found in a vector of
# ", "-separated genre strings, joined into one ", "-separated label.
top_genres_label <- function(genres, n_top = 3) {
  # as.character() keeps strsplit() working if the column is a factor
  genre_counts <- sort(
    table(unlist(strsplit(as.character(genres), ", "))),
    decreasing = TRUE
  )
  # Cap at the number of distinct genres: indexing with [1:3] pads short
  # results with NA, which would then be printed as a genre called "NA".
  keep <- seq_len(min(n_top, length(genre_counts)))
  paste(names(genre_counts)[keep], collapse = ", ")
}

# Summarize cluster characteristics: size, mean rating, mean vote count and
# the three most common genres of each cluster
cluster_summary <- hulu_data_clean %>%
  group_by(cluster) %>%
  summarize(
    count = n(),
    avg_rating = mean(imdbAverageRating),
    avg_votes = mean(imdbNumVotes),
    top_genres = top_genres_label(genres)
  )

print(cluster_summary)
## # A tibble: 3 × 5
##   cluster count avg_rating avg_votes top_genres              
##   <fct>   <int>      <dbl>     <dbl> <chr>                   
## 1 1        3229       5.49    10996. Drama, Comedy, Action   
## 2 2        5254       7.17    22851. Drama, Comedy, Animation
## 3 3         213       7.78   770453. Drama, Action, Adventure

Interpretation of Results

Based on the visualization and summary statistics, we can interpret the clusters:

Cluster 1: Lower-rated, low-popularity content (average rating about 5.5 with roughly 11,000 votes per title)

Cluster 2: Well-rated, modest-popularity content (average rating about 7.2 with roughly 23,000 votes per title); this is the largest cluster

Cluster 3: Highest-rated, very high-popularity content (mainstream hits; average rating about 7.8 with roughly 770,000 votes per title, across only 213 titles)

This clustering analysis provides insights into the different segments of content in the Hulu library, which can be used for content strategy and recommendation systems.