Week8

# Load required libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)

## Warning: package 'cluster' was built under R version 4.4.2

# Load the raw Hulu catalogue and keep only fully observed rows.
# Note: na.omit() discards a row when ANY column is NA, not just the two
# columns that are used for clustering below.
hulu_data <- read.csv(file = "hulu_dataset.csv")
hulu_data_clean <- na.omit(hulu_data)

# Clustering features: IMDb average rating and IMDb number of votes.
clustering_data <- select(hulu_data_clean, imdbAverageRating, imdbNumVotes)

# Standardize both features (center on the column mean, divide by the column
# standard deviation) so they are on a comparable scale for k-means.
clustering_data_normalized <- scale(
  clustering_data,
  center = TRUE,
  scale = TRUE
)

# Elbow method: fit k-means for k = 1..10 and record the total
# within-cluster sum of squares (tot.withinss) of each fit.
# kmeans() chooses its starting centers at random, so the RNG is seeded
# first; without a seed the curve can change from one run to the next.
set.seed(123)
k_values <- 1:10
# vapply() guarantees a numeric vector with one value per k.
wss <- vapply(k_values, function(k) {
  kmeans(clustering_data_normalized, centers = k, nstart = 10)$tot.withinss
}, numeric(1))

# Plot the elbow curve: the k where the decrease in WSS levels off is a
# reasonable choice for the number of clusters.
ggplot(data.frame(k = k_values, wss = wss), aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = k_values) +  # k is an integer: tick every value
  labs(title = "Elbow Method for Optimal k",
       x = "Number of Clusters (k)",
       y = "Within-cluster Sum of Squares")

# Fit the final k-means model with 3 clusters on the standardized features.
# Seeding the RNG fixes the random starting centers, so the fit (and the
# cluster numbering) is the same on every run.
set.seed(123)
kmeans_result <- kmeans(
  x = clustering_data_normalized,
  centers = 3,
  nstart = 25
)

# Store each row's cluster label as a factor column on the cleaned data.
hulu_data_clean[["cluster"]] <- as.factor(kmeans_result[["cluster"]])

# Scatter plot of the clustered titles: rating on the x axis, vote count on
# the y axis, points coloured by cluster. The y axis is drawn on a log10
# scale; points are semi-transparent so overlapping titles stay visible.
ggplot(
  data = hulu_data_clean,
  mapping = aes(x = imdbAverageRating, y = imdbNumVotes, color = cluster)
) +
  geom_point(alpha = 0.6) +
  scale_y_log10() +
  theme_minimal() +
  labs(
    title = "K-means Clustering of Hulu Content",
    x = "IMDB Average Rating",
    y = "IMDB Number of Votes (log scale)",
    color = "Cluster"
  )

# Cluster centers as returned by kmeans(). The model was fitted on the
# standardized features, so these are in scaled units, not raw ratings or
# raw vote counts.
cluster_centers <- kmeans_result$centers

# Return the `n` most frequent genres in `genres` as one comma-separated
# string. Each element of `genres` may list several genres separated by ", ".
# At most `n` names are taken, so a group with fewer than `n` distinct genres
# gets a shorter label instead of being padded with literal "NA" entries.
top_genres_label <- function(genres, n = 3) {
  genre_counts <- table(unlist(strsplit(as.character(genres), ", ")))
  ranked_genres <- names(sort(genre_counts, decreasing = TRUE))
  paste(head(ranked_genres, n), collapse = ", ")
}

# Summarize cluster characteristics: size, mean rating, mean vote count and
# the three most common genres of each cluster.
cluster_summary <- hulu_data_clean %>%
  group_by(cluster) %>%
  summarize(
    count = n(),
    avg_rating = mean(imdbAverageRating),
    avg_votes = mean(imdbNumVotes),
    top_genres = top_genres_label(genres, n = 3)
  )

print(cluster_summary)

## # A tibble: 3 × 5
##   cluster count avg_rating avg_votes top_genres              
##   <fct>   <int>      <dbl>     <dbl> <chr>                   
## 1 1        3229       5.49    10996. Drama, Comedy, Action   
## 2 2        5254       7.17    22851. Drama, Comedy, Animation
## 3 3         213       7.78   770453. Drama, Action, Adventure

Interpretation of Results

Based on the visualization and summary statistics, we can interpret the clusters:

Cluster 1: Lower-rated, low-popularity content (average rating ≈ 5.5 and ≈ 11,000 votes per title — the weakest-performing segment of the catalogue)

Cluster 2: Well-rated, modest-popularity content (average rating ≈ 7.2 and ≈ 23,000 votes per title — the largest cluster, covering solid but not widely known titles)

Cluster 3: Highly rated, high-popularity content (average rating ≈ 7.8 and ≈ 770,000 votes per title — a small group of mainstream hits)

This clustering analysis provides insights into the different segments of content in the Hulu library, which can be used for content strategy and recommendation systems.

Week8

Tedros Habtemariam

2024-12-17