Packages and Data Set

library("cluster") 
library("fastcluster")
library("NbClust")
library("fpc")
library("factoextra")
library("tidyverse")

About the Data

The data is about Movies, Gross Income, and Review. It is a modified version of a publicly available IMDB data set.

Variables:

GROSS - gross income of the movie (in USD)
REVIEW - Majority/Overall of review of movie critics. This is a binary variable – it is either RECOMMENDED (i.e. by majority) or NOT RECOMMENDED.
BUDGET - total budget spent for producing the movie (in USD)
SCREENS - total number of screenings (in US during first two weeks of showing)
SEQUEL - MAIN if it is the first movie, and SEQUEL if the film is a sequel in a franchise.
SENTIMENT - A sentiment score derived from social media posts about the film (positive score means positive/reinforcing sentiments)
VIEWS - Total number of views of the trailer in YouTube
LIKES - Total number of likes of the trailer in YouTube
DISLIKES - Total number of dislikes of the trailer in YouTube
COMMENTS - Total number of comments of the trailer in YouTube
FOLLOWERS - Total number of followers of main/primary movie actors in Facebook, Instagram, and twitter

movie <- read_rds("movie_data.rds")
movie_sub <- movie %>%
                select(gross, review, budget, screens, sentiment, views, likes, dislikes, comments, followers) %>%
                mutate(sentiment = as.factor(case_when(sentiment == 0 ~ "Neutral", 
                                             sentiment >  0 ~ "Positive",
                                             sentiment <  0 ~ "Negative"))) %>%
                na.omit()

Preparing Data

movie_sub2 <- movie_sub %>%
                transmute(gross, review = (review == "RECOMMENDED")*1, budget, screens, 
                          positive = (sentiment == "Positive")*1, 
                          neutral = (sentiment == "Neutral")*1, 
                          views, likes, dislikes, comments, followers)

Means of all variables

options(scipen = 10)
movie_sub2 %>% 
  summarise_all(.funs = mean) %>%
  pivot_longer(cols = gross:followers, names_to = "variables", values_to = "mean")

Standard Deviations of all variables

options(scipen = 10)
movie_sub2 %>% 
  summarise_all(.funs = sd) %>%
  pivot_longer(cols = gross:followers, names_to = "variables", values_to = "sd")

Strategy 1 : Connectivity-based clustering

Distance Measure: Gower’s Index Linkage Method: Ward’s Linkage

Distance Matrix

dist_gower <- cluster::daisy(movie_sub2, metric = "gower")

strat_ward<- fastcluster::hclust(d = dist_gower, method = "ward.D")

Clustering Values of the average silhouette and calinski harabasz index for different numbers of clusters.

strat1 <- NbClust(data = movie_sub2, diss = dist_gower, min.nc = 3, max.nc = 10, 
                  distance = NULL, method = "ward.D", index = c("silhouette", "ch", "db"))
strat1$Best.nc %>%
  as_tibble(rownames = "number of clusters")

6, 7, or 9 clusters seem to optimal

Distribution of Clusters

strat1    <- hclust(d = dist_gower, method = "ward.D")

strat1_k6 <- cutree(strat1, k = 6)
strat1_k7 <- cutree(strat1, k = 7)
strat1_k9 <- cutree(strat1, k = 9)

dist_euc <- daisy(x = movie_sub2, metric = "euclidean", stand = T)

table(strat1_k6)

## strat1_k6
##  1  2  3  4  5  6 
## 97 72 44 81 65 36

table(strat1_k7)

## strat1_k7
##  1  2  3  4  5  6  7 
## 97 72 44 81 39 26 36

table(strat1_k9)

## strat1_k9
##  1  2  3  4  5  6  7  8  9 
## 34 44 63 44 81 39 28 26 36

Comparing 6 clusters to 7 clusters

compare_clusters <- cluster.stats(clustering = strat1_k6, alt.clustering = strat1_k7, compareonly = T)  
compare_clusters

## $corrected.rand
## [1] 0.9548411
## 
## $vi
## [1] 0.1107488

Comparing 7 clusters to 9 clusters

compare_clusters1 <- cluster.stats(clustering = strat1_k7, alt.clustering = strat1_k9, compareonly = T)  
compare_clusters1

## $corrected.rand
## [1] 0.8273534
## 
## $vi
## [1] 0.2808788

Since all the clusters have a high agreement and well distributed, we will just choose 6 clusters in strategy 1.

Strategy 2: K-means clustering

movie_sub2_scaled <- scale(movie_sub2) %>%
                as_tibble()

cluster_means <- movie_sub2_scaled %>%
                    mutate(hierarchical = strat1_k6) %>%
                    group_by(hierarchical) %>%
                    summarise_all(.funs = median) 

init_centroids <- cluster_means %>%
                    select(-hierarchical) %>%
                    as.matrix()

kmeans_result <- kmeans(movie_sub2_scaled, centers = init_centroids)
table(kmeans_result$cluster)

## 
##  1  2  3  4  5  6 
## 94 61 84 72 49 35

Strategy 3: K-medoids

calc_sil <- silhouette(strat1_k6, dist = dist_euc)

movie_sub2_sil <- movie_sub2_scaled %>%
                  mutate(sil = calc_sil[ ,3], hierarchical = strat1_k6)

best_sil <- movie_sub2_sil %>%
              mutate(row = row_number()) %>%
              group_by(hierarchical) %>%
              top_n(n = 1, wt = sil) %>% 
              sample_n(size = 1) %>%
              ungroup()
  
best_sil$row

## [1]  39 165 124 210  71 283

pam_result <- pam(movie_sub2_scaled, metric = "euclidean", k = 6, medoids = best_sil$row)
table(pam_result$cluster)

## 
##  1  2  3  4  5  6 
## 52 50 94 62 86 51

Comparison of the 3 Strategies

hierarchical <- fpc::cluster.stats(d = dist_euc, clustering = strat1_k6)
kmean_clust  <- fpc::cluster.stats(d = dist_euc, clustering = kmeans_result$cluster)
pam_clust    <- fpc::cluster.stats(d = dist_euc, clustering = pam_result$cluster)

c("hclust" = hierarchical$avg.silwidth, 
  "kmeans" = kmean_clust$avg.silwidth, 
  "pam"    = pam_clust$avg.silwidth)

##     hclust     kmeans        pam 
## 0.05498212 0.07788412 0.08309252

pam is the best since it is the highest

c("hclust" = hierarchical$ch, 
  "kmeans" = kmean_clust$ch, 
  "pam"    = pam_clust$ch)

##   hclust   kmeans      pam 
## 17.47542 24.81089 26.34896

still, pam is the best since it is the highest

c("hclust" = hierarchical$wb.ratio, 
  "kmeans" = kmean_clust$wb.ratio, 
  "pam"    = pam_clust$wb.ratio)

##    hclust    kmeans       pam 
## 0.8012560 0.7546322 0.7359219

pam is still the best because the lower is the better

c("hclust" = hierarchical$dunn2, 
  "kmeans" = kmean_clust$dunn2, 
  "pam"    = pam_clust$dunn2)

##    hclust    kmeans       pam 
## 0.6645372 0.3972529 0.4436700

higher is the better, so hclust is now the best

Based on the metrics, pam won the majority

movie_cluster <- movie_sub2 %>%
                mutate(cluster = pam_result$cluster)

Presentation of Clusters

summary_stats <- movie_cluster %>%
                    group_by(cluster) %>%
                    summarise_all(.funs = mean)

summary_stats

cluster_distribution <- movie_cluster %>%
                          group_by(cluster) %>%
                          summarise(count = n()) %>%
                          mutate(percent = 100*count/sum(count))
cluster_distribution

movie_cluster %>%
   group_by(cluster) %>%
   summarise(n = n(),
             "gross" = median(gross),
             "budget" = median(budget),
             "screens" = median(screens),
             "avg. review" = mean(review),
             "avg. views" = mean(views),
             "avg. followers" = mean(followers)) %>%
   arrange(desc(`gross`))

Interpretation

Based on the table, cluster 6 are composed of internationally prominent movies, or most commonly known to us as Hollywood Movies. These movies are highly anticipated to be box office even if it is not yet officially announced in the public whether it is just based from rumors, or still in the making. Mainly because of the popularity of the actors, the reputation of these Hollywood movies are very high in creating such a good story, cinematography and acting of course that can be reflected on the numbers of budget which seems to be extremely well prepared. The other factors affect its very high gross is the number of views in their trailer and the followers which seemed to be much connected to the promotions of the movie that can be seen all over the social media. While cluster 2,3, and 5 are composed of movies that need to somehow prove themselves to be box office whether it is based on the story if its good, the trailer, and the actors if they popular because as we can see, cluster 2 have higher gross than 3 even if they are much more popular based on the views and followers. The reason for this relies on many external factors but based on the table, the review seems to have a lot more impact since it helps the audience to gauge how good the movie based on the people who have already watch it. Lastly, the unfortunate reality in the world of movie are well represented in the cluster 1 and 4. These movies are often the Indie ones that despite how good the cinematography, plot and acting, they did not earn that much mainly because of the popularity, promotions and the preference of the audience. As we can see, the budget for these clusters are very low compared to others and having a higher budget seems to be correlated with higher gross since they can allot more resources especially in the promotion side, where they can promote their movie to the mainstream media, making it more popular and also, the audience simply do not want to watch movies that they do not know about and much more preferred those that are popular. Other thing that we can also consider based on the views and followers, that maybe these movies just simply flop, or did not fit to the taste of the people which can be seen to the reviews and number of screens.

Simple Analystics Project on Cluster Analysis

Mania, Dexter James U.