EDA on Biopics Dataset

setwd("/Users/harrychang/Desktop/Data Analyst Assessments/Grab")
biopics = read.csv("biopics.csv")

library(tidyverse) # For data manipulation and visualization

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.7     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate) # For handling date and time

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(stringr)   # For handling strings

biopics ## Load dataset

Data pre-processing

biopics <- biopics %>%
  mutate(across(everything(), replace_na, "Unknown")) # Replace NA values with "Unknown"

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), replace_na, "Unknown")`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

biopics

biopics <- biopics %>%
  mutate(
    person_of_color = as.factor(person_of_color), # Convert to factor
    subject_sex = as.factor(subject_sex)
  ) 

biopics

convert_box_office <- function(x) {
  x <- gsub(",", "", x) # Remove commas
  x <- gsub("\\$", "", x) # Remove dollar sign
  
  # If the value is in millions, replace "M" with "e6" (scientific notation)
  x <- ifelse(str_detect(x, "M"), str_replace(x, "M", "e6"), x)
  
  # If the value is in thousands, replace "K" with "e3" (scientific notation)
  x <- ifelse(str_detect(x, "K"), str_replace(x, "K", "e3"), x)
  
  # Convert to numeric and handle non-numeric values
  as.numeric(gsub("-", "NA", x))
}

biopics <- biopics %>%
  mutate(
    box_office = convert_box_office(box_office)
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `box_office = convert_box_office(box_office)`.
## Caused by warning in `convert_box_office()`:
## ! NAs introduced by coercion

biopics

biopics <- biopics %>%
  mutate(
    box_office = ifelse(is.na(box_office), -1, box_office) # Replace NA with a specific value (e.g., -1) or use any other imputation technique
  )

biopics

Sample Visualizations

ggplot(biopics, aes(x = box_office)) +
  geom_histogram(binwidth = 1e6, fill = "blue", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Box Office Distribution", x = "Box Office", y = "Frequency")

ggplot(biopics, aes(x = year_release)) +
  geom_bar(fill = "blue", color = "black") +
  labs(title = "Number of Biopics per Year", x = "Release Year", y = "Count")

ggplot(biopics, aes(x = country)) +
  geom_bar(fill = "lightblue") +
  coord_flip() +
  labs(title = "Number of Biopics per Country", x = "Country", y = "Count")

ggplot(biopics, aes(x = type_of_subject)) +
  geom_bar(fill = "red") +
  coord_flip() +
  labs(title = "Number of Biopics by Type of Subject", x = "Type of Subject", y = "Count")

ggplot(biopics, aes(x = subject_race)) +
  geom_bar(fill = "green") +
  coord_flip() +
  labs(title = "Number of Biopics by Subject Race", x = "Subject Race", y = "Count")

ggplot(biopics, aes(x = subject_sex)) +
  geom_bar(fill = "orange") +
  labs(title = "Number of Biopics by Subject Sex", x = "Subject Sex", y = "Count")

top_directors <- biopics %>%
  count(director, sort = TRUE) %>%
  head(10) %>%
  pull(director)

ggplot(filter(biopics, director %in% top_directors), aes(x = director)) +
  geom_bar(fill = "purple") +
  coord_flip() +
  labs(title = "Number of Biopics by Top 10 Directors", x = "Director", y = "Count")

ggplot(biopics, aes(x = subject_sex, y = box_office)) +
  geom_boxplot(fill = "brown") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()+
  labs(title = "Box Office by Subject Sex", x = "Subject Sex", y = "Box Office")

Linear Regression to Predict Box Office Revenue

# Load required library
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

biopics[] <- lapply(biopics, function(x) {
  if (is.character(x)) {
    x <- enc2utf8(x)
  }
  return(x)
})

# Load required library
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

# Clean the column names
biopics <- clean_names(biopics)

# Define preprocessing steps
preProcess_steps <- c("center", "scale", "nzv")

# Create preprocessing object
preProcess_obj <- preProcess(biopics, method = preProcess_steps)

# Apply preprocessing to the whole dataset
biopics_preprocessed <- predict(preProcess_obj, biopics)

# Create a dummyVars object
dummy_obj <- dummyVars(~ ., data = biopics_preprocessed, fullRank = TRUE)

# Create dummy variables using the dummyVars object
biopics_dummy <- data.frame(predict(dummy_obj, newdata = biopics_preprocessed))

set.seed(42)
splitIndex <- createDataPartition(biopics_dummy$box_office, p = 0.8, list = FALSE)
train_df <- biopics_dummy[splitIndex, ]
test_df <- biopics_dummy[-splitIndex, ]

linear_model <- lm(box_office ~ ., data = train_df)

# Predict the test set
predictions <- predict(linear_model, newdata = test_df)

## Warning in predict.lm(linear_model, newdata = test_df): prediction from a
## rank-deficient fit may be misleading

# Calculate performance metrics
RMSE <- sqrt(mean((test_df$box_office - predictions)^2))
R_squared <- cor(test_df$box_office, predictions)^2

RMSE

## [1] 1.268149

R_squared

## [1] 0.06431058

Random Forest

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Train the random forest model
random_forest_model <- randomForest(box_office ~ ., data = train_df, ntree = 500, importance = TRUE)

# Predict the test set
random_forest_predictions <- predict(random_forest_model, newdata = test_df)

# Calculate performance metrics
random_forest_RMSE <- sqrt(mean((test_df$box_office - random_forest_predictions)^2))
random_forest_R_squared <- cor(test_df$box_office, random_forest_predictions)^2

random_forest_RMSE

## [1] 1.180852

random_forest_R_squared

## [1] 0.2423069

SVM

library(e1071)

# Train the SVM model
svm_model <- svm(box_office ~ ., data = train_df, kernel = "radial", cost = 10, gamma = 0.1)

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'title12.Years.a.Slave' and 'title127.Hours' and 'title8.Seconds'
## and 'titleA.Beautiful.Mind' and 'titleAbe.Lincoln.in.Illinois' and
## 'titleAmerican.Sniper' and 'titleAnastasia' and 'titleAnnie.Oakley'
## and 'titleAwakenings' and 'titleBathory..Countess.of.Blood' and
## 'titleBeau.Brummell' and 'titleBehind.the.Lines' and 'titleBest' and
## 'titleBoys.Don.t.Cry' and 'titleBreach' and 'titleCaptain.Kidd' and
## 'titleCarbine.Williams' and 'titleCarrington' and 'titleCass' and
## 'titleCharlie.Wilson.s.War' and 'titleConviction' and 'titleCopying.Beethoven'
## and 'titleDahmer' and 'titleDangerous.Minds' and 'titleDeath.in.Granada'
## and 'titleDesert.Dancer' and 'titleDesert.Flower' and 'titleDiana'
## and 'titleDreamchild' and 'titleEvel.Knievel' and 'titleFrances' and
## 'titleFrankie...Alice' and 'titleFreedom.Writers' and 'titleGentleman.Jim'
## and 'titleGoltzius.and.the.Pelican.Company' and 'titleGreased.Lightning'
## and 'titleGreat.Balls.of.Fire.' and 'titleHollywoodland' and 'titleHoudini'
## and 'titleI.Am.a.Sex.Addict' and 'titleI.Love.You.Phillip.Morris' and
## 'titleIf.You.Could.See.What.I.Hear' and 'titleInterrupted.Melody' and
## 'titleInvincible' and 'titleJefferson.in.Paris' and 'titleJesse.James'
## and 'titleJim.Thorpe....All.American' and 'titleJinnah' and
## 'titleJo.Jo.Dancer..Your.Life.Is.Calling' and 'titleJoan.of.Arc'
## and 'titleKnute.Rockne.All.American' and 'titleLa.Bamba' and
## 'titleLady.Sings.the.Blues' and 'titleLee.Daniels..The.Butler' and
## 'titleLove.Me.or.Leave.Me' and 'titleLuther' and 'titleMary.of.Scotland'
## and 'titleMesmer' and 'titleMillion.Dollar.Arm' and 'titleMiracle'
## and 'titleMockingbird.Don.t.Sing' and 'titleMonster' and
## 'titleMrs..Parker.and.the.Vicious.Circle' and 'titleMurder..Inc.' and
## 'titleMusic.Within' and 'titlePapillon' and 'titlePrefontaine' and
## 'titlePublic.Enemies' and 'titleQuills' and 'titleRasputin..The.Mad.Monk'
## and 'titleRay' and 'titleReach.for.the.Sky' and 'titleRestoration'
## and 'titleRicky.6' and 'titleRosewater' and 'titleSalome.s.Last.Dance'
## and 'titleSearching.for.Bobby.Fischer' and 'titleSelena' and
## 'titleSelf.Medicated' and 'titleSon.of.God' and 'titleSweet.Dreams' and
## 'titleTed.Bundy' and 'titleThe.Babe' and 'titleThe.Basketball.Diaries'
## and 'titleThe.Broken.Tower' and 'titleThe.Cross.and.the.Switchblade'
## and 'titleThe.Eddy.Duchin.Story' and 'titleThe.Edge.of.Love' and
## 'titleThe.Express' and 'titleThe.Gallant.Hours' and 'titleThe.General' and
## 'titleThe.Girl.in.the.Red.Velvet.Swing' and 'titleThe.Glenn.Miller.Story'
## and 'titleThe.Great.Ziegfeld' and 'titleThe.Hunter' and 'titleThe.Insider'
## and 'titleThe.Lady' and 'titleThe.Last.Station' and 'titleThe.Longshots' and
## 'titleThe.Perils.of.Pauline' and 'titleThe.Plainsman' and 'titleThe.Queen'
## and 'titleThe.Seven.Little.Foys' and 'titleThe.Social.Network'
## and 'titleThe.Song.of.Bernadette' and 'titleThe.Soul.Keeper' and
## 'titleThe.Story.of.Louis.Pasteur' and 'titleThe.Wings.of.Eagles' and
## 'titleThe.Winning.Team' and 'titleThe.Young.Victoria' and 'titleThis.Is.Elvis'
## and 'titleTotal.Eclipse' and 'titleVeronica.Guerin' and 'titleViva.Villa.'
## and 'titleWhen.Did.You.Last.See.Your.Father.' and 'titleWilson' and
## 'titleYoung.Winston' and 'sitehttp...www.imdb.com.title.tt0025948.'
## and 'sitehttp...www.imdb.com.title.tt0026073.'
## and 'sitehttp...www.imdb.com.title.tt0027698.'
## and 'sitehttp...www.imdb.com.title.tt0027948.'
## and 'sitehttp...www.imdb.com.title.tt0028108.'
## and 'sitehttp...www.imdb.com.title.tt0028313.'
## and 'sitehttp...www.imdb.com.title.tt0031507.'
## and 'sitehttp...www.imdb.com.title.tt0032181.'
## and 'sitehttp...www.imdb.com.title.tt0032676.'
## and 'sitehttp...www.imdb.com.title.tt0034778.'
## and 'sitehttp...www.imdb.com.title.tt0036377.'
## and 'sitehttp...www.imdb.com.title.tt0037465.'
## and 'sitehttp...www.imdb.com.title.tt0037576.'
## and 'sitehttp...www.imdb.com.title.tt0039698.'
## and 'sitehttp...www.imdb.com.title.tt0040491.'
## and 'sitehttp...www.imdb.com.title.tt0043687.'
## and 'sitehttp...www.imdb.com.title.tt0044480.'
## and 'sitehttp...www.imdb.com.title.tt0045332.'
## and 'sitehttp...www.imdb.com.title.tt0045886.'
## and 'sitehttp...www.imdb.com.title.tt0046759.'
## and 'sitehttp...www.imdb.com.title.tt0047030.'
## and 'sitehttp...www.imdb.com.title.tt0048119.'
## and 'sitehttp...www.imdb.com.title.tt0048210.'
## and 'sitehttp...www.imdb.com.title.tt0048317.'
## and 'sitehttp...www.imdb.com.title.tt0048604.'
## and 'sitehttp...www.imdb.com.title.tt0048947.'
## and 'sitehttp...www.imdb.com.title.tt0049170.'
## and 'sitehttp...www.imdb.com.title.tt0049665.'
## and 'sitehttp...www.imdb.com.title.tt0051198.'
## and 'sitehttp...www.imdb.com.title.tt0053849.'
## and 'sitehttp...www.imdb.com.title.tt0054102.'
## and 'sitehttp...www.imdb.com.title.tt0059635.'
## and 'sitehttp...www.imdb.com.title.tt0067069.'
## and 'sitehttp...www.imdb.com.title.tt0068428.'
## and 'sitehttp...www.imdb.com.title.tt0068828.'
## and 'sitehttp...www.imdb.com.title.tt0069528.'
## and 'sitehttp...www.imdb.com.title.tt0069976.'
## and 'sitehttp...www.imdb.com.title.tt0070511.'
## and 'sitehttp...www.imdb.com.title.tt0076106.'
## and 'sitehttp...www.imdb.com.title.tt0080907.'
## and 'sitehttp...www.imdb.com.title.tt0083193.'
## and 'sitehttp...www.imdb.com.title.tt0083967.'
## and 'sitehttp...www.imdb.com.title.tt0084117.'
## and 'sitehttp...www.imdb.com.title.tt0089052.'
## and 'sitehttp...www.imdb.com.title.tt0090110.'
## and 'sitehttp...www.imdb.com.title.tt0091295.'
## and 'sitehttp...www.imdb.com.title.tt0093378.'
## and 'sitehttp...www.imdb.com.title.tt0096029.'
## and 'sitehttp...www.imdb.com.title.tt0097457.'
## and 'sitehttp...www.imdb.com.title.tt0099077.'
## and 'sitehttp...www.imdb.com.title.tt0103747.'
## and 'sitehttp...www.imdb.com.title.tt0108065.'
## and 'sitehttp...www.imdb.com.title.tt0109021.'
## and 'sitehttp...www.imdb.com.title.tt0110496.'
## and 'sitehttp...www.imdb.com.title.tt0110588.'
## and 'sitehttp...www.imdb.com.title.tt0112461.'
## and 'sitehttp...www.imdb.com.title.tt0112637.'
## and 'sitehttp...www.imdb.com.title.tt0112792.'
## and 'sitehttp...www.imdb.com.title.tt0113463.'
## and 'sitehttp...www.imdb.com.title.tt0114272.'
## and 'sitehttp...www.imdb.com.title.tt0114702.'
## and 'sitehttp...www.imdb.com.title.tt0117106.'
## and 'sitehttp...www.imdb.com.title.tt0119937.'
## and 'sitehttp...www.imdb.com.title.tt0120001.'
## and 'sitehttp...www.imdb.com.title.tt0120094.'
## and 'sitehttp...www.imdb.com.title.tt0120706.'
## and 'sitehttp...www.imdb.com.title.tt0140352.'
## and 'sitehttp...www.imdb.com.title.tt0156020.'
## and 'sitehttp...www.imdb.com.title.tt0171804.'
## and 'sitehttp...www.imdb.com.title.tt0180073.'
## and 'sitehttp...www.imdb.com.title.tt0181311.'
## and 'sitehttp...www.imdb.com.title.tt0183306.'
## and 'sitehttp...www.imdb.com.title.tt0268978.'
## and 'sitehttp...www.imdb.com.title.tt0273822.'
## and 'sitehttp...www.imdb.com.title.tt0284929.'
## and 'sitehttp...www.imdb.com.title.tt0285728.'
## and 'sitehttp...www.imdb.com.title.tt0309820.'
## and 'sitehttp...www.imdb.com.title.tt0312549.'
## and 'sitehttp...www.imdb.com.title.tt0340855.'
## and 'sitehttp...www.imdb.com.title.tt0341569.'
## and 'sitehttp...www.imdb.com.title.tt0349825.'
## and 'sitehttp...www.imdb.com.title.tt0349995.'
## and 'sitehttp...www.imdb.com.title.tt0350258.'
## and 'sitehttp...www.imdb.com.title.tt0401997.'
## and 'sitehttp...www.imdb.com.title.tt0422720.'
## and 'sitehttp...www.imdb.com.title.tt0422783.'
## and 'sitehttp...www.imdb.com.title.tt0424908.'
## and 'sitehttp...www.imdb.com.title.tt0427969.'
## and 'sitehttp...www.imdb.com.title.tt0428649.'
## and 'sitehttp...www.imdb.com.title.tt0436697.'
## and 'sitehttp...www.imdb.com.title.tt0445990.'
## and 'sitehttp...www.imdb.com.title.tt0463998.'
## and 'sitehttp...www.imdb.com.title.tt0469640.'
## and 'sitehttp...www.imdb.com.title.tt0469903.'
## and 'sitehttp...www.imdb.com.title.tt0472062.'
## and 'sitehttp...www.imdb.com.title.tt0819714.'
## and 'sitehttp...www.imdb.com.title.tt0824758.'
## and 'sitehttp...www.imdb.com.title.tt0829098.'
## and 'sitehttp...www.imdb.com.title.tt0954981.'
## and 'sitehttp...www.imdb.com.title.tt0962736.'
## and 'sitehttp...www.imdb.com.title.tt1045772.'
## and 'sitehttp...www.imdb.com.title.tt1054580.'
## and 'sitehttp...www.imdb.com.title.tt1091751.'
## and 'sitehttp...www.imdb.com.title.tt1152836.'
## and 'sitehttp...www.imdb.com.title.tt1221208.'
## and 'sitehttp...www.imdb.com.title.tt1244754.' and
## 'sitehttp...www.imdb.com.title.tt1285016.' and '

# Predict the test set
svm_predictions <- predict(svm_model, newdata = test_df)

# Calculate performance metrics
svm_RMSE <- sqrt(mean((test_df$box_office - svm_predictions)^2))
svm_R_squared <- cor(test_df$box_office, svm_predictions)^2

svm_RMSE

## [1] 1.091505

svm_R_squared

## [1] 0.2507735

# Create a data frame with performance metrics
performance_metrics <- data.frame(
  Model = c("Linear Regression", "Random Forest", "SVM"),
  RMSE = c(RMSE, random_forest_RMSE, svm_RMSE),
  R_squared = c(R_squared, random_forest_R_squared, svm_R_squared)
)

# Print the performance metrics
print(performance_metrics)

##               Model     RMSE  R_squared
## 1 Linear Regression 1.268149 0.06431058
## 2     Random Forest 1.180852 0.24230686
## 3               SVM 1.091505 0.25077347

Based on the above, it seems that SVM is the optimal model due to its lowest RMSE value and highest R-squared score.

K-means clustering

# Load required libraries
library(ggplot2)
library(cluster)

# Select numeric columns for clustering
numeric_cols <- c("year_release", "box_office", "number_of_subjects")
data_numeric <- biopics[, numeric_cols]

# Scale the numeric features
scaled_data <- scale(data_numeric)

# Determine the optimal number of clusters using the elbow method
wss <- c()
for (i in 1:15) {
  kmeans_model <- kmeans(scaled_data, centers=i, nstart=25)
  wss[i] <- kmeans_model$tot.withinss
}

# Plot the elbow method results
plot(1:15, wss, type="b", xlab="Number of clusters", ylab="Within groups sum of squares", main="Elbow Method")

# Choose the optimal number of clusters (k) based on the elbow method
k <- 4 # Replace with your choice based on the elbow method plot

# Perform k-means clustering with the optimal number of clusters
kmeans_model <- kmeans(scaled_data, centers=k, nstart=25)

# Visualize the clustering results (using PCA for dimensionality reduction)
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- data.frame(pca$x[, 1:2])
pca_data$cluster <- as.factor(kmeans_model$cluster)

# Plot the PCA results with cluster assignments
ggplot(pca_data, aes(x=PC1, y=PC2, color=cluster)) +
  geom_point() +
  theme_minimal() +
  labs(title="K-means Clustering (PCA Visualization)", x="PC1", y="PC2")

Logistic regression

# Load required libraries
library(caret)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Select relevant columns for the classification problem
selected_cols <- c("year_release", "box_office", "number_of_subjects", "person_of_color")
data_selected <- biopics[, selected_cols]

# Split the data into training (80%) and testing (20%) sets
set.seed(123)
train_indices <- createDataPartition(data_selected$person_of_color, p = 0.8, list = FALSE)
train_data <- data_selected[train_indices, ]
test_data <- data_selected[-train_indices, ]

# Train a logistic regression model
logit_model <- glm(person_of_color ~ ., data = train_data, family = "binomial")

# Make predictions on the testing set
predictions_prob <- predict(logit_model, newdata = test_data, type = "response")
predictions <- ifelse(predictions_prob > 0.5, 1, 0)

# Calculate evaluation metrics
confusion_matrix <- table(Predicted = predictions, Actual = test_data$person_of_color)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

confusion_matrix

##          Actual
## Predicted   0   1
##         0 132  20

# Load required libraries for precision, recall, and F1 score
library(MLmetrics)

## 
## Attaching package: 'MLmetrics'

## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE

## The following object is masked from 'package:base':
## 
##     Recall

precision <- Precision(predictions, test_data$person_of_color)
recall <- Recall(predictions, test_data$person_of_color)
f1_score <- F1_Score(predictions, test_data$person_of_color)

cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.8684211

cat("Precision:", precision, "\n")

## Precision: 1

cat("Recall:", recall, "\n")

## Recall: 0.8684211

cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.9295775

# Plot the ROC curve
roc_obj <- roc(test_data$person_of_color, predictions_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot(roc_obj, main = "ROC Curve")

Content-based recommendation system

# Load required libraries
library(dplyr)
library(tidyr)

# Select relevant columns for the recommendation system
selected_cols <- c("title", "type_of_subject", "subject_race", "subject_sex")
data_selected <- biopics[, selected_cols]

# Convert categorical variables to dummy variables
data_dummies <- data_selected %>%
  mutate(across(type_of_subject:subject_sex, as.factor)) %>%
  pivot_wider(names_from = type_of_subject:subject_sex, values_from = type_of_subject:subject_sex,
              values_fill = 0, values_fn = length) %>%
  column_to_rownames("title")

# Calculate the similarity between movies using the cosine similarity
cosine_similarity <- function(x, y) {
  return(sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))))
}

#install.packages("proxy")
library(proxy)

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

# Calculate the similarity matrix using the proxy package
similarity_matrix <- 1 - proxy::dist(data_dummies, method = "cosine")

recommend_movies <- function(movie_data, similarity_matrix, movie_title, top_n = 10) {
  # Find the index of the given movie_title
  movie_index <- which(movie_data$title == movie_title)
  
  # Check if the movie is found in the dataset
  if (length(movie_index) == 0) {
    stop("Movie not found in the dataset.")
  }
  
  # Get similarity scores
  similarity_scores <- similarity_matrix[movie_index]
  
  # Get the indices of the top_n most similar movies
  top_movie_indices <- order(similarity_scores, decreasing = TRUE)[1:(top_n + 1)] # +1 because the movie itself will be in the list
  
  # Remove the original movie from the list
  top_movie_indices <- top_movie_indices[top_movie_indices != movie_index]
  
  # Return the titles of the top_n most similar movies
  return(movie_data[top_movie_indices, "title"])
}

# Test the recommend_movies function
recommended_movies <- recommend_movies(movie_data = biopics, similarity_matrix = similarity_matrix, movie_title = "12 Years a Slave", top_n = 5)
print(recommended_movies)

## [1] "10 Rillington Place" NA                    NA                   
## [4] NA                    NA                    NA

EDA on Biopics Dataset

Harry Chang

2023-04-21

Data pre-processing

Sample Visualizations

Linear Regression to Predict Box Office Revenue

Random Forest

SVM

K-means clustering

Logistic regression

Content-based recommendation system