# Work from the assessment folder so the relative CSV path below resolves.
# NOTE(review): the absolute, user-specific path makes the script
# non-portable; consider a project-relative path.
setwd("/Users/harrychang/Desktop/Data Analyst Assessments/Grab")
# Read the raw biopics table into a data frame.
biopics <- read.csv(file = "biopics.csv")
library(tidyverse) # For data manipulation and visualization
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.1 ✔ purrr 1.0.1
## ✔ tibble 3.1.7 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate) # For handling date and time
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr) # For handling strings
biopics ## Load dataset
# Replace missing values with "Unknown" in the text columns only.
# - The replacement is a string, so it is applied to character columns;
#   pushing it into numeric columns would be a type error as soon as one of
#   them contains an NA.
# - The extra argument is passed through an explicit function because the
#   `...` argument of `across()` is deprecated as of dplyr 1.1.0.
biopics <- biopics %>%
  mutate(across(where(is.character), function(x) replace_na(x, "Unknown")))
biopics
# Treat the person-of-color flag and the subject's sex as categorical
# variables rather than raw values.
biopics <- mutate(
  biopics,
  across(c(person_of_color, subject_sex), as.factor)
)
biopics
#' Convert box-office strings to numeric amounts
#'
#' Strips "$" signs and thousands separators, expands a "M" or "K" suffix
#' into scientific notation (e6 / e3) and converts the result to numeric.
#'
#' @param x Character vector such as "$56.7M", "$1,200", "500K" or "-".
#' @return Numeric vector of the same length as `x`. Missing inputs and the
#'   placeholders "-" and "" become `NA`. Any other value that cannot be
#'   parsed becomes `NA` with R's usual coercion warning.
convert_box_office <- function(x) {
  x <- trimws(as.character(x))
  x <- gsub("[$,]", "", x)              # drop dollar signs and commas
  x <- sub("M", "e6", x, fixed = TRUE)  # millions -> scientific notation
  x <- sub("K", "e3", x, fixed = TRUE)  # thousands -> scientific notation
  # Only a value that is exactly "-" (or empty) is a missing-value
  # placeholder. Replacing every hyphen would corrupt negative numbers and
  # negative exponents.
  x[x %in% c("-", "")] <- NA_character_
  as.numeric(x)
}
# Parse the box-office strings into numbers, overwriting the column in place.
biopics <- mutate(biopics, box_office = convert_box_office(box_office))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `box_office = convert_box_office(box_office)`.
## Caused by warning in `convert_box_office()`:
## ! NAs introduced by coercion
biopics
# Fill missing box-office figures with the sentinel value -1.
# NOTE(review): -1 is not a real takings figure, yet `box_office` is later
# plotted and used as the regression target; consider dropping these rows or
# using a proper imputation instead.
biopics <- biopics %>%
  mutate(box_office = replace_na(box_office, -1))
biopics
# Distribution of box-office takings, in bins one million wide.
biopics %>%
  ggplot(mapping = aes(x = box_office)) +
  geom_histogram(binwidth = 1e6, fill = "blue", color = "black") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Box Office Distribution", x = "Box Office", y = "Frequency")

# Number of rows per release year.
biopics %>%
  ggplot(mapping = aes(x = year_release)) +
  geom_bar(fill = "blue", color = "black") +
  labs(title = "Number of Biopics per Year", x = "Release Year", y = "Count")

# Number of rows per country (horizontal bars).
biopics %>%
  ggplot(mapping = aes(x = country)) +
  geom_bar(fill = "lightblue") +
  coord_flip() +
  labs(title = "Number of Biopics per Country", x = "Country", y = "Count")

# Number of rows per type of subject (horizontal bars).
biopics %>%
  ggplot(mapping = aes(x = type_of_subject)) +
  geom_bar(fill = "red") +
  coord_flip() +
  labs(title = "Number of Biopics by Type of Subject", x = "Type of Subject", y = "Count")

# Number of rows per subject race (horizontal bars).
biopics %>%
  ggplot(mapping = aes(x = subject_race)) +
  geom_bar(fill = "green") +
  coord_flip() +
  labs(title = "Number of Biopics by Subject Race", x = "Subject Race", y = "Count")

# Number of rows per subject sex.
biopics %>%
  ggplot(mapping = aes(x = subject_sex)) +
  geom_bar(fill = "orange") +
  labs(title = "Number of Biopics by Subject Sex", x = "Subject Sex", y = "Count")
# Names of the ten directors with the most rows in the dataset.
top_directors <- head(
  pull(count(biopics, director, sort = TRUE), director),
  10
)

# Bar chart restricted to those ten directors (horizontal bars).
biopics %>%
  filter(director %in% top_directors) %>%
  ggplot(mapping = aes(x = director)) +
  geom_bar(fill = "purple") +
  coord_flip() +
  labs(title = "Number of Biopics by Top 10 Directors", x = "Director", y = "Count")

# Box-office spread for each subject sex (horizontal box plot).
biopics %>%
  ggplot(mapping = aes(x = subject_sex, y = box_office)) +
  geom_boxplot(fill = "brown") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  labs(title = "Box Office by Subject Sex", x = "Subject Sex", y = "Box Office")
# Load required library
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Re-encode every character column as UTF-8; all other columns pass through
# unchanged. Assigning into `biopics[]` keeps the data-frame structure.
biopics[] <- lapply(
  biopics,
  function(column) if (is.character(column)) enc2utf8(column) else column
)
# Load required library
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Feature preparation for the regression models: standardise column names,
# centre/scale the data, one-hot encode it and split it 80/20.
# Clean the column names
biopics <- clean_names(biopics)
# Define preprocessing steps
# ("nzv" asks caret to drop near-zero-variance predictors)
preProcess_steps <- c("center", "scale", "nzv")
# Create preprocessing object
# NOTE(review): the transformation is estimated on the full dataset, before
# the train/test split below, so test rows influence the centring/scaling.
# NOTE(review): `box_office` is part of `biopics` here, so the target itself
# is presumably centred and scaled too -- confirm this is intended.
preProcess_obj <- preProcess(biopics, method = preProcess_steps)
# Apply preprocessing to the whole dataset
biopics_preprocessed <- predict(preProcess_obj, biopics)
# Create a dummyVars object
# `~ .` encodes every remaining column; fullRank = TRUE drops one level per
# factor.
# NOTE(review): the SVM warning recorded further down lists dummy columns
# named `title...` and `site...`, i.e. identifier-like columns are being
# one-hot encoded; consider excluding them before this step.
dummy_obj <- dummyVars(~ ., data = biopics_preprocessed, fullRank = TRUE)
# Create dummy variables using the dummyVars object
# (data.frame() also makes the generated column names syntactic)
biopics_dummy <- data.frame(predict(dummy_obj, newdata = biopics_preprocessed))
# Fixed seed so the 80/20 partition on the target is reproducible
set.seed(42)
splitIndex <- createDataPartition(biopics_dummy$box_office, p = 0.8, list = FALSE)
train_df <- biopics_dummy[splitIndex, ]
test_df <- biopics_dummy[-splitIndex, ]
# Baseline: ordinary least squares of box_office on all encoded predictors
linear_model <- lm(box_office ~ ., data = train_df)
# Predict the test set
predictions <- predict(linear_model, newdata = test_df)
## Warning in predict.lm(linear_model, newdata = test_df): prediction from a
## rank-deficient fit may be misleading
# Calculate performance metrics
# RMSE is in the units of `box_office` as stored in test_df
# (presumably the centred/scaled values -- TODO confirm).
RMSE <- sqrt(mean((test_df$box_office - predictions)^2))
# R_squared here is the squared correlation between observed and predicted
# values, not 1 - SSE/SST.
R_squared <- cor(test_df$box_office, predictions)^2
RMSE
## [1] 1.268149
R_squared
## [1] 0.06431058
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Train the random forest model
# (500 trees; importance = TRUE also stores variable-importance measures)
# NOTE(review): no seed is set immediately before this call; the fit relies
# on the RNG state left by the earlier set.seed(42) and partitioning.
random_forest_model <- randomForest(box_office ~ ., data = train_df, ntree = 500, importance = TRUE)
# Predict the test set
random_forest_predictions <- predict(random_forest_model, newdata = test_df)
# Calculate performance metrics
# (same definitions as for the linear model: RMSE and squared correlation)
random_forest_RMSE <- sqrt(mean((test_df$box_office - random_forest_predictions)^2))
random_forest_R_squared <- cor(test_df$box_office, random_forest_predictions)^2
random_forest_RMSE
## [1] 1.180852
random_forest_R_squared
## [1] 0.2423069
library(e1071)
# Train the SVM model
# Radial-kernel support vector regression with fixed (untuned) cost and gamma.
# The warning recorded below names many `title...` / `site...` dummy columns.
svm_model <- svm(box_office ~ ., data = train_df, kernel = "radial", cost = 10, gamma = 0.1)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'title12.Years.a.Slave' and 'title127.Hours' and 'title8.Seconds'
## and 'titleA.Beautiful.Mind' and 'titleAbe.Lincoln.in.Illinois' and
## 'titleAmerican.Sniper' and 'titleAnastasia' and 'titleAnnie.Oakley'
## and 'titleAwakenings' and 'titleBathory..Countess.of.Blood' and
## 'titleBeau.Brummell' and 'titleBehind.the.Lines' and 'titleBest' and
## 'titleBoys.Don.t.Cry' and 'titleBreach' and 'titleCaptain.Kidd' and
## 'titleCarbine.Williams' and 'titleCarrington' and 'titleCass' and
## 'titleCharlie.Wilson.s.War' and 'titleConviction' and 'titleCopying.Beethoven'
## and 'titleDahmer' and 'titleDangerous.Minds' and 'titleDeath.in.Granada'
## and 'titleDesert.Dancer' and 'titleDesert.Flower' and 'titleDiana'
## and 'titleDreamchild' and 'titleEvel.Knievel' and 'titleFrances' and
## 'titleFrankie...Alice' and 'titleFreedom.Writers' and 'titleGentleman.Jim'
## and 'titleGoltzius.and.the.Pelican.Company' and 'titleGreased.Lightning'
## and 'titleGreat.Balls.of.Fire.' and 'titleHollywoodland' and 'titleHoudini'
## and 'titleI.Am.a.Sex.Addict' and 'titleI.Love.You.Phillip.Morris' and
## 'titleIf.You.Could.See.What.I.Hear' and 'titleInterrupted.Melody' and
## 'titleInvincible' and 'titleJefferson.in.Paris' and 'titleJesse.James'
## and 'titleJim.Thorpe....All.American' and 'titleJinnah' and
## 'titleJo.Jo.Dancer..Your.Life.Is.Calling' and 'titleJoan.of.Arc'
## and 'titleKnute.Rockne.All.American' and 'titleLa.Bamba' and
## 'titleLady.Sings.the.Blues' and 'titleLee.Daniels..The.Butler' and
## 'titleLove.Me.or.Leave.Me' and 'titleLuther' and 'titleMary.of.Scotland'
## and 'titleMesmer' and 'titleMillion.Dollar.Arm' and 'titleMiracle'
## and 'titleMockingbird.Don.t.Sing' and 'titleMonster' and
## 'titleMrs..Parker.and.the.Vicious.Circle' and 'titleMurder..Inc.' and
## 'titleMusic.Within' and 'titlePapillon' and 'titlePrefontaine' and
## 'titlePublic.Enemies' and 'titleQuills' and 'titleRasputin..The.Mad.Monk'
## and 'titleRay' and 'titleReach.for.the.Sky' and 'titleRestoration'
## and 'titleRicky.6' and 'titleRosewater' and 'titleSalome.s.Last.Dance'
## and 'titleSearching.for.Bobby.Fischer' and 'titleSelena' and
## 'titleSelf.Medicated' and 'titleSon.of.God' and 'titleSweet.Dreams' and
## 'titleTed.Bundy' and 'titleThe.Babe' and 'titleThe.Basketball.Diaries'
## and 'titleThe.Broken.Tower' and 'titleThe.Cross.and.the.Switchblade'
## and 'titleThe.Eddy.Duchin.Story' and 'titleThe.Edge.of.Love' and
## 'titleThe.Express' and 'titleThe.Gallant.Hours' and 'titleThe.General' and
## 'titleThe.Girl.in.the.Red.Velvet.Swing' and 'titleThe.Glenn.Miller.Story'
## and 'titleThe.Great.Ziegfeld' and 'titleThe.Hunter' and 'titleThe.Insider'
## and 'titleThe.Lady' and 'titleThe.Last.Station' and 'titleThe.Longshots' and
## 'titleThe.Perils.of.Pauline' and 'titleThe.Plainsman' and 'titleThe.Queen'
## and 'titleThe.Seven.Little.Foys' and 'titleThe.Social.Network'
## and 'titleThe.Song.of.Bernadette' and 'titleThe.Soul.Keeper' and
## 'titleThe.Story.of.Louis.Pasteur' and 'titleThe.Wings.of.Eagles' and
## 'titleThe.Winning.Team' and 'titleThe.Young.Victoria' and 'titleThis.Is.Elvis'
## and 'titleTotal.Eclipse' and 'titleVeronica.Guerin' and 'titleViva.Villa.'
## and 'titleWhen.Did.You.Last.See.Your.Father.' and 'titleWilson' and
## 'titleYoung.Winston' and 'sitehttp...www.imdb.com.title.tt0025948.'
## and 'sitehttp...www.imdb.com.title.tt0026073.'
## and 'sitehttp...www.imdb.com.title.tt0027698.'
## and 'sitehttp...www.imdb.com.title.tt0027948.'
## and 'sitehttp...www.imdb.com.title.tt0028108.'
## and 'sitehttp...www.imdb.com.title.tt0028313.'
## and 'sitehttp...www.imdb.com.title.tt0031507.'
## and 'sitehttp...www.imdb.com.title.tt0032181.'
## and 'sitehttp...www.imdb.com.title.tt0032676.'
## and 'sitehttp...www.imdb.com.title.tt0034778.'
## and 'sitehttp...www.imdb.com.title.tt0036377.'
## and 'sitehttp...www.imdb.com.title.tt0037465.'
## and 'sitehttp...www.imdb.com.title.tt0037576.'
## and 'sitehttp...www.imdb.com.title.tt0039698.'
## and 'sitehttp...www.imdb.com.title.tt0040491.'
## and 'sitehttp...www.imdb.com.title.tt0043687.'
## and 'sitehttp...www.imdb.com.title.tt0044480.'
## and 'sitehttp...www.imdb.com.title.tt0045332.'
## and 'sitehttp...www.imdb.com.title.tt0045886.'
## and 'sitehttp...www.imdb.com.title.tt0046759.'
## and 'sitehttp...www.imdb.com.title.tt0047030.'
## and 'sitehttp...www.imdb.com.title.tt0048119.'
## and 'sitehttp...www.imdb.com.title.tt0048210.'
## and 'sitehttp...www.imdb.com.title.tt0048317.'
## and 'sitehttp...www.imdb.com.title.tt0048604.'
## and 'sitehttp...www.imdb.com.title.tt0048947.'
## and 'sitehttp...www.imdb.com.title.tt0049170.'
## and 'sitehttp...www.imdb.com.title.tt0049665.'
## and 'sitehttp...www.imdb.com.title.tt0051198.'
## and 'sitehttp...www.imdb.com.title.tt0053849.'
## and 'sitehttp...www.imdb.com.title.tt0054102.'
## and 'sitehttp...www.imdb.com.title.tt0059635.'
## and 'sitehttp...www.imdb.com.title.tt0067069.'
## and 'sitehttp...www.imdb.com.title.tt0068428.'
## and 'sitehttp...www.imdb.com.title.tt0068828.'
## and 'sitehttp...www.imdb.com.title.tt0069528.'
## and 'sitehttp...www.imdb.com.title.tt0069976.'
## and 'sitehttp...www.imdb.com.title.tt0070511.'
## and 'sitehttp...www.imdb.com.title.tt0076106.'
## and 'sitehttp...www.imdb.com.title.tt0080907.'
## and 'sitehttp...www.imdb.com.title.tt0083193.'
## and 'sitehttp...www.imdb.com.title.tt0083967.'
## and 'sitehttp...www.imdb.com.title.tt0084117.'
## and 'sitehttp...www.imdb.com.title.tt0089052.'
## and 'sitehttp...www.imdb.com.title.tt0090110.'
## and 'sitehttp...www.imdb.com.title.tt0091295.'
## and 'sitehttp...www.imdb.com.title.tt0093378.'
## and 'sitehttp...www.imdb.com.title.tt0096029.'
## and 'sitehttp...www.imdb.com.title.tt0097457.'
## and 'sitehttp...www.imdb.com.title.tt0099077.'
## and 'sitehttp...www.imdb.com.title.tt0103747.'
## and 'sitehttp...www.imdb.com.title.tt0108065.'
## and 'sitehttp...www.imdb.com.title.tt0109021.'
## and 'sitehttp...www.imdb.com.title.tt0110496.'
## and 'sitehttp...www.imdb.com.title.tt0110588.'
## and 'sitehttp...www.imdb.com.title.tt0112461.'
## and 'sitehttp...www.imdb.com.title.tt0112637.'
## and 'sitehttp...www.imdb.com.title.tt0112792.'
## and 'sitehttp...www.imdb.com.title.tt0113463.'
## and 'sitehttp...www.imdb.com.title.tt0114272.'
## and 'sitehttp...www.imdb.com.title.tt0114702.'
## and 'sitehttp...www.imdb.com.title.tt0117106.'
## and 'sitehttp...www.imdb.com.title.tt0119937.'
## and 'sitehttp...www.imdb.com.title.tt0120001.'
## and 'sitehttp...www.imdb.com.title.tt0120094.'
## and 'sitehttp...www.imdb.com.title.tt0120706.'
## and 'sitehttp...www.imdb.com.title.tt0140352.'
## and 'sitehttp...www.imdb.com.title.tt0156020.'
## and 'sitehttp...www.imdb.com.title.tt0171804.'
## and 'sitehttp...www.imdb.com.title.tt0180073.'
## and 'sitehttp...www.imdb.com.title.tt0181311.'
## and 'sitehttp...www.imdb.com.title.tt0183306.'
## and 'sitehttp...www.imdb.com.title.tt0268978.'
## and 'sitehttp...www.imdb.com.title.tt0273822.'
## and 'sitehttp...www.imdb.com.title.tt0284929.'
## and 'sitehttp...www.imdb.com.title.tt0285728.'
## and 'sitehttp...www.imdb.com.title.tt0309820.'
## and 'sitehttp...www.imdb.com.title.tt0312549.'
## and 'sitehttp...www.imdb.com.title.tt0340855.'
## and 'sitehttp...www.imdb.com.title.tt0341569.'
## and 'sitehttp...www.imdb.com.title.tt0349825.'
## and 'sitehttp...www.imdb.com.title.tt0349995.'
## and 'sitehttp...www.imdb.com.title.tt0350258.'
## and 'sitehttp...www.imdb.com.title.tt0401997.'
## and 'sitehttp...www.imdb.com.title.tt0422720.'
## and 'sitehttp...www.imdb.com.title.tt0422783.'
## and 'sitehttp...www.imdb.com.title.tt0424908.'
## and 'sitehttp...www.imdb.com.title.tt0427969.'
## and 'sitehttp...www.imdb.com.title.tt0428649.'
## and 'sitehttp...www.imdb.com.title.tt0436697.'
## and 'sitehttp...www.imdb.com.title.tt0445990.'
## and 'sitehttp...www.imdb.com.title.tt0463998.'
## and 'sitehttp...www.imdb.com.title.tt0469640.'
## and 'sitehttp...www.imdb.com.title.tt0469903.'
## and 'sitehttp...www.imdb.com.title.tt0472062.'
## and 'sitehttp...www.imdb.com.title.tt0819714.'
## and 'sitehttp...www.imdb.com.title.tt0824758.'
## and 'sitehttp...www.imdb.com.title.tt0829098.'
## and 'sitehttp...www.imdb.com.title.tt0954981.'
## and 'sitehttp...www.imdb.com.title.tt0962736.'
## and 'sitehttp...www.imdb.com.title.tt1045772.'
## and 'sitehttp...www.imdb.com.title.tt1054580.'
## and 'sitehttp...www.imdb.com.title.tt1091751.'
## and 'sitehttp...www.imdb.com.title.tt1152836.'
## and 'sitehttp...www.imdb.com.title.tt1221208.'
## and 'sitehttp...www.imdb.com.title.tt1244754.' and
## 'sitehttp...www.imdb.com.title.tt1285016.' and '
# Predict the test set
svm_predictions <- predict(svm_model, newdata = test_df)
# Calculate performance metrics
# (same definitions as for the other models: RMSE and squared correlation)
svm_RMSE <- sqrt(mean((test_df$box_office - svm_predictions)^2))
svm_R_squared <- cor(test_df$box_office, svm_predictions)^2
svm_RMSE
## [1] 1.091505
svm_R_squared
## [1] 0.2507735
# Gather the test-set metrics of the three regressors into one table,
# one row per model, adding the metric columns one at a time.
performance_metrics <- data.frame(
  Model = c("Linear Regression", "Random Forest", "SVM")
)
performance_metrics$RMSE <- c(RMSE, random_forest_RMSE, svm_RMSE)
performance_metrics$R_squared <- c(R_squared, random_forest_R_squared, svm_R_squared)
# Show the comparison table
print(performance_metrics)
## Model RMSE R_squared
## 1 Linear Regression 1.268149 0.06431058
## 2 Random Forest 1.180852 0.24230686
## 3 SVM 1.091505 0.25077347
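Based on the metrics above, the SVM performs best of the three models: it has the lowest RMSE (1.09) and the highest R-squared (0.25) on the test set. Even so, an R-squared of about 0.25 is low, so the best model still explains only around a quarter of the variation in box office.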
# Load required libraries
library(ggplot2)
library(cluster)
# K-means clustering of the numeric features, with an elbow plot to choose k
# and a 2-D PCA projection to visualise the resulting clusters.

# Select numeric columns for clustering
numeric_cols <- c("year_release", "box_office", "number_of_subjects")
data_numeric <- biopics[, numeric_cols]
# Scale the numeric features so each contributes equally to the distances
scaled_data <- scale(data_numeric)

# k-means starts from random centres; fix the seed so the elbow curve and the
# cluster labels are reproducible between runs.
set.seed(42)

# Determine the optimal number of clusters using the elbow method:
# total within-cluster sum of squares for k = 1..15, computed without
# growing a vector inside a loop.
k_candidates <- seq_len(15)
wss <- vapply(
  k_candidates,
  function(n_clusters) {
    kmeans(scaled_data, centers = n_clusters, nstart = 25)$tot.withinss
  },
  numeric(1)
)
# Plot the elbow method results
plot(k_candidates, wss, type = "b", xlab = "Number of clusters", ylab = "Within groups sum of squares", main = "Elbow Method")

# Choose the optimal number of clusters (k) based on the elbow method
k <- 4 # Replace with your choice based on the elbow method plot
# Perform k-means clustering with the optimal number of clusters
kmeans_model <- kmeans(scaled_data, centers = k, nstart = 25)

# Visualize the clustering results (using PCA for dimensionality reduction)
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- data.frame(pca$x[, 1:2])
pca_data$cluster <- as.factor(kmeans_model$cluster)
# Plot the PCA results with cluster assignments
ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point() +
  theme_minimal() +
  labs(title = "K-means Clustering (PCA Visualization)", x = "PC1", y = "PC2")
# Load required libraries
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Logistic regression predicting `person_of_color` from three numeric
# features, evaluated on a held-out 20% test set.

# Select relevant columns for the classification problem
selected_cols <- c("year_release", "box_office", "number_of_subjects", "person_of_color")
data_selected <- biopics[, selected_cols]
# Split the data into training (80%) and testing (20%) sets
set.seed(123)
train_indices <- createDataPartition(data_selected$person_of_color, p = 0.8, list = FALSE)
train_data <- data_selected[train_indices, ]
test_data <- data_selected[-train_indices, ]
# Train a logistic regression model
logit_model <- glm(person_of_color ~ ., data = train_data, family = "binomial")
# Make predictions on the testing set (probabilities, then a 0.5 cut-off)
predictions_prob <- predict(logit_model, newdata = test_data, type = "response")
predictions <- ifelse(predictions_prob > 0.5, 1, 0)
# Calculate evaluation metrics
# Give the predicted labels the same levels as the actual labels so the
# confusion matrix is always square. Otherwise a class that is never
# predicted is missing from the table and diag() no longer lines up with
# the correctly classified cells.
class_levels <- levels(test_data$person_of_color)
confusion_matrix <- table(
  Predicted = factor(predictions, levels = class_levels),
  Actual = test_data$person_of_color
)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
confusion_matrix
## Actual
## Predicted 0 1
## 0 132 20
## 1 0 0
# Load required libraries for precision, recall, and F1 score
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
# MLmetrics expects the true labels first and the predictions second; name
# the arguments so they cannot be swapped (swapping exchanges precision and
# recall).
# NOTE(review): `positive` is left at its default, so the scores describe the
# first class ("0" here); pass `positive = "1"` to score the other class once
# the model actually predicts it.
precision <- Precision(y_true = test_data$person_of_color, y_pred = predictions)
recall <- Recall(y_true = test_data$person_of_color, y_pred = predictions)
f1_score <- F1_Score(y_true = test_data$person_of_color, y_pred = predictions)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8684211
cat("Precision:", precision, "\n")
## Precision: 0.8684211
cat("Recall:", recall, "\n")
## Recall: 1
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.9295775
# ROC curve of the predicted probabilities against the true test labels
roc_obj <- roc(
  response = test_data$person_of_color,
  predictor = predictions_prob
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve")
# Load required libraries
library(dplyr)
library(tidyr)
# Build the feature table for the content-based recommender.
# Select relevant columns for the recommendation system
# (this overwrites `selected_cols` / `data_selected` from the classification
# section above)
selected_cols <- c("title", "type_of_subject", "subject_race", "subject_sex")
data_selected <- biopics[, selected_cols]
# Convert categorical variables to dummy variables
# The same three columns are used both as names_from and values_from;
# values_fn = length turns each cell into a count and values_fill = 0 fills
# combinations that do not occur. `title` is then moved into the row names.
# NOTE(review): presumably this yields one row per distinct title, which
# would not match the row order of `biopics` if titles repeat -- confirm.
data_dummies <- data_selected %>%
mutate(across(type_of_subject:subject_sex, as.factor)) %>%
pivot_wider(names_from = type_of_subject:subject_sex, values_from = type_of_subject:subject_sex,
values_fill = 0, values_fn = length) %>%
column_to_rownames("title")
#' Cosine similarity of two numeric vectors
#'
#' @param x,y Numeric vectors of equal length.
#' @return The dot product of `x` and `y` divided by the product of their
#'   Euclidean norms.
cosine_similarity <- function(x, y) {
  dot_product <- sum(x * y)
  norm_x <- sqrt(sum(x^2))
  norm_y <- sqrt(sum(y^2))
  dot_product / (norm_x * norm_y)
}
#install.packages("proxy")
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
# Pairwise cosine distances between the rows of the dummy table (via the
# proxy package), turned into similarities as 1 - distance.
cosine_distance <- proxy::dist(x = data_dummies, method = "cosine")
similarity_matrix <- 1 - cosine_distance
#' Recommend the movies most similar to a given title
#'
#' @param movie_data Data frame with a `title` column. It is only used to
#'   label the rows when `similarity_matrix` carries no row labels; in that
#'   case it must have one row per row of the matrix, in the same order.
#' @param similarity_matrix Square similarity matrix, or a `dist`-like object
#'   that `as.matrix()` can expand to one (higher value = more similar).
#' @param movie_title Title to find recommendations for.
#' @param top_n Maximum number of titles to return.
#' @return Character vector of up to `top_n` titles, most similar first.
#'   Stops with an error if `movie_title` is not found.
recommend_movies <- function(movie_data, similarity_matrix, movie_title, top_n = 10) {
  # A `dist` object only stores the lower triangle as a flat vector, so
  # indexing it with a row number does not return that movie's similarities.
  # Expand it to a full matrix first.
  sim <- as.matrix(similarity_matrix)

  # Prefer the matrix's own labels: they are guaranteed to line up with its
  # rows, whereas `movie_data` may have a different number or order of rows.
  titles <- rownames(sim)
  if (is.null(titles)) {
    titles <- as.character(movie_data$title)
    if (length(titles) != nrow(sim)) {
      stop("`movie_data` does not match the rows of `similarity_matrix`.")
    }
  }

  # Find the index of the given movie_title
  movie_index <- match(movie_title, titles)
  if (is.na(movie_index)) {
    stop("Movie not found in the dataset.")
  }

  # Similarity of the requested movie to every movie; exclude the movie itself
  similarity_scores <- sim[movie_index, ]
  similarity_scores[movie_index] <- NA

  # Rank the remaining movies, dropping missing scores
  ranked <- order(similarity_scores, decreasing = TRUE, na.last = NA)
  keep <- seq_len(min(top_n, length(ranked)))

  # Return the titles of the top_n most similar movies
  titles[ranked[keep]]
}
# Test the recommend_movies function
# NOTE(review): `similarity_matrix` was computed from `data_dummies`, while
# the titles are looked up in `biopics`; confirm that both have the same rows
# in the same order, otherwise the returned titles will not correspond to the
# similarity scores.
recommended_movies <- recommend_movies(movie_data = biopics, similarity_matrix = similarity_matrix, movie_title = "12 Years a Slave", top_n = 5)
print(recommended_movies)
## [1] "10 Rillington Place" NA NA
## [4] NA NA NA