Movies Chosen:
People Asked to Rate Movies”
library(DBI)
library(RMySQL)
host <- "cuny607sql.mysql.database.azure.com"
port <- 3306
username <- "silma.khan"
password <- "7e468508e3c78185"
schema <- "silma.khan"
con <- dbConnect(MySQL(),
host = host,
port = port,
user = username,
password = password,
dbname = schema)
if (!dbIsValid(con)) {
stop("Connection failed!")
} else {
cat("Successfully connected to the database.\n")
}
## Successfully connected to the database.
query <- "SELECT * FROM movie_ratings_data607;"
movie_ratings_df <- dbGetQuery(con, query)
head(movie_ratings_df)
## id rater movie rating
## 1 1 Eddie Interstellar 5
## 2 2 Eddie Nightmare Before Christmas 4
## 3 3 Eddie Matilda 3
## 4 4 Eddie The Whale 5
## 5 5 Eddie Us 5
## 6 6 Eddie Scooby Doo 4
movie_ratings_df
## id rater movie rating
## 1 1 Eddie Interstellar 5
## 2 2 Eddie Nightmare Before Christmas 4
## 3 3 Eddie Matilda 3
## 4 4 Eddie The Whale 5
## 5 5 Eddie Us 5
## 6 6 Eddie Scooby Doo 4
## 7 7 Elma Interstellar 5
## 8 8 Elma Nightmare Before Christmas 5
## 9 9 Elma Matilda 4
## 10 10 Elma The Whale 5
## 11 11 Elma Us 5
## 12 12 Elma Scooby Doo 5
## 13 13 Saima Interstellar 5
## 14 14 Saima Nightmare Before Christmas 4
## 15 15 Saima Matilda 2
## 16 16 Saima The Whale 5
## 17 17 Saima Us 5
## 18 18 Saima Scooby Doo 5
## 19 19 Dawa Interstellar 3
## 20 20 Dawa Nightmare Before Christmas 4
## 21 21 Dawa Matilda 4
## 22 22 Dawa The Whale 2
## 23 23 Dawa Us 3
## 24 24 Dawa Scooby Doo 4
## 25 25 Addie Interstellar 4
## 26 26 Addie Nightmare Before Christmas 3
## 27 27 Addie Matilda 4
## 28 28 Addie The Whale 4
## 29 29 Addie Us 5
## 30 30 Addie Scooby Doo 2
Pros: This method is quick and simple to implement without needing extra work
Cons: It is easier to lose valuable data and data that can be a bigger asset to some insights
Pros: Better use case to retain valuable information
Cons: This is still just an estimate for the value and does not entirely resemble the actual rating as it can be completely different
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
average_imputation <- movie_ratings_df %>%
group_by(movie) %>%
summarize(average_ratings = mean(rating, na.rm = TRUE))
movie_ratings_df_imputation <- movie_ratings_df %>%
left_join(average_imputation, by = "movie") %>%
mutate(rating_imputation = ifelse(is.na(rating), average_ratings, rating)) %>%
select(-average_ratings)
head(movie_ratings_df_imputation)
## id rater movie rating rating_imputation
## 1 1 Eddie Interstellar 5 5
## 2 2 Eddie Nightmare Before Christmas 4 4
## 3 3 Eddie Matilda 3 3
## 4 4 Eddie The Whale 5 5
## 5 5 Eddie Us 5 5
## 6 6 Eddie Scooby Doo 4 4