library(DBI)
library(RMySQL)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# set up connection to MySQL Database
connect <- dbConnect(RMySQL::MySQL(),
host = "cuny607sql.mysql.database.azure.com",
user = "jayden.jiang36",
password = "Data607work!",
dbname = "jayden.jiang36")
# Query the data table
rating_df <- dbGetQuery(connect, "SELECT * FROM movieratings;")
print(rating_df)
## id person movie rating
## 1 271 Aaron Ne Zha2 5
## 2 272 Aaron The Dark Knight 3
## 3 273 Aaron Avatar: The Way of Water 3
## 4 274 Aaron Inception 5
## 5 275 Aaron The Invisible Man 4
## 6 276 Aaron Soul 2
## 7 277 Ashley Ne Zha2 5
## 8 278 Ashley The Dark Knight 2
## 9 279 Ashley Avatar: The Way of Water 5
## 10 280 Ashley Inception 4
## 11 281 Ashley The Invisible Man 3
## 12 282 Ashley Soul 4
## 13 283 Seren Ne Zha2 5
## 14 284 Seren The Dark Knight 2
## 15 285 Seren Avatar: The Way of Water 2
## 16 286 Seren Inception 5
## 17 287 Seren The Invisible Man 4
## 18 288 Seren Soul 5
## 19 289 Jasper Ne Zha2 5
## 20 290 Jasper The Dark Knight 5
## 21 291 Jasper Avatar: The Way of Water 2
## 22 292 Jasper Inception 3
## 23 293 Jasper The Invisible Man 3
## 24 294 Jasper Soul 4
## 25 295 Jovian Ne Zha2 5
## 26 296 Jovian The Dark Knight 3
## 27 297 Jovian Avatar: The Way of Water 5
## 28 298 Jovian Inception 2
## 29 299 Jovian The Invisible Man 4
## 30 300 Jovian Soul 4
# checking for missing values
summary(rating_df)
## id person movie rating
## Min. :271.0 Length:30 Length:30 Min. :2.000
## 1st Qu.:278.2 Class :character Class :character 1st Qu.:3.000
## Median :285.5 Mode :character Mode :character Median :4.000
## Mean :285.5 Mean :3.767
## 3rd Qu.:292.8 3rd Qu.:5.000
## Max. :300.0 Max. :5.000
# Calculate mean rating for each movie
movies_means <- rating_df %>%
group_by(movie) %>%
summarize(mean_rating = mean(rating, na.rm = TRUE))
# Join the means back to the original dataframe
rating_imputed <- rating_df %>%
left_join(movies_means, by = "movie") %>%
mutate(rating = ifelse(is.na(rating), mean_rating, rating)) %>%
select(-mean_rating)
print(rating_imputed)
## id person movie rating
## 1 271 Aaron Ne Zha2 5
## 2 272 Aaron The Dark Knight 3
## 3 273 Aaron Avatar: The Way of Water 3
## 4 274 Aaron Inception 5
## 5 275 Aaron The Invisible Man 4
## 6 276 Aaron Soul 2
## 7 277 Ashley Ne Zha2 5
## 8 278 Ashley The Dark Knight 2
## 9 279 Ashley Avatar: The Way of Water 5
## 10 280 Ashley Inception 4
## 11 281 Ashley The Invisible Man 3
## 12 282 Ashley Soul 4
## 13 283 Seren Ne Zha2 5
## 14 284 Seren The Dark Knight 2
## 15 285 Seren Avatar: The Way of Water 2
## 16 286 Seren Inception 5
## 17 287 Seren The Invisible Man 4
## 18 288 Seren Soul 5
## 19 289 Jasper Ne Zha2 5
## 20 290 Jasper The Dark Knight 5
## 21 291 Jasper Avatar: The Way of Water 2
## 22 292 Jasper Inception 3
## 23 293 Jasper The Invisible Man 3
## 24 294 Jasper Soul 4
## 25 295 Jovian Ne Zha2 5
## 26 296 Jovian The Dark Knight 3
## 27 297 Jovian Avatar: The Way of Water 5
## 28 298 Jovian Inception 2
## 29 299 Jovian The Invisible Man 4
## 30 300 Jovian Soul 4
I used mean imputation for possible missing data. It is a simple strategy where missing ratings are replaced by the average rating computed from available data.