Library

library(DBI)
library(RMySQL)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Connecting to MySQL

# set up connection to MySQL Database
connect <- dbConnect(RMySQL::MySQL(),
                     host = "cuny607sql.mysql.database.azure.com",
                     user = "jayden.jiang36",
                     password = "Data607work!",
                     dbname = "jayden.jiang36")
# Query the data table
rating_df <- dbGetQuery(connect, "SELECT * FROM movieratings;")

print(rating_df)
##     id person                    movie rating
## 1  271  Aaron                  Ne Zha2      5
## 2  272  Aaron          The Dark Knight      3
## 3  273  Aaron Avatar: The Way of Water      3
## 4  274  Aaron                Inception      5
## 5  275  Aaron        The Invisible Man      4
## 6  276  Aaron                     Soul      2
## 7  277 Ashley                  Ne Zha2      5
## 8  278 Ashley          The Dark Knight      2
## 9  279 Ashley Avatar: The Way of Water      5
## 10 280 Ashley                Inception      4
## 11 281 Ashley        The Invisible Man      3
## 12 282 Ashley                     Soul      4
## 13 283  Seren                  Ne Zha2      5
## 14 284  Seren          The Dark Knight      2
## 15 285  Seren Avatar: The Way of Water      2
## 16 286  Seren                Inception      5
## 17 287  Seren        The Invisible Man      4
## 18 288  Seren                     Soul      5
## 19 289 Jasper                  Ne Zha2      5
## 20 290 Jasper          The Dark Knight      5
## 21 291 Jasper Avatar: The Way of Water      2
## 22 292 Jasper                Inception      3
## 23 293 Jasper        The Invisible Man      3
## 24 294 Jasper                     Soul      4
## 25 295 Jovian                  Ne Zha2      5
## 26 296 Jovian          The Dark Knight      3
## 27 297 Jovian Avatar: The Way of Water      5
## 28 298 Jovian                Inception      2
## 29 299 Jovian        The Invisible Man      4
## 30 300 Jovian                     Soul      4

Implement An Approach for Missing Data

# checking for missing values
summary(rating_df)
##        id           person             movie               rating     
##  Min.   :271.0   Length:30          Length:30          Min.   :2.000  
##  1st Qu.:278.2   Class :character   Class :character   1st Qu.:3.000  
##  Median :285.5   Mode  :character   Mode  :character   Median :4.000  
##  Mean   :285.5                                         Mean   :3.767  
##  3rd Qu.:292.8                                         3rd Qu.:5.000  
##  Max.   :300.0                                         Max.   :5.000
# Calculate mean rating for each movie
movies_means <- rating_df %>%
  group_by(movie) %>%
  summarize(mean_rating = mean(rating, na.rm = TRUE))

# Join the means back to the original dataframe
rating_imputed <- rating_df %>%
  left_join(movies_means, by = "movie") %>%
  mutate(rating = ifelse(is.na(rating), mean_rating, rating)) %>%
  select(-mean_rating)

print(rating_imputed)
##     id person                    movie rating
## 1  271  Aaron                  Ne Zha2      5
## 2  272  Aaron          The Dark Knight      3
## 3  273  Aaron Avatar: The Way of Water      3
## 4  274  Aaron                Inception      5
## 5  275  Aaron        The Invisible Man      4
## 6  276  Aaron                     Soul      2
## 7  277 Ashley                  Ne Zha2      5
## 8  278 Ashley          The Dark Knight      2
## 9  279 Ashley Avatar: The Way of Water      5
## 10 280 Ashley                Inception      4
## 11 281 Ashley        The Invisible Man      3
## 12 282 Ashley                     Soul      4
## 13 283  Seren                  Ne Zha2      5
## 14 284  Seren          The Dark Knight      2
## 15 285  Seren Avatar: The Way of Water      2
## 16 286  Seren                Inception      5
## 17 287  Seren        The Invisible Man      4
## 18 288  Seren                     Soul      5
## 19 289 Jasper                  Ne Zha2      5
## 20 290 Jasper          The Dark Knight      5
## 21 291 Jasper Avatar: The Way of Water      2
## 22 292 Jasper                Inception      3
## 23 293 Jasper        The Invisible Man      3
## 24 294 Jasper                     Soul      4
## 25 295 Jovian                  Ne Zha2      5
## 26 296 Jovian          The Dark Knight      3
## 27 297 Jovian Avatar: The Way of Water      5
## 28 298 Jovian                Inception      2
## 29 299 Jovian        The Invisible Man      4
## 30 300 Jovian                     Soul      4

Conclusion

I used mean imputation for possible missing data. It is a simple strategy where missing ratings are replaced by the average rating computed from available data.