Data 607 Assignment 3A

Introduction: Global Baseline Estimate Recommendations

As we saw in the previous assignment working with the Movie Survey Database, the only table with missing data is the ratings table. We are missing ratings as some of the survey takers did not watch every movie in the survey. Instead of deleting the rows that don’t contain ratings, this time we will fill in those rows after we create a global baseline estimate system.

We will use the following libraries

The DBI library
The RPostgres library
The dplyr library
The tidyverse library

con <- dbConnect(RPostgres::Postgres(), 
                 host="localhost", 
                 port="5432",
                 dbname="moviesurvey",
                 user="postgres", 
                 password=Sys.getenv("DB_PASSWORD"))

con

## <PqConnection> moviesurvey@localhost:5432

Table Names and Querying our Database

dbListTables(con)

## [1] "movies"        "ratings"       "survey_takers"

dbGetQuery(con, "SELECT * FROM movies")

##   movie_id        title content_rating genre_1         genre_2
## 1        3 Materialists              R Romance          Comedy
## 2        4    Eddington              R Western        Thriller
## 3        5      Weapons              R  Horror        Thriller
## 4        6     Superman          PG-13  Action Science Fiction
## 5        1    Nosferatu              R  Horror         Mystery
## 6        2      Sinners              R  Horror         Mystery

Database Tables to R Data Frames

Let’s load our database tables into data frames.

df_movies  <- dbGetQuery(con, "SELECT * FROM movies")
df_takers  <- dbGetQuery(con, "SELECT * FROM survey_takers")
df_ratings   <- dbGetQuery(con, "SELECT * FROM ratings")

Glimpse the Data Frames

Let’s make sure our data frames were properly loaded.

glimpse(df_movies)

## Rows: 6
## Columns: 5
## $ movie_id       <int> 3, 4, 5, 6, 1, 2
## $ title          <chr> "Materialists", "Eddington", "Weapons", "Superman", "No…
## $ content_rating <chr> "R", "R", "R", "PG-13", "R", "R"
## $ genre_1        <chr> "Romance", "Western", "Horror", "Action", "Horror", "Ho…
## $ genre_2        <chr> "Comedy", "Thriller", "Thriller", "Science Fiction", "M…

glimpse(df_takers)

## Rows: 5
## Columns: 2
## $ survey_takers_id <int> 1, 2, 3, 4, 5
## $ name             <chr> "Hillary Ramos", "Marwa Khan", "Sandra Beck", "Rachel…

glimpse(df_ratings)

## Rows: 30
## Columns: 5
## $ rating_id          <int> 100, 104, 118, 129, 101, 102, 103, 105, 106, 107, 1…
## $ rating             <chr> NA, NA, NA, NA, "5", "4", "3", "3", "3", "1", "5", …
## $ film_id            <int> 1, 5, 1, 6, 2, 3, 4, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, …
## $ takers_id          <int> 1, 5, 4, 5, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, …
## $ rating_description <chr> NA, NA, NA, NA, "Amazing", "Good", "Average", "Aver…

Looking for Null Values

As in our previous time working with this database and from using glimpse() we know there null values in the tables, let’s remind ourselves where they are.

sapply(df_ratings, function(x) sum(is.na(x)))

##          rating_id             rating            film_id          takers_id 
##                  0                  4                  0                  0 
## rating_description 
##                  4

We can see that there are null values in the rating and rating_description columns within the ratings dataframe.

First let’s convert our rating column values from character to integer

df_ratings$rating <- as.integer(df_ratings$rating)

glimpse(df_ratings)

## Rows: 30
## Columns: 5
## $ rating_id          <int> 100, 104, 118, 129, 101, 102, 103, 105, 106, 107, 1…
## $ rating             <int> NA, NA, NA, NA, 5, 4, 3, 3, 3, 1, 5, 2, 4, 2, 5, 1,…
## $ film_id            <int> 1, 5, 1, 6, 2, 3, 4, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, …
## $ takers_id          <int> 1, 5, 4, 5, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, …
## $ rating_description <chr> NA, NA, NA, NA, "Amazing", "Good", "Average", "Aver…

Finding the Average Movie Rating

Now we can find the average movie rating.

movie_avg <- df_ratings %>%
                group_by(film_id) %>%
                summarise(movieavg = mean(rating, na.rm = TRUE))

movie_avg

## # A tibble: 6 × 2
##   film_id movieavg
##     <int>    <dbl>
## 1       1     3   
## 2       2     2.4 
## 3       3     4.2 
## 4       4     3.2 
## 5       5     2.75
## 6       6     3.25

glimpse(movie_avg)

## Rows: 6
## Columns: 2
## $ film_id  <int> 1, 2, 3, 4, 5, 6
## $ movieavg <dbl> 3.00, 2.40, 4.20, 3.20, 2.75, 3.25

Finding the Average Survey Taker Rating

Now let’s find the average rating per survey taker.

taker_avg <- df_ratings %>%
                group_by(takers_id) %>%
                summarise(takeravg = mean(rating, na.rm = TRUE))

taker_avg

## # A tibble: 5 × 2
##   takers_id takeravg
##       <int>    <dbl>
## 1         1     3.8 
## 2         2     3.67
## 3         3     3.17
## 4         4     3   
## 5         5     1.75

glimpse(taker_avg)

## Rows: 5
## Columns: 2
## $ takers_id <int> 1, 2, 3, 4, 5
## $ takeravg  <dbl> 3.800000, 3.666667, 3.166667, 3.000000, 1.750000

Global Mean

First let’s find our global mean rating

global_mean <- mean(df_ratings$rating, na.rm = TRUE)

Great, now we can subtract the global mean rating from each movie average rating and add the results as a column to our original ratings data frame. This will give us our movie bias column.

movie_avg$mavg_minus_mean <- movie_avg$movieavg - global_mean

Let’s subtract the global mean rating from each survey taker average rating, this will give us our survey taker bias column.

taker_avg$tavg_minus_mean <- taker_avg$takeravg - global_mean

Add Bias Columns to Original Data Frame

Let’s add our movie bias column to our original ratings data frame.

df_ratings <- df_ratings %>%
  left_join(movie_avg, by = "film_id")

Let’s add our survey taker bias column to our original ratings data frame.

df_ratings <- df_ratings %>%
  left_join(taker_avg, by = "takers_id")

Now let’s find the global baseline estimates and add it as column to our data frame.

df_ratings <- df_ratings %>% 
  mutate(global_estimate_baseline =global_mean + 
           mavg_minus_mean + tavg_minus_mean)

Let’s round our global baseline estimates column.

df_ratings$global_estimate_baseline <- round(df_ratings$global_estimate_baseline,
                                             digits = 1)

Let’s also order our data frame by movie and then by survey taker id.

df_ratings <- df_ratings[order(df_ratings$film_id,df_ratings$takers_id),]

Subset our Data Frame

Let’s have a table with the original ratings table columns with only the addition of the global_baseline estimates, movie and survey taker bias columns.

First lets save our data frame so far.

df1 <- df_ratings

#Now we can change up our data frame. 

df_ratings <- df_ratings[, c("rating_id","rating", "film_id", "takers_id",
                             "rating_description", "mavg_minus_mean", 
                             "tavg_minus_mean", "global_estimate_baseline")]

Renaming our columns

df_ratings <- df_ratings %>% 
                     rename(user_bias = tavg_minus_mean, 
                     movie_bias = mavg_minus_mean, 
                     global_be = global_estimate_baseline)

Plot Global Baseline Estimates

A plot of films vs their global baseline estimates.

ggplot( data = df_ratings, aes(x = film_id , y = global_be, 
                               color = takers_id )) +
geom_point() +
  labs(title = "Film Global Estimates")

Replace Nulls with Global Estimaes

Let’s replace our null ratings with their global estimates and ensure the rating column has no null values.

df_ratings$rating[is.na(df_ratings$rating)] <- 
  df_ratings$global_be[is.na(df_ratings$rating)]

print(df_ratings$rating)

##  [1] 3.6 3.0 5.0 2.8 1.0 4.0 5.0 1.0 1.0 1.0 4.0 5.0 4.0 5.0 3.0 4.0 5.0 2.0 3.0
## [20] 2.0 4.0 2.0 2.0 3.0 1.3 3.0 2.0 5.0 3.0 1.8

Lets’s include the film names in our ratings data frame

df_ratings <- df_ratings %>%
  left_join(select(df_movies, movie_id, title), 
            by = c("film_id" = "movie_id" ))

Making a Plot of Ratings vs Films

Let’s try our plot again but with our updates ratings values vs film

ggplot( data = df_ratings, aes(x = title , y = rating, 
                               color = takers_id )) +
  geom_point() +
  labs(title = "Movie Survey Ratings",
       x = "Movies",
       y = "Ratings")

dbDisconnect(con)