Implementing Global Baseline Estimate Recommender System in R

Load Libraries

#library(tidyverse)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DBI)
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 4.3.2

Connect to MySQL Database

password has been masked

lamadb <- dbConnect(RMySQL::MySQL(), 
                 user = "chhiring.lama65", 
                 password = Sys.getenv("PASSWORD_DB"),
                 dbname = "chhiring.lama65",
                 host = "cunydata607sql.mysql.database.azure.com")

List the tables

dbListTables(lamadb)
## [1] "chess_tournament"  "clean_company_ids" "clean_job_dets"   
## [4] "company"           "gbe_updated"       "job"              
## [7] "movie_rating"

Show Column Names using dbListFields function (assigned it to column_names)

table_name <- "movie_rating"  
column_names <- dbListFields(lamadb, table_name)

Construct the SQL query using sprintf function and then retrieve the data from MySQL database using dbGetQuery function

query <- sprintf("SELECT * FROM %s", table_name)
table_data <- dbGetQuery(lamadb, query)
table_data
##   ratingID      CriticName Gender Age The_Creator The_Holdovers The_Marvels
## 1        1      Nancy Jain Female  30           3          <NA>           4
## 2        2 David Goldsmith   Male  45           4             1           2
## 3        3     Putar Junee   Male  28           2             3        <NA>
## 4        4    Jasprit Devi Female  35           1          <NA>        <NA>
## 5        5        John Doe   Male  40           5             4           3
## 6        6      Jane Smith Female  38           2             3           4
## 7        7   Alice Johnson Female  25        <NA>             4           2
##   Wonka Society_of_the_Snow
## 1     5                <NA>
## 2  <NA>                   5
## 3     3                   4
## 4     2                   3
## 5  <NA>                   2
## 6     5                <NA>
## 7     3                   5

Import the table from MySQL back to R

table_name <- "movie_rating"

query <- paste("SELECT * FROM", table_name)

table_data <- dbGetQuery(lamadb, query)
table_data
##   ratingID      CriticName Gender Age The_Creator The_Holdovers The_Marvels
## 1        1      Nancy Jain Female  30           3          <NA>           4
## 2        2 David Goldsmith   Male  45           4             1           2
## 3        3     Putar Junee   Male  28           2             3        <NA>
## 4        4    Jasprit Devi Female  35           1          <NA>        <NA>
## 5        5        John Doe   Male  40           5             4           3
## 6        6      Jane Smith Female  38           2             3           4
## 7        7   Alice Johnson Female  25        <NA>             4           2
##   Wonka Society_of_the_Snow
## 1     5                <NA>
## 2  <NA>                   5
## 3     3                   4
## 4     2                   3
## 5  <NA>                   2
## 6     5                <NA>
## 7     3                   5

Implement Global Baseline Estimate

Calculate global mean rating

str(table_data)
## 'data.frame':    7 obs. of  9 variables:
##  $ ratingID           : int  1 2 3 4 5 6 7
##  $ CriticName         : chr  "Nancy Jain" "David Goldsmith" "Putar Junee" "Jasprit Devi" ...
##  $ Gender             : chr  "Female" "Male" "Male" "Female" ...
##  $ Age                : int  30 45 28 35 40 38 25
##  $ The_Creator        : chr  "3" "4" "2" "1" ...
##  $ The_Holdovers      : chr  NA "1" "3" NA ...
##  $ The_Marvels        : chr  "4" "2" NA NA ...
##  $ Wonka              : chr  "5" NA "3" "2" ...
##  $ Society_of_the_Snow: chr  NA "5" "4" "3" ...

Convert list columns to numeric vectors

table_data[, 5:9] <- lapply(table_data[, 5:9], function(x) unlist(x))

Convert rating columns to numeric

table_data[, 5:9] <- lapply(table_data[, 5:9], as.numeric)

Calculate global mean rating

global_mean <- mean(unlist(table_data[, 5:9]), na.rm = TRUE)

Calculate mean rating per movie

movie_means <- colMeans(table_data[, 5:9], na.rm = TRUE)

Find the baseline rating

# Subtract movie means from global mean
baseline_ratings <- global_mean - movie_means

Fill NA values in each column with the corresponding baseline rating

table_data$The_Creator[is.na(table_data$The_Creator)] <- baseline_ratings["The_Creator"]
table_data$The_Holdovers[is.na(table_data$The_Holdovers)] <- baseline_ratings["The_Holdovers"]
table_data$The_Marvels[is.na(table_data$The_Marvels)] <- baseline_ratings["The_Marvels"]
table_data$Wonka[is.na(table_data$Wonka)] <- baseline_ratings["Wonka"]
table_data$Society_of_the_Snow[is.na(table_data$Society_of_the_Snow)] <- baseline_ratings["Society_of_the_Snow"]
baseline_ratings
##         The_Creator       The_Holdovers         The_Marvels               Wonka 
##           0.3974359           0.2307692           0.2307692          -0.3692308 
## Society_of_the_Snow 
##          -0.5692308
# Display updated table_data
print(table_data)

Write the dataframe to MySQL and create a new table as ‘gbe_updated’

dbWriteTable(lamadb, "gbe_updated", table_data, overwrite = TRUE)
## [1] TRUE

Best security practice, disconnect from the database

dbDisconnect(lamadb)
## [1] TRUE