library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.3 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dbplyr::ident() masks dplyr::ident()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dbplyr::sql() masks dplyr::sql()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(odbc)
# Connect to the local MySQL server (set up through MySQL Workbench) that holds
# the movies table. odbc is already attached by library(odbc) above, so it is
# not re-installed on every run; install it once, interactively, with
# install.packages("odbc") if it is missing.
#
# The password is read from the MOVIES_DB_PWD environment variable (for
# example, set in ~/.Renviron) so that no credential is stored in the script.
movies_db_pwd <- Sys.getenv("MOVIES_DB_PWD")
if (!nzchar(movies_db_pwd)) {
  stop(
    "Set the MOVIES_DB_PWD environment variable to the MySQL password.",
    call. = FALSE
  )
}
con <- dbConnect(
  odbc::odbc(),
  .connection_string = "Driver={MySQL ODBC 8.0 Unicode Driver};",
  Server = "localhost",
  Database = "data",
  UID = "root",
  PWD = movies_db_pwd,
  Port = 3306
)
# Alternatively, install.packages(c("DBI", "RMySQL")) and
# con <- dbConnect(RMySQL::MySQL(), host = "localhost", user = "root", dbname = "data")

# Pull the entire movies table. The connection is closed as soon as the query
# finishes (even if it fails), because nothing below needs the database again.
moviesdata <- tryCatch(
  dbGetQuery(con, "SELECT * FROM data.movies"),
  finally = dbDisconnect(con)
)
moviesframe <- as.data.frame(moviesdata) # Convert to dataframe
moviesframe
## Movie_ID Movie_Name Rating_1 Rating_2 Rating_3
## 1 1 The Barbie Movie 4 NA 5
## 2 2 Oppenheimer 4 2 4
## 3 3 Spider-Man: Across The Spider-Verse 5 5 4
## 4 4 The Super Mario Bros. Movie 3 4 5
## 5 5 Bullet Train 5 3 3
## 6 6 Indiana Jones and The Dial of Destiny NA 3 3
## Rating_4 Rating_5
## 1 3 5
## 2 5 NA
## 3 3 4
## 4 NA 5
## 5 2 4
## 6 4 2
I’d like to know the average score of each movie. However, not every participant has seen every movie, so we’ll ignore the missing (NA) ratings when computing the averages.
# Average each movie's rating across the surveyed participants, skipping the
# ratings that are missing (NA).
# The rating columns are selected by their "Rating_" name prefix instead of by
# position (3:7), so the averages stay correct if the table gains or reorders
# columns. drop = FALSE keeps a data frame even if only one column matches.
rating_cols <- startsWith(names(moviesframe), "Rating_")
avg_scores <- rowMeans(moviesframe[, rating_cols, drop = FALSE], na.rm = TRUE)
# Average rating of the 6 movies, in table order (Movie_ID 1 to 6):
# 4.25 3.75 4.20 4.25 3.40 3.00
I also considered a second approach: a separate table called ‘participants’ for the people who provided movie ratings, storing Participant_ID, Participant_Name, and a list of the movies each person had seen and rated. I thought about joining movies.Rating_X to participants.Participant_ID, but the tables were not fully normalized, so the join would not have worked cleanly.