library(DBI)
library(RMySQL)
library(RSQLite)
db_user <- 'data607user'
db_password <- 'pass'
db_name <- 'Movies'
db_host <- '35.226.130.145' # for access to SQL Database hosted on Google Cloud
db_port <- 3306
drv <- dbDriver("MySQL")
# creating tbdb connection object with RMysql package
connect <- dbConnect(drv, user = db_user, password = db_password, dbname = db_name, host = db_host, port = db_port)
Movies <- dbGetQuery(connect, "SELECT * FROM Movies" )
Reviewers <- dbGetQuery(connect, "SELECT * FROM Reviewers" )
ReviewsList <- dbGetQuery(connect, "SELECT * FROM Scores" )
#List of movies chosen for this assignment
Movies
## m_ID name length_mins rate
## 1 1 Avengers: Infinity Wars 160 PG-13
## 2 2 Black Planther 135 PG-13
## 3 3 Bird Box 124 R
## 4 4 Incredibles 2 125 PG
## 5 5 Aquaman 142 PG-13
## 6 6 A Quiet Place 91 PG-13
## 7 7 Deadpool 169 R
#People selected in this study
Reviewers
## r_ID name age gender
## 1 1 Jan 38 f
## 2 2 Zo 28 m
## 3 3 Malachi 7 m
## 4 4 Dowren 56 f
## 5 5 Adrian 55 m
## 6 6 Javern 27 f
## 7 7 Camz 27 f
## 8 8 Wendy 28 f
## 9 9 Reece 34 f
## 10 10 Soriya 12 f
#ID numbers of movies and reviews along with scores
head (ReviewsList)
## m_ID r_ID score
## 1 4 3 4
## 2 1 6 5
## 3 3 6 5
## 4 5 2 3
## 5 6 7 4
## 6 1 5 NA
Use of SQL statements via R to change the name of some colomns. R statements were used for the others
#Replaces each movie id and reviewer id with the corresponding names
RefinedRL <- dbGetQuery(connect, "SELECT Movies.name AS Title, Movies.length_mins, Movies.rate, Reviewers.name AS Participants, Reviewers.age, Reviewers.gender, Scores.score AS score FROM Movies INNER JOIN Scores ON Movies.m_ID = Scores.m_ID
INNER JOIN Reviewers on Scores.r_ID = Reviewers.r_ID" )
colnames(RefinedRL)[colnames(RefinedRL) == 'length_mins'] <- 'Length'
colnames(RefinedRL)[colnames(RefinedRL) == 'rate'] <- 'Rate'
colnames(RefinedRL)[colnames(RefinedRL) == 'age'] <- 'Age'
colnames(RefinedRL)[colnames(RefinedRL) == 'gender'] <- 'Gender'
colnames(RefinedRL)[colnames(RefinedRL) == 'score'] <- 'Score'
RefinedRL$Score[is.na(RefinedRL$Score)] = 0 #change missing values to 0
head(RefinedRL, 20)
## Title Length Rate Participants Age Gender Score
## 1 Avengers: Infinity Wars 160 PG-13 Javern 27 f 5
## 2 Avengers: Infinity Wars 160 PG-13 Adrian 55 m 0
## 3 Avengers: Infinity Wars 160 PG-13 Zo 28 m 4
## 4 Avengers: Infinity Wars 160 PG-13 Soriya 12 f 5
## 5 Avengers: Infinity Wars 160 PG-13 Reece 34 f 4
## 6 Avengers: Infinity Wars 160 PG-13 Dowren 56 f 0
## 7 Avengers: Infinity Wars 160 PG-13 Malachi 7 m 0
## 8 Avengers: Infinity Wars 160 PG-13 Wendy 28 f 5
## 9 Avengers: Infinity Wars 160 PG-13 Jan 38 f 2
## 10 Avengers: Infinity Wars 160 PG-13 Camz 27 f 5
## 11 Black Planther 135 PG-13 Zo 28 m 4
## 12 Black Planther 135 PG-13 Javern 27 f 5
## 13 Black Planther 135 PG-13 Wendy 28 f 4
## 14 Black Planther 135 PG-13 Reece 34 f 5
## 15 Black Planther 135 PG-13 Soriya 12 f 3
## 16 Black Planther 135 PG-13 Adrian 55 m 1
## 17 Black Planther 135 PG-13 Dowren 56 f 1
## 18 Black Planther 135 PG-13 Jan 38 f 3
## 19 Black Planther 135 PG-13 Malachi 7 m 4
## 20 Black Planther 135 PG-13 Camz 27 f 5
summary(RefinedRL)
## Title Length Rate Participants
## Length:70 Min. : 91.0 Length:70 Length:70
## Class :character 1st Qu.:124.0 Class :character Class :character
## Mode :character Median :135.0 Mode :character Mode :character
## Mean :135.1
## 3rd Qu.:160.0
## Max. :169.0
## Age Gender Score
## Min. : 7.00 Length:70 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:0.000
## Median :28.00 Mode :character Median :3.000
## Mean :31.49 Mean :2.543
## 3rd Qu.:38.00 3rd Qu.:4.000
## Max. :56.00 Max. :5.000
library(dplyr)
avgsc <- aggregate(Score ~ Participants + Age, RefinedRL, mean)
avgsc
## Participants Age Score
## 1 Malachi 7 1.8333333
## 2 Soriya 12 2.2857143
## 3 Camz 27 4.2500000
## 4 Javern 27 4.7142857
## 5 Wendy 28 4.2857143
## 6 Zo 28 3.0000000
## 7 Reece 34 2.8571429
## 8 Jan 38 1.5714286
## 9 Adrian 55 0.1428571
## 10 Dowren 56 0.1428571
hist(RefinedRL$Score, main = "Ratings Collected", xlab = "Rating Scores", ylab = "Count")
This histogram is bimodal with peaks at rating for 0 and 4 - 5. Persons rated a movie 0 because they have not seen it or not of age.
Malachi <- sum(RefinedRL$Participants == 'Malachi' & RefinedRL$Score > 0)
Zo <- sum(RefinedRL$Participants == 'Zo' & RefinedRL$Score > 0)
Dowren <- sum(RefinedRL$Participants == 'Dowren' & RefinedRL$Score > 0)
Adrian <- sum(RefinedRL$Participants == 'Adrian' & RefinedRL$Score > 0)
Wendy <- sum(RefinedRL$Participants == 'Wendy' & RefinedRL$Score > 0)
Camz <- sum(RefinedRL$Participants == 'Camz' & RefinedRL$Score > 0)
Javern <- sum(RefinedRL$Participants == 'Javern' & RefinedRL$Score > 0)
Jan <- sum(RefinedRL$Participants == 'Jan' & RefinedRL$Score > 0)
Soriya <- sum(RefinedRL$Participants == 'Soriya' & RefinedRL$Score > 0)
Reece <- sum(RefinedRL$Participants == 'Reece' & RefinedRL$Score > 0)
names <- c("M", "Z", "D", "A", "W", "C", "J", "Jan", "S", "R")
Persons <- c(Malachi, Zo, Dowren, Adrian, Wendy, Camz, Javern, Jan, Soriya, Reece)
barplot(Persons, names.arg = names, main = "Number of Movies watched from the list by each person", xlab = "Participants", ylab = "Number of Movies")
Dowren and Adrian watched the least amount of movies. They are the oldest as well. Current movies are not of their interest. Malachi has the second lowest as he is the youngest and is limited to movies rated PG or maybe PG-13. Javern, Camz and Wendy watched all movies.