Connecting to SQL database

library(DBI)
library(RMySQL)
library(RSQLite)


db_user <- 'data607user'
db_password <- 'pass'
db_name <- 'Movies'
db_host <- '35.226.130.145' # for access to SQL Database hosted on Google Cloud
db_port <- 3306

drv <- dbDriver("MySQL")

# creating tbdb connection object with RMysql package
connect <- dbConnect(drv, user = db_user, password = db_password, dbname = db_name, host = db_host, port = db_port)

Loading tables from SQL into R dataframes

Movies <- dbGetQuery(connect, "SELECT * FROM Movies" )
Reviewers <- dbGetQuery(connect, "SELECT * FROM Reviewers" )
ReviewsList <- dbGetQuery(connect, "SELECT * FROM Scores" ) 

Tables loaded with raw data collected

#List of movies chosen for this assignment
Movies
##   m_ID                    name length_mins  rate
## 1    1 Avengers: Infinity Wars         160 PG-13
## 2    2          Black Planther         135 PG-13
## 3    3                Bird Box         124     R
## 4    4           Incredibles 2         125    PG
## 5    5                 Aquaman         142 PG-13
## 6    6           A Quiet Place          91 PG-13
## 7    7                Deadpool         169     R
#People selected in this study
Reviewers
##    r_ID    name age gender
## 1     1     Jan  38      f
## 2     2      Zo  28      m
## 3     3 Malachi   7      m
## 4     4  Dowren  56      f
## 5     5  Adrian  55      m
## 6     6  Javern  27      f
## 7     7    Camz  27      f
## 8     8   Wendy  28      f
## 9     9   Reece  34      f
## 10   10  Soriya  12      f
#ID numbers of movies and reviews along with scores
head (ReviewsList)
##   m_ID r_ID score
## 1    4    3     4
## 2    1    6     5
## 3    3    6     5
## 4    5    2     3
## 5    6    7     4
## 6    1    5    NA

Putting all data into one table.

Use of SQL statements via R to change the name of some colomns. R statements were used for the others

#Replaces each movie id and reviewer id with the corresponding names
RefinedRL <- dbGetQuery(connect, "SELECT Movies.name AS Title, Movies.length_mins, Movies.rate, Reviewers.name AS Participants, Reviewers.age, Reviewers.gender,              Scores.score AS score FROM Movies INNER JOIN Scores ON Movies.m_ID = Scores.m_ID
INNER JOIN Reviewers on Scores.r_ID = Reviewers.r_ID"  )

colnames(RefinedRL)[colnames(RefinedRL) == 'length_mins'] <- 'Length'
colnames(RefinedRL)[colnames(RefinedRL) == 'rate'] <- 'Rate'
colnames(RefinedRL)[colnames(RefinedRL) == 'age'] <- 'Age'
colnames(RefinedRL)[colnames(RefinedRL) == 'gender'] <- 'Gender'
colnames(RefinedRL)[colnames(RefinedRL) == 'score'] <- 'Score'
RefinedRL$Score[is.na(RefinedRL$Score)] = 0 #change missing values to 0
head(RefinedRL, 20)
##                      Title Length  Rate Participants Age Gender Score
## 1  Avengers: Infinity Wars    160 PG-13       Javern  27      f     5
## 2  Avengers: Infinity Wars    160 PG-13       Adrian  55      m     0
## 3  Avengers: Infinity Wars    160 PG-13           Zo  28      m     4
## 4  Avengers: Infinity Wars    160 PG-13       Soriya  12      f     5
## 5  Avengers: Infinity Wars    160 PG-13        Reece  34      f     4
## 6  Avengers: Infinity Wars    160 PG-13       Dowren  56      f     0
## 7  Avengers: Infinity Wars    160 PG-13      Malachi   7      m     0
## 8  Avengers: Infinity Wars    160 PG-13        Wendy  28      f     5
## 9  Avengers: Infinity Wars    160 PG-13          Jan  38      f     2
## 10 Avengers: Infinity Wars    160 PG-13         Camz  27      f     5
## 11          Black Planther    135 PG-13           Zo  28      m     4
## 12          Black Planther    135 PG-13       Javern  27      f     5
## 13          Black Planther    135 PG-13        Wendy  28      f     4
## 14          Black Planther    135 PG-13        Reece  34      f     5
## 15          Black Planther    135 PG-13       Soriya  12      f     3
## 16          Black Planther    135 PG-13       Adrian  55      m     1
## 17          Black Planther    135 PG-13       Dowren  56      f     1
## 18          Black Planther    135 PG-13          Jan  38      f     3
## 19          Black Planther    135 PG-13      Malachi   7      m     4
## 20          Black Planther    135 PG-13         Camz  27      f     5

Summary of each column

summary(RefinedRL)
##     Title               Length          Rate           Participants      
##  Length:70          Min.   : 91.0   Length:70          Length:70         
##  Class :character   1st Qu.:124.0   Class :character   Class :character  
##  Mode  :character   Median :135.0   Mode  :character   Mode  :character  
##                     Mean   :135.1                                        
##                     3rd Qu.:160.0                                        
##                     Max.   :169.0                                        
##       Age           Gender              Score      
##  Min.   : 7.00   Length:70          Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:0.000  
##  Median :28.00   Mode  :character   Median :3.000  
##  Mean   :31.49                      Mean   :2.543  
##  3rd Qu.:38.00                      3rd Qu.:4.000  
##  Max.   :56.00                      Max.   :5.000

Ratings: Average Score per Person and Counts of each score (graph)

library(dplyr)

avgsc <- aggregate(Score ~ Participants + Age, RefinedRL, mean)

avgsc
##    Participants Age     Score
## 1       Malachi   7 1.8333333
## 2        Soriya  12 2.2857143
## 3          Camz  27 4.2500000
## 4        Javern  27 4.7142857
## 5         Wendy  28 4.2857143
## 6            Zo  28 3.0000000
## 7         Reece  34 2.8571429
## 8           Jan  38 1.5714286
## 9        Adrian  55 0.1428571
## 10       Dowren  56 0.1428571
hist(RefinedRL$Score, main = "Ratings Collected", xlab = "Rating Scores", ylab = "Count")

This histogram is bimodal with peaks at rating for 0 and 4 - 5. Persons rated a movie 0 because they have not seen it or not of age.

Malachi <- sum(RefinedRL$Participants == 'Malachi' & RefinedRL$Score > 0)
Zo <- sum(RefinedRL$Participants == 'Zo' & RefinedRL$Score > 0)
Dowren <- sum(RefinedRL$Participants == 'Dowren' & RefinedRL$Score > 0)
Adrian <- sum(RefinedRL$Participants == 'Adrian' & RefinedRL$Score > 0)
Wendy <- sum(RefinedRL$Participants == 'Wendy' & RefinedRL$Score > 0)
Camz <- sum(RefinedRL$Participants == 'Camz' & RefinedRL$Score > 0)
Javern <- sum(RefinedRL$Participants == 'Javern' & RefinedRL$Score > 0)
Jan <- sum(RefinedRL$Participants == 'Jan' & RefinedRL$Score > 0)
Soriya <- sum(RefinedRL$Participants == 'Soriya' & RefinedRL$Score > 0)
Reece <- sum(RefinedRL$Participants == 'Reece' & RefinedRL$Score > 0)

names <- c("M", "Z", "D", "A", "W", "C", "J", "Jan", "S", "R")
Persons <- c(Malachi, Zo, Dowren, Adrian, Wendy, Camz, Javern, Jan, Soriya, Reece)
barplot(Persons, names.arg = names, main = "Number of Movies watched from the list by each person", xlab = "Participants", ylab = "Number of Movies")

Dowren and Adrian watched the least amount of movies. They are the oldest as well. Current movies are not of their interest. Malachi has the second lowest as he is the youngest and is limited to movies rated PG or maybe PG-13. Javern, Camz and Wendy watched all movies.