Summary of Assignment This assignment involves a Movie Rating Dataset. Seven Movies are rated by seven reviewers and the results are analyzed.
This Assignment requires the following:
1). MySql 2). R-Studio
The following R-packages are used: 1.RMySQL 2.dplyr 3.sqldf 4.ggplot2 5.downloader 6.htmlTable
Two csv files are required for input: 1). Movies.csv 2). Reviewer.csv
The path for the in put files is: c:
Steps: 1). run the Movies.sql file 2). run the R-Studio file: R_607_Week_2_Hmk_SQL_R_Daniel_Thonn.Rmd
Setting up and Preparing the Environment
knitr::opts_chunk$set(echo = TRUE)
#install.packages("knitr")
#install.packages("RMySQL")
#install.packages("dplyr")
#install.packages("sqldf")
#install.packages("ggplot2")
#install.packages("downloader")
#install.packages("htmlTable")
library(RMySQL)
## Loading required package: DBI
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
##
## Attaching package: 'RSQLite'
## The following object is masked from 'package:RMySQL':
##
## isIdCurrent
## sqldf will default to using MySQL
library(ggplot2)
library(downloader)
library(htmlTable)
Load the database tables into R dataframes
#note: put in placeholders for user and password
mydb = dbConnect(MySQL(), user='dthonn', password='1804', dbname='movies', host='localhost')
# return the movies query below and store the results a dataframe called movies1
movies1 <- dbGetQuery(mydb, "select movie,movietype,reviewer,score from movies")
#list(movies1)
reviewer1 <- dbGetQuery(mydb, "select reviewer,gender,age_bracket from reviewer")
#list(reviewer1)
gender1 <- dbGetQuery(mydb, "select m.movietype,r.gender,avg(m.score) from movies m inner join reviewer r on m.reviewer = r.reviewer group by m.movietype,r.gender order by m.movietype asc,r.gender asc")
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
list(gender1)
## [[1]]
## movietype gender avg(m.score)
## 1 Action Adventure F 2.7500
## 2 Action Adventure M 4.6667
## 3 Comedy F 4.5000
## 4 Comedy M 2.0000
## 5 Drama Action F 2.5000
## 6 Drama Action M 2.3333
## 7 Drama Biography F 3.7500
## 8 Drama Biography M 5.0000
## 9 Drama Sport F 2.7500
## 10 Drama Sport M 4.6667
## 11 Romance F 4.7500
## 12 Romance M 2.3333
## 13 Thriller Action F 3.2500
## 14 Thriller Action M 5.0000
age_bracket1 <- dbGetQuery(mydb, "select m.movietype,r.age_bracket,avg(m.score)
from movies m
inner join reviewer r
on m.reviewer = r.reviewer
group by m.movietype,r.age_bracket
order by m.movietype asc,r.gender asc")
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
list(age_bracket1)
## [[1]]
## movietype age_bracket avg(m.score)
## 1 Action Adventure Young Adult 3.6667
## 2 Action Adventure Middle Age 3.5000
## 3 Comedy Young Adult 3.3333
## 4 Comedy Middle Age 3.5000
## 5 Drama Action Young Adult 2.6667
## 6 Drama Action Middle Age 2.2500
## 7 Drama Biography Young Adult 4.0000
## 8 Drama Biography Middle Age 4.5000
## 9 Drama Sport Young Adult 3.3333
## 10 Drama Sport Middle Age 3.7500
## 11 Romance Young Adult 3.6667
## 12 Romance Middle Age 3.7500
## 13 Thriller Action Young Adult 3.6667
## 14 Thriller Action Middle Age 4.2500
movies3 <- aggregate(x=movies1$score, data=movies1, by = list(movies1$movie), FUN=mean)
list(movies3)
## [[1]]
## Group.1 x
## 1 Bad Moms 3.428571
## 2 Ben Hur 2.428571
## 3 Hands of Stone 3.571429
## 4 Jason Bourne 4.000000
## 5 Sully 4.285714
## 6 The Light Between Oceans 3.714286
## 7 War Dogs 3.571429
Plot and Analyze the data
## Movies
# Scatterplot Chart of movies with all scores
ggplot(movies1, aes(x=movie, y=score)) + geom_point() + theme(axis.text.x=element_text(angle=90, hjust=1))
# Scatterplot Chart of movies with mean of scores
ggplot(movies3, aes(x=Group.1, y=x)) + geom_point(aes(color = "average score")) + theme(axis.text.x=element_text(angle=90, hjust=1)) + labs(color="Legend")
#Box Plot of Movies and Score
ggplot(movies1, aes(x = movie, y = score)) +
geom_boxplot() + theme(axis.text.x=element_text(angle=90, hjust=1))
##Reviewer
list(reviewer1)
## [[1]]
## reviewer gender age_bracket
## 1 Dan M Middle Age
## 2 Maria F Middle Age
## 3 Kyla F Young Adult
## 4 Doug M Middle Age
## 5 Sara F Middle Age
## 6 Ryan M Young Adult
## 7 Ericka F Young Adult
# Scatterplot Chart of reviewers
ggplot(reviewer1, aes(x=reviewer, y=age_bracket)) + geom_point(aes(color = "age_bracket")) + geom_point(aes(x = reviewer, y = gender,color = "gender")) + theme(axis.text.x=element_text(angle=90, hjust=1)) + labs(color="Legend")
#By Gender
#list(gender1)
#names(gender1)
names(gender1)[3] <- "avg_score"
gender1_M <- gender1[which(gender1$gender == "M"), ]
gender1_F <- gender1[which(gender1$gender == "F"), ]
list (gender1_M)
## [[1]]
## movietype gender avg_score
## 2 Action Adventure M 4.6667
## 4 Comedy M 2.0000
## 6 Drama Action M 2.3333
## 8 Drama Biography M 5.0000
## 10 Drama Sport M 4.6667
## 12 Romance M 2.3333
## 14 Thriller Action M 5.0000
ggplot(gender1_M, aes(x=movietype, y=avg_score)) + geom_point(aes(color = "Male")) + geom_point(aes(x =movietype, y = gender1_F$avg_score,color = "Female")) + theme(axis.text.x=element_text(angle=90, hjust=1)) + labs(color="Legend")
#By Age Bracket
#list(age_bracket1)
#names(age_bracket1)
names(age_bracket1)[3] <- "avg_score"
age_bracket1_YA <- age_bracket1[which(age_bracket1$age_bracket == "Young Adult"), ]
age_bracket1_MA <- age_bracket1[which(age_bracket1$age_bracket == "Middle Age"), ]
list (age_bracket1_YA)
## [[1]]
## movietype age_bracket avg_score
## 1 Action Adventure Young Adult 3.6667
## 3 Comedy Young Adult 3.3333
## 5 Drama Action Young Adult 2.6667
## 7 Drama Biography Young Adult 4.0000
## 9 Drama Sport Young Adult 3.3333
## 11 Romance Young Adult 3.6667
## 13 Thriller Action Young Adult 3.6667
list (age_bracket1_MA)
## [[1]]
## movietype age_bracket avg_score
## 2 Action Adventure Middle Age 3.50
## 4 Comedy Middle Age 3.50
## 6 Drama Action Middle Age 2.25
## 8 Drama Biography Middle Age 4.50
## 10 Drama Sport Middle Age 3.75
## 12 Romance Middle Age 3.75
## 14 Thriller Action Middle Age 4.25
ggplot(age_bracket1_YA, aes(x=movietype, y=avg_score)) + geom_point(aes(color = "Young Adult")) + geom_point(aes(x =movietype, y = age_bracket1_MA$avg_score,color = "Middle Age")) + theme(axis.text.x=element_text(angle=90, hjust=1)) + labs(color="Legend")
Conclustions:
The three most popular movies were Sully, Jason Bourne, and the Light between Oceans. There was a mix of male and female, and young adults and middle age adults as reviewers. The highest average score in terms of Movie Types were Drama Biography, and Thriller Action. There were differing results for males and females. Males preferred Action Adventure, Drama Biography, and Thriller Action. Females preferred Comedy, and Romance movies. Middle age reviewers generally scored most movies types higher, except for Action Adventure which Young Adults scored higher.